diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..30954f4fb8537ec6a956d0a2447e78b3982b441a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+comfyui_controlnet_aux/examples/example_mesh_graphormer.png filter=lfs diff=lfs merge=lfs -text
+comfyui_controlnet_aux/examples/ExecuteAll.png filter=lfs diff=lfs merge=lfs -text
+comfyui_controlnet_aux/examples/ExecuteAll1.jpg filter=lfs diff=lfs merge=lfs -text
+comfyui_controlnet_aux/examples/ExecuteAll2.jpg filter=lfs diff=lfs merge=lfs -text
+comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/hand_landmarker.task filter=lfs diff=lfs merge=lfs -text
diff --git a/ComfyUI_InstantID/.github/FUNDING.yml b/ComfyUI_InstantID/.github/FUNDING.yml
new file mode 100644
index 0000000000000000000000000000000000000000..58b715a618cd9df97ee0d8e58e055b5aea9159cc
--- /dev/null
+++ b/ComfyUI_InstantID/.github/FUNDING.yml
@@ -0,0 +1 @@
+github: cubiq
diff --git a/ComfyUI_InstantID/.github/workflows/publish.yml b/ComfyUI_InstantID/.github/workflows/publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ed74f76326434cd00098ec30d1d74c4c0a6e0533
--- /dev/null
+++ b/ComfyUI_InstantID/.github/workflows/publish.yml
@@ -0,0 +1,22 @@
+name: Publish to Comfy registry
+on:
+ workflow_dispatch:
+ push:
+ branches:
+ - main
+ - master
+ paths:
+ - "pyproject.toml"
+
+jobs:
+ publish-node:
+ name: Publish Custom Node to registry
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out code
+ uses: actions/checkout@v4
+ - name: Publish Custom Node
+ uses: Comfy-Org/publish-node-action@main
+ with:
+        # Add your own personal access token to your GitHub repository secrets and reference it here.
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
diff --git a/ComfyUI_InstantID/.gitignore b/ComfyUI_InstantID/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..51e9d59e6732acf9330276920d77460367fd654a
--- /dev/null
+++ b/ComfyUI_InstantID/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/ComfyUI_InstantID/CrossAttentionPatch.py b/ComfyUI_InstantID/CrossAttentionPatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..44b36f634e4c873c5d711dd3356ffccb7e153140
--- /dev/null
+++ b/ComfyUI_InstantID/CrossAttentionPatch.py
@@ -0,0 +1,190 @@
+import torch
+import math
+import torch.nn.functional as F
+from comfy.ldm.modules.attention import optimized_attention
+from .utils import tensor_to_size
+
+class Attn2Replace:
+ def __init__(self, callback=None, **kwargs):
+ self.callback = [callback]
+ self.kwargs = [kwargs]
+
+ def add(self, callback, **kwargs):
+ self.callback.append(callback)
+ self.kwargs.append(kwargs)
+
+ for key, value in kwargs.items():
+ setattr(self, key, value)
+
+ def __call__(self, q, k, v, extra_options):
+ dtype = q.dtype
+ out = optimized_attention(q, k, v, extra_options["n_heads"])
+ sigma = extra_options["sigmas"].detach().cpu()[0].item() if 'sigmas' in extra_options else 999999999.9
+
+ for i, callback in enumerate(self.callback):
+ if sigma <= self.kwargs[i]["sigma_start"] and sigma >= self.kwargs[i]["sigma_end"]:
+ out = out + callback(out, q, k, v, extra_options, **self.kwargs[i])
+
+ return out.to(dtype=dtype)
+
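+# How the pieces fit together: _set_model_patch_replace() in InstantID.py stores an Attn2Replace
+# instance under model_options["transformer_options"]["patches_replace"]["attn2"][key], so ComfyUI
+# invokes it instead of the default attn2 attention for that block. Each registered callback
+# (instantid_attention below) only adds its contribution while the current sigma lies inside the
+# [sigma_end, sigma_start] window, which is how the nodes' start_at/end_at percentages map onto
+# the sampler schedule.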
+def instantid_attention(out, q, k, v, extra_options, module_key='', ipadapter=None, weight=1.0, cond=None, cond_alt=None, uncond=None, weight_type="linear", mask=None, sigma_start=0.0, sigma_end=1.0, unfold_batch=False, embeds_scaling='V only', **kwargs):
+ dtype = q.dtype
+ cond_or_uncond = extra_options["cond_or_uncond"]
+ block_type = extra_options["block"][0]
+ #block_id = extra_options["block"][1]
+ t_idx = extra_options["transformer_index"]
+ layers = 11 if '101_to_k_ip' in ipadapter.ip_layers.to_kvs else 16
+ k_key = module_key + "_to_k_ip"
+ v_key = module_key + "_to_v_ip"
+
+ # extra options for AnimateDiff
+ ad_params = extra_options['ad_params'] if "ad_params" in extra_options else None
+
+ b = q.shape[0]
+ seq_len = q.shape[1]
+ batch_prompt = b // len(cond_or_uncond)
+ _, _, oh, ow = extra_options["original_shape"]
+
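+    # scale the per-layer weight according to the selected schedule: t_idx is the index of the
+    # patched attn2 layer (transformer_index) and block_type tells whether we are in the UNet
+    # input, middle or output blocks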
+ if weight_type == 'ease in':
+ weight = weight * (0.05 + 0.95 * (1 - t_idx / layers))
+ elif weight_type == 'ease out':
+ weight = weight * (0.05 + 0.95 * (t_idx / layers))
+ elif weight_type == 'ease in-out':
+ weight = weight * (0.05 + 0.95 * (1 - abs(t_idx - (layers/2)) / (layers/2)))
+ elif weight_type == 'reverse in-out':
+ weight = weight * (0.05 + 0.95 * (abs(t_idx - (layers/2)) / (layers/2)))
+ elif weight_type == 'weak input' and block_type == 'input':
+ weight = weight * 0.2
+ elif weight_type == 'weak middle' and block_type == 'middle':
+ weight = weight * 0.2
+ elif weight_type == 'weak output' and block_type == 'output':
+ weight = weight * 0.2
+ elif weight_type == 'strong middle' and (block_type == 'input' or block_type == 'output'):
+ weight = weight * 0.2
+ elif isinstance(weight, dict):
+ if t_idx not in weight:
+ return 0
+
+ weight = weight[t_idx]
+
+ if cond_alt is not None and t_idx in cond_alt:
+ cond = cond_alt[t_idx]
+ del cond_alt
+
+ if unfold_batch:
+ # Check AnimateDiff context window
+ if ad_params is not None and ad_params["sub_idxs"] is not None:
+ if isinstance(weight, torch.Tensor):
+ weight = tensor_to_size(weight, ad_params["full_length"])
+ weight = torch.Tensor(weight[ad_params["sub_idxs"]])
+ if torch.all(weight == 0):
+ return 0
+ weight = weight.repeat(len(cond_or_uncond), 1, 1) # repeat for cond and uncond
+ elif weight == 0:
+ return 0
+
+ # if image length matches or exceeds full_length get sub_idx images
+ if cond.shape[0] >= ad_params["full_length"]:
+ cond = torch.Tensor(cond[ad_params["sub_idxs"]])
+ uncond = torch.Tensor(uncond[ad_params["sub_idxs"]])
+ # otherwise get sub_idxs images
+ else:
+ cond = tensor_to_size(cond, ad_params["full_length"])
+ uncond = tensor_to_size(uncond, ad_params["full_length"])
+ cond = cond[ad_params["sub_idxs"]]
+ uncond = uncond[ad_params["sub_idxs"]]
+ else:
+ if isinstance(weight, torch.Tensor):
+ weight = tensor_to_size(weight, batch_prompt)
+ if torch.all(weight == 0):
+ return 0
+ weight = weight.repeat(len(cond_or_uncond), 1, 1) # repeat for cond and uncond
+ elif weight == 0:
+ return 0
+
+ cond = tensor_to_size(cond, batch_prompt)
+ uncond = tensor_to_size(uncond, batch_prompt)
+
+ k_cond = ipadapter.ip_layers.to_kvs[k_key](cond)
+ k_uncond = ipadapter.ip_layers.to_kvs[k_key](uncond)
+ v_cond = ipadapter.ip_layers.to_kvs[v_key](cond)
+ v_uncond = ipadapter.ip_layers.to_kvs[v_key](uncond)
+ else:
+ # TODO: should we always convert the weights to a tensor?
+ if isinstance(weight, torch.Tensor):
+ weight = tensor_to_size(weight, batch_prompt)
+ if torch.all(weight == 0):
+ return 0
+ weight = weight.repeat(len(cond_or_uncond), 1, 1) # repeat for cond and uncond
+ elif weight == 0:
+ return 0
+
+ k_cond = ipadapter.ip_layers.to_kvs[k_key](cond).repeat(batch_prompt, 1, 1)
+ k_uncond = ipadapter.ip_layers.to_kvs[k_key](uncond).repeat(batch_prompt, 1, 1)
+ v_cond = ipadapter.ip_layers.to_kvs[v_key](cond).repeat(batch_prompt, 1, 1)
+ v_uncond = ipadapter.ip_layers.to_kvs[v_key](uncond).repeat(batch_prompt, 1, 1)
+
+ ip_k = torch.cat([(k_cond, k_uncond)[i] for i in cond_or_uncond], dim=0)
+ ip_v = torch.cat([(v_cond, v_uncond)[i] for i in cond_or_uncond], dim=0)
+
+ if embeds_scaling == 'K+mean(V) w/ C penalty':
+ scaling = float(ip_k.shape[2]) / 1280.0
+ weight = weight * scaling
+ ip_k = ip_k * weight
+ ip_v_mean = torch.mean(ip_v, dim=1, keepdim=True)
+ ip_v = (ip_v - ip_v_mean) + ip_v_mean * weight
+ out_ip = optimized_attention(q, ip_k, ip_v, extra_options["n_heads"])
+ del ip_v_mean
+ elif embeds_scaling == 'K+V w/ C penalty':
+ scaling = float(ip_k.shape[2]) / 1280.0
+ weight = weight * scaling
+ ip_k = ip_k * weight
+ ip_v = ip_v * weight
+ out_ip = optimized_attention(q, ip_k, ip_v, extra_options["n_heads"])
+ elif embeds_scaling == 'K+V':
+ ip_k = ip_k * weight
+ ip_v = ip_v * weight
+ out_ip = optimized_attention(q, ip_k, ip_v, extra_options["n_heads"])
+ else:
+ #ip_v = ip_v * weight
+ out_ip = optimized_attention(q, ip_k, ip_v, extra_options["n_heads"])
+ out_ip = out_ip * weight # I'm doing this to get the same results as before
+
+ if mask is not None:
+ mask_h = oh / math.sqrt(oh * ow / seq_len)
+ mask_h = int(mask_h) + int((seq_len % int(mask_h)) != 0)
+ mask_w = seq_len // mask_h
+
+ # check if using AnimateDiff and sliding context window
+ if (mask.shape[0] > 1 and ad_params is not None and ad_params["sub_idxs"] is not None):
+ # if mask length matches or exceeds full_length, get sub_idx masks
+ if mask.shape[0] >= ad_params["full_length"]:
+ mask = torch.Tensor(mask[ad_params["sub_idxs"]])
+ mask = F.interpolate(mask.unsqueeze(1), size=(mask_h, mask_w), mode="bilinear").squeeze(1)
+ else:
+ mask = F.interpolate(mask.unsqueeze(1), size=(mask_h, mask_w), mode="bilinear").squeeze(1)
+ mask = tensor_to_size(mask, ad_params["full_length"])
+ mask = mask[ad_params["sub_idxs"]]
+ else:
+ mask = F.interpolate(mask.unsqueeze(1), size=(mask_h, mask_w), mode="bilinear").squeeze(1)
+ mask = tensor_to_size(mask, batch_prompt)
+
+ mask = mask.repeat(len(cond_or_uncond), 1, 1)
+ mask = mask.view(mask.shape[0], -1, 1).repeat(1, 1, out.shape[2])
+
+ # covers cases where extreme aspect ratios can cause the mask to have a wrong size
+ mask_len = mask_h * mask_w
+ if mask_len < seq_len:
+ pad_len = seq_len - mask_len
+ pad1 = pad_len // 2
+ pad2 = pad_len - pad1
+ mask = F.pad(mask, (0, 0, pad1, pad2), value=0.0)
+ elif mask_len > seq_len:
+ crop_start = (mask_len - seq_len) // 2
+ mask = mask[:, crop_start:crop_start+seq_len, :]
+
+ out_ip = out_ip * mask
+
+ #out = out + out_ip
+
+ return out_ip.to(dtype=dtype)
diff --git a/ComfyUI_InstantID/InstantID.py b/ComfyUI_InstantID/InstantID.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e9bca73d59600c8af968076734affcfb57e0ad0
--- /dev/null
+++ b/ComfyUI_InstantID/InstantID.py
@@ -0,0 +1,611 @@
+import torch
+import os
+import comfy.utils
+import comfy.model_management
+import folder_paths
+import numpy as np
+import math
+import cv2
+import PIL.Image
+from .resampler import Resampler
+from .CrossAttentionPatch import Attn2Replace, instantid_attention
+from .utils import tensor_to_image
+
+from insightface.app import FaceAnalysis
+
+try:
+ import torchvision.transforms.v2 as T
+except ImportError:
+ import torchvision.transforms as T
+
+import torch.nn.functional as F
+
+MODELS_DIR = os.path.join(folder_paths.models_dir, "instantid")
+if "instantid" not in folder_paths.folder_names_and_paths:
+ current_paths = [MODELS_DIR]
+else:
+ current_paths, _ = folder_paths.folder_names_and_paths["instantid"]
+folder_paths.folder_names_and_paths["instantid"] = (current_paths, folder_paths.supported_pt_extensions)
+
+INSIGHTFACE_DIR = os.path.join(folder_paths.models_dir, "insightface")
+
+def draw_kps(image_pil, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,255,0), (255,0,255)]):
+ stickwidth = 4
+ limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
+ kps = np.array(kps)
+
+ h, w, _ = image_pil.shape
+ out_img = np.zeros([h, w, 3])
+
+ for i in range(len(limbSeq)):
+ index = limbSeq[i]
+ color = color_list[index[0]]
+
+ x = kps[index][:, 0]
+ y = kps[index][:, 1]
+ length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
+ angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
+ polygon = cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
+ out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
+ out_img = (out_img * 0.6).astype(np.uint8)
+
+ for idx_kp, kp in enumerate(kps):
+ color = color_list[idx_kp]
+ x, y = kp
+ out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)
+
+ out_img_pil = PIL.Image.fromarray(out_img.astype(np.uint8))
+ return out_img_pil
+
+class InstantID(torch.nn.Module):
+ def __init__(self, instantid_model, cross_attention_dim=1280, output_cross_attention_dim=1024, clip_embeddings_dim=512, clip_extra_context_tokens=16):
+ super().__init__()
+
+ self.clip_embeddings_dim = clip_embeddings_dim
+ self.cross_attention_dim = cross_attention_dim
+ self.output_cross_attention_dim = output_cross_attention_dim
+ self.clip_extra_context_tokens = clip_extra_context_tokens
+
+ self.image_proj_model = self.init_proj()
+
+ self.image_proj_model.load_state_dict(instantid_model["image_proj"])
+ self.ip_layers = To_KV(instantid_model["ip_adapter"])
+
+ def init_proj(self):
+ image_proj_model = Resampler(
+ dim=self.cross_attention_dim,
+ depth=4,
+ dim_head=64,
+ heads=20,
+ num_queries=self.clip_extra_context_tokens,
+ embedding_dim=self.clip_embeddings_dim,
+ output_dim=self.output_cross_attention_dim,
+ ff_mult=4
+ )
+ return image_proj_model
+
+ @torch.inference_mode()
+ def get_image_embeds(self, clip_embed, clip_embed_zeroed):
+ #image_prompt_embeds = clip_embed.clone().detach()
+ image_prompt_embeds = self.image_proj_model(clip_embed)
+ #uncond_image_prompt_embeds = clip_embed_zeroed.clone().detach()
+ uncond_image_prompt_embeds = self.image_proj_model(clip_embed_zeroed)
+
+ return image_prompt_embeds, uncond_image_prompt_embeds
+
+class ImageProjModel(torch.nn.Module):
+ def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
+ super().__init__()
+
+ self.cross_attention_dim = cross_attention_dim
+ self.clip_extra_context_tokens = clip_extra_context_tokens
+ self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
+
+ def forward(self, image_embeds):
+ embeds = image_embeds
+ clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
+ clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+ return clip_extra_context_tokens
+
+class To_KV(torch.nn.Module):
+ def __init__(self, state_dict):
+ super().__init__()
+
+ self.to_kvs = torch.nn.ModuleDict()
+ for key, value in state_dict.items():
+ k = key.replace(".weight", "").replace(".", "_")
+ self.to_kvs[k] = torch.nn.Linear(value.shape[1], value.shape[0], bias=False)
+ self.to_kvs[k].weight.data = value
+
+def _set_model_patch_replace(model, patch_kwargs, key):
+ to = model.model_options["transformer_options"].copy()
+ if "patches_replace" not in to:
+ to["patches_replace"] = {}
+ else:
+ to["patches_replace"] = to["patches_replace"].copy()
+
+ if "attn2" not in to["patches_replace"]:
+ to["patches_replace"]["attn2"] = {}
+ else:
+ to["patches_replace"]["attn2"] = to["patches_replace"]["attn2"].copy()
+
+ if key not in to["patches_replace"]["attn2"]:
+ to["patches_replace"]["attn2"][key] = Attn2Replace(instantid_attention, **patch_kwargs)
+ model.model_options["transformer_options"] = to
+ else:
+ to["patches_replace"]["attn2"][key].add(instantid_attention, **patch_kwargs)
+
+class InstantIDModelLoader:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": { "instantid_file": (folder_paths.get_filename_list("instantid"), )}}
+
+ RETURN_TYPES = ("INSTANTID",)
+ FUNCTION = "load_model"
+ CATEGORY = "InstantID"
+
+ def load_model(self, instantid_file):
+ ckpt_path = folder_paths.get_full_path("instantid", instantid_file)
+
+ model = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
+
+ if ckpt_path.lower().endswith(".safetensors"):
+ st_model = {"image_proj": {}, "ip_adapter": {}}
+ for key in model.keys():
+ if key.startswith("image_proj."):
+ st_model["image_proj"][key.replace("image_proj.", "")] = model[key]
+ elif key.startswith("ip_adapter."):
+ st_model["ip_adapter"][key.replace("ip_adapter.", "")] = model[key]
+ model = st_model
+
+ model = InstantID(
+ model,
+ cross_attention_dim=1280,
+ output_cross_attention_dim=model["ip_adapter"]["1.to_k_ip.weight"].shape[1],
+ clip_embeddings_dim=512,
+ clip_extra_context_tokens=16,
+ )
+
+ return (model,)
+
+def extractFeatures(insightface, image, extract_kps=False):
+ face_img = tensor_to_image(image)
+ out = []
+
+ insightface.det_model.input_size = (640,640) # reset the detection size
+
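+    # for each image in the batch, retry detection at progressively lower resolutions
+    # (640 down to 192, in steps of 64) until InsightFace finds at least one face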
+ for i in range(face_img.shape[0]):
+ for size in [(size, size) for size in range(640, 128, -64)]:
+ insightface.det_model.input_size = size # TODO: hacky but seems to be working
+ face = insightface.get(face_img[i])
+ if face:
+ face = sorted(face, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1]
+
+ if extract_kps:
+ out.append(draw_kps(face_img[i], face['kps']))
+ else:
+ out.append(torch.from_numpy(face['embedding']).unsqueeze(0))
+
+ if 640 not in size:
+ print(f"\033[33mINFO: InsightFace detection resolution lowered to {size}.\033[0m")
+ break
+
+ if out:
+ if extract_kps:
+ out = torch.stack(T.ToTensor()(out), dim=0).permute([0,2,3,1])
+ else:
+ out = torch.stack(out, dim=0)
+ else:
+ out = None
+
+ return out
+
+class InstantIDFaceAnalysis:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "provider": (["CPU", "CUDA", "ROCM", "CoreML"], ),
+ },
+ }
+
+ RETURN_TYPES = ("FACEANALYSIS",)
+ FUNCTION = "load_insight_face"
+ CATEGORY = "InstantID"
+
+ def load_insight_face(self, provider):
+ model = FaceAnalysis(name="antelopev2", root=INSIGHTFACE_DIR, providers=[provider + 'ExecutionProvider',]) # alternative to buffalo_l
+ model.prepare(ctx_id=0, det_size=(640, 640))
+
+ return (model,)
+
+class FaceKeypointsPreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "faceanalysis": ("FACEANALYSIS", ),
+ "image": ("IMAGE", ),
+ },
+ }
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "preprocess_image"
+ CATEGORY = "InstantID"
+
+ def preprocess_image(self, faceanalysis, image):
+ face_kps = extractFeatures(faceanalysis, image, extract_kps=True)
+
+ if face_kps is None:
+ face_kps = torch.zeros_like(image)
+ print(f"\033[33mWARNING: no face detected, unable to extract the keypoints!\033[0m")
+ #raise Exception('Face Keypoints Image: No face detected.')
+
+ return (face_kps,)
+
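+# Note: currently unused; ApplyInstantID injects noise into the negative embeds directly
+# (see the commented-out call in apply_instantid below).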
+def add_noise(image, factor):
+ seed = int(torch.sum(image).item()) % 1000000007
+ torch.manual_seed(seed)
+ mask = (torch.rand_like(image) < factor).float()
+ noise = torch.rand_like(image)
+ noise = torch.zeros_like(image) * (1-mask) + noise * mask
+
+ return factor*noise
+
+class ApplyInstantID:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "instantid": ("INSTANTID", ),
+ "insightface": ("FACEANALYSIS", ),
+ "control_net": ("CONTROL_NET", ),
+ "image": ("IMAGE", ),
+ "model": ("MODEL", ),
+ "positive": ("CONDITIONING", ),
+ "negative": ("CONDITIONING", ),
+ "weight": ("FLOAT", {"default": .8, "min": 0.0, "max": 5.0, "step": 0.01, }),
+ "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ },
+ "optional": {
+ "image_kps": ("IMAGE",),
+ "mask": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("MODEL", "CONDITIONING", "CONDITIONING",)
+ RETURN_NAMES = ("MODEL", "positive", "negative", )
+ FUNCTION = "apply_instantid"
+ CATEGORY = "InstantID"
+
+ def apply_instantid(self, instantid, insightface, control_net, image, model, positive, negative, start_at, end_at, weight=.8, ip_weight=None, cn_strength=None, noise=0.35, image_kps=None, mask=None, combine_embeds='average'):
+ dtype = comfy.model_management.unet_dtype()
+ if dtype not in [torch.float32, torch.float16, torch.bfloat16]:
+ dtype = torch.float16 if comfy.model_management.should_use_fp16() else torch.float32
+
+ self.dtype = dtype
+ self.device = comfy.model_management.get_torch_device()
+
+ ip_weight = weight if ip_weight is None else ip_weight
+ cn_strength = weight if cn_strength is None else cn_strength
+
+ face_embed = extractFeatures(insightface, image)
+ if face_embed is None:
+ raise Exception('Reference Image: No face detected.')
+
+ # if no keypoints image is provided, use the image itself (only the first one in the batch)
+ face_kps = extractFeatures(insightface, image_kps if image_kps is not None else image[0].unsqueeze(0), extract_kps=True)
+
+ if face_kps is None:
+ face_kps = torch.zeros_like(image) if image_kps is None else image_kps
+ print(f"\033[33mWARNING: No face detected in the keypoints image!\033[0m")
+
+ clip_embed = face_embed
+ # InstantID works better with averaged embeds (TODO: needs testing)
+ if clip_embed.shape[0] > 1:
+ if combine_embeds == 'average':
+ clip_embed = torch.mean(clip_embed, dim=0).unsqueeze(0)
+ elif combine_embeds == 'norm average':
+ clip_embed = torch.mean(clip_embed / torch.norm(clip_embed, dim=0, keepdim=True), dim=0).unsqueeze(0)
+
+ if noise > 0:
+ seed = int(torch.sum(clip_embed).item()) % 1000000007
+ torch.manual_seed(seed)
+ clip_embed_zeroed = noise * torch.rand_like(clip_embed)
+ #clip_embed_zeroed = add_noise(clip_embed, noise)
+ else:
+ clip_embed_zeroed = torch.zeros_like(clip_embed)
+
+ # 1: patch the attention
+ self.instantid = instantid
+ self.instantid.to(self.device, dtype=self.dtype)
+
+ image_prompt_embeds, uncond_image_prompt_embeds = self.instantid.get_image_embeds(clip_embed.to(self.device, dtype=self.dtype), clip_embed_zeroed.to(self.device, dtype=self.dtype))
+
+ image_prompt_embeds = image_prompt_embeds.to(self.device, dtype=self.dtype)
+ uncond_image_prompt_embeds = uncond_image_prompt_embeds.to(self.device, dtype=self.dtype)
+
+ work_model = model.clone()
+
+ sigma_start = model.get_model_object("model_sampling").percent_to_sigma(start_at)
+ sigma_end = model.get_model_object("model_sampling").percent_to_sigma(end_at)
+
+ if mask is not None:
+ mask = mask.to(self.device)
+
+ patch_kwargs = {
+ "ipadapter": self.instantid,
+ "weight": ip_weight,
+ "cond": image_prompt_embeds,
+ "uncond": uncond_image_prompt_embeds,
+ "mask": mask,
+ "sigma_start": sigma_start,
+ "sigma_end": sigma_end,
+ }
+
+ number = 0
+ for id in [4,5,7,8]: # id of input_blocks that have cross attention
+ block_indices = range(2) if id in [4, 5] else range(10) # transformer_depth
+ for index in block_indices:
+ patch_kwargs["module_key"] = str(number*2+1)
+ _set_model_patch_replace(work_model, patch_kwargs, ("input", id, index))
+ number += 1
+ for id in range(6): # id of output_blocks that have cross attention
+ block_indices = range(2) if id in [3, 4, 5] else range(10) # transformer_depth
+ for index in block_indices:
+ patch_kwargs["module_key"] = str(number*2+1)
+ _set_model_patch_replace(work_model, patch_kwargs, ("output", id, index))
+ number += 1
+ for index in range(10):
+ patch_kwargs["module_key"] = str(number*2+1)
+ _set_model_patch_replace(work_model, patch_kwargs, ("middle", 1, index))
+ number += 1
+
+ # 2: do the ControlNet
+ if mask is not None and len(mask.shape) < 3:
+ mask = mask.unsqueeze(0)
+
+ cnets = {}
+ cond_uncond = []
+
+ is_cond = True
+ for conditioning in [positive, negative]:
+ c = []
+ for t in conditioning:
+ d = t[1].copy()
+
+ prev_cnet = d.get('control', None)
+ if prev_cnet in cnets:
+ c_net = cnets[prev_cnet]
+ else:
+ c_net = control_net.copy().set_cond_hint(face_kps.movedim(-1,1), cn_strength, (start_at, end_at))
+ c_net.set_previous_controlnet(prev_cnet)
+ cnets[prev_cnet] = c_net
+
+ d['control'] = c_net
+ d['control_apply_to_uncond'] = False
+ d['cross_attn_controlnet'] = image_prompt_embeds.to(comfy.model_management.intermediate_device(), dtype=c_net.cond_hint_original.dtype) if is_cond else uncond_image_prompt_embeds.to(comfy.model_management.intermediate_device(), dtype=c_net.cond_hint_original.dtype)
+
+ if mask is not None and is_cond:
+ d['mask'] = mask
+ d['set_area_to_bounds'] = False
+
+ n = [t[0], d]
+ c.append(n)
+ cond_uncond.append(c)
+ is_cond = False
+
+ return(work_model, cond_uncond[0], cond_uncond[1], )
+
+class ApplyInstantIDAdvanced(ApplyInstantID):
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "instantid": ("INSTANTID", ),
+ "insightface": ("FACEANALYSIS", ),
+ "control_net": ("CONTROL_NET", ),
+ "image": ("IMAGE", ),
+ "model": ("MODEL", ),
+ "positive": ("CONDITIONING", ),
+ "negative": ("CONDITIONING", ),
+ "ip_weight": ("FLOAT", {"default": .8, "min": 0.0, "max": 3.0, "step": 0.01, }),
+ "cn_strength": ("FLOAT", {"default": .8, "min": 0.0, "max": 10.0, "step": 0.01, }),
+ "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ "noise": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.1, }),
+ "combine_embeds": (['average', 'norm average', 'concat'], {"default": 'average'}),
+ },
+ "optional": {
+ "image_kps": ("IMAGE",),
+ "mask": ("MASK",),
+ }
+ }
+
+class InstantIDAttentionPatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "instantid": ("INSTANTID", ),
+ "insightface": ("FACEANALYSIS", ),
+ "image": ("IMAGE", ),
+ "model": ("MODEL", ),
+ "weight": ("FLOAT", {"default": 1.0, "min": -1.0, "max": 3.0, "step": 0.01, }),
+ "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ "noise": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.1, }),
+ },
+ "optional": {
+ "mask": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("MODEL", "FACE_EMBEDS")
+ FUNCTION = "patch_attention"
+ CATEGORY = "InstantID"
+
+ def patch_attention(self, instantid, insightface, image, model, weight, start_at, end_at, noise=0.0, mask=None):
+ self.dtype = torch.float16 if comfy.model_management.should_use_fp16() else torch.float32
+ self.device = comfy.model_management.get_torch_device()
+
+ face_embed = extractFeatures(insightface, image)
+ if face_embed is None:
+ raise Exception('Reference Image: No face detected.')
+
+ clip_embed = face_embed
+ # InstantID works better with averaged embeds (TODO: needs testing)
+ if clip_embed.shape[0] > 1:
+ clip_embed = torch.mean(clip_embed, dim=0).unsqueeze(0)
+
+ if noise > 0:
+ seed = int(torch.sum(clip_embed).item()) % 1000000007
+ torch.manual_seed(seed)
+ clip_embed_zeroed = noise * torch.rand_like(clip_embed)
+ else:
+ clip_embed_zeroed = torch.zeros_like(clip_embed)
+
+ # 1: patch the attention
+ self.instantid = instantid
+ self.instantid.to(self.device, dtype=self.dtype)
+
+ image_prompt_embeds, uncond_image_prompt_embeds = self.instantid.get_image_embeds(clip_embed.to(self.device, dtype=self.dtype), clip_embed_zeroed.to(self.device, dtype=self.dtype))
+
+ image_prompt_embeds = image_prompt_embeds.to(self.device, dtype=self.dtype)
+ uncond_image_prompt_embeds = uncond_image_prompt_embeds.to(self.device, dtype=self.dtype)
+
+ if weight == 0:
+ return (model, { "cond": image_prompt_embeds, "uncond": uncond_image_prompt_embeds } )
+
+ work_model = model.clone()
+
+ sigma_start = model.get_model_object("model_sampling").percent_to_sigma(start_at)
+ sigma_end = model.get_model_object("model_sampling").percent_to_sigma(end_at)
+
+ if mask is not None:
+ mask = mask.to(self.device)
+
+ patch_kwargs = {
+ "weight": weight,
+ "ipadapter": self.instantid,
+ "cond": image_prompt_embeds,
+ "uncond": uncond_image_prompt_embeds,
+ "mask": mask,
+ "sigma_start": sigma_start,
+ "sigma_end": sigma_end,
+ }
+
+ number = 0
+ for id in [4,5,7,8]: # id of input_blocks that have cross attention
+ block_indices = range(2) if id in [4, 5] else range(10) # transformer_depth
+ for index in block_indices:
+ patch_kwargs["module_key"] = str(number*2+1)
+ _set_model_patch_replace(work_model, patch_kwargs, ("input", id, index))
+ number += 1
+ for id in range(6): # id of output_blocks that have cross attention
+ block_indices = range(2) if id in [3, 4, 5] else range(10) # transformer_depth
+ for index in block_indices:
+ patch_kwargs["module_key"] = str(number*2+1)
+ _set_model_patch_replace(work_model, patch_kwargs, ("output", id, index))
+ number += 1
+ for index in range(10):
+ patch_kwargs["module_key"] = str(number*2+1)
+ _set_model_patch_replace(work_model, patch_kwargs, ("middle", 0, index))
+ number += 1
+
+ return(work_model, { "cond": image_prompt_embeds, "uncond": uncond_image_prompt_embeds }, )
+
+class ApplyInstantIDControlNet:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "face_embeds": ("FACE_EMBEDS", ),
+ "control_net": ("CONTROL_NET", ),
+ "image_kps": ("IMAGE", ),
+ "positive": ("CONDITIONING", ),
+ "negative": ("CONDITIONING", ),
+ "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01, }),
+ "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001, }),
+ },
+ "optional": {
+ "mask": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("CONDITIONING", "CONDITIONING",)
+ RETURN_NAMES = ("positive", "negative", )
+ FUNCTION = "apply_controlnet"
+ CATEGORY = "InstantID"
+
+ def apply_controlnet(self, face_embeds, control_net, image_kps, positive, negative, strength, start_at, end_at, mask=None):
+ self.device = comfy.model_management.get_torch_device()
+
+ if strength == 0:
+ return (positive, negative)
+
+ if mask is not None:
+ mask = mask.to(self.device)
+
+ if mask is not None and len(mask.shape) < 3:
+ mask = mask.unsqueeze(0)
+
+ image_prompt_embeds = face_embeds['cond']
+ uncond_image_prompt_embeds = face_embeds['uncond']
+
+ cnets = {}
+ cond_uncond = []
+ control_hint = image_kps.movedim(-1,1)
+
+ is_cond = True
+ for conditioning in [positive, negative]:
+ c = []
+ for t in conditioning:
+ d = t[1].copy()
+
+ prev_cnet = d.get('control', None)
+ if prev_cnet in cnets:
+ c_net = cnets[prev_cnet]
+ else:
+ c_net = control_net.copy().set_cond_hint(control_hint, strength, (start_at, end_at))
+ c_net.set_previous_controlnet(prev_cnet)
+ cnets[prev_cnet] = c_net
+
+ d['control'] = c_net
+ d['control_apply_to_uncond'] = False
+ d['cross_attn_controlnet'] = image_prompt_embeds.to(comfy.model_management.intermediate_device()) if is_cond else uncond_image_prompt_embeds.to(comfy.model_management.intermediate_device())
+
+ if mask is not None and is_cond:
+ d['mask'] = mask
+ d['set_area_to_bounds'] = False
+
+ n = [t[0], d]
+ c.append(n)
+ cond_uncond.append(c)
+ is_cond = False
+
+ return(cond_uncond[0], cond_uncond[1])
+
+
+NODE_CLASS_MAPPINGS = {
+ "InstantIDModelLoader": InstantIDModelLoader,
+ "InstantIDFaceAnalysis": InstantIDFaceAnalysis,
+ "ApplyInstantID": ApplyInstantID,
+ "ApplyInstantIDAdvanced": ApplyInstantIDAdvanced,
+ "FaceKeypointsPreprocessor": FaceKeypointsPreprocessor,
+
+ "InstantIDAttentionPatch": InstantIDAttentionPatch,
+ "ApplyInstantIDControlNet": ApplyInstantIDControlNet,
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "InstantIDModelLoader": "Load InstantID Model",
+ "InstantIDFaceAnalysis": "InstantID Face Analysis",
+ "ApplyInstantID": "Apply InstantID",
+ "ApplyInstantIDAdvanced": "Apply InstantID Advanced",
+ "FaceKeypointsPreprocessor": "Face Keypoints Preprocessor",
+
+ "InstantIDAttentionPatch": "InstantID Patch Attention",
+ "ApplyInstantIDControlNet": "InstantID Apply ControlNet",
+}
diff --git a/ComfyUI_InstantID/LICENSE b/ComfyUI_InstantID/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..29f81d812f3e768fa89638d1f72920dbfd1413a8
--- /dev/null
+++ b/ComfyUI_InstantID/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/ComfyUI_InstantID/README.md b/ComfyUI_InstantID/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f24fe521ec7e69d29b42e56a2206a82d2bf79c46
--- /dev/null
+++ b/ComfyUI_InstantID/README.md
@@ -0,0 +1,141 @@
+# ComfyUI InstantID (Native Support)
+
+## Translations
+- [简体中文 (Simplified Chinese)](./README.zh-CN.md)
+
+Native [InstantID](https://github.com/InstantID/InstantID) support for [ComfyUI](https://github.com/comfyanonymous/ComfyUI).
+
+This extension differs from the many already available in that it doesn't use *diffusers* but instead implements InstantID natively and integrates fully with ComfyUI.
+
+# Sponsorship
+
+
+
+**[:heart: Github Sponsor](https://github.com/sponsors/cubiq) | [:coin: Paypal](https://paypal.me/matt3o)**
+
+
+
+If you like my work and wish to see updates and new features please consider sponsoring my projects.
+
+- [ComfyUI IPAdapter Plus](https://github.com/cubiq/ComfyUI_IPAdapter_plus)
+- [ComfyUI InstantID (Native)](https://github.com/cubiq/ComfyUI_InstantID)
+- [ComfyUI Essentials](https://github.com/cubiq/ComfyUI_essentials)
+- [ComfyUI FaceAnalysis](https://github.com/cubiq/ComfyUI_FaceAnalysis)
+
+Not to mention the documentation and video tutorials. Check my **ComfyUI Advanced Understanding** videos on YouTube, for example [part 1](https://www.youtube.com/watch?v=_C7kR2TFIX0) and [part 2](https://www.youtube.com/watch?v=ijqXnW_9gzc).
+
+The only way to keep the code open and free is by sponsoring its development. The more sponsorships, the more time I can dedicate to my open source projects.
+
+Please consider a [Github Sponsorship](https://github.com/sponsors/cubiq) or [PayPal donation](https://paypal.me/matt3o) (Matteo "matt3o" Spinelli). For sponsorships of $50+, let me know if you'd like to be mentioned in this readme file; you can find me on [Discord](https://latent.vision/discord) or at _matt3o :snail: gmail.com_.
+
+## Important updates
+
+- **2024/02/27:** Added [noise injection](#noise-injection) in the negative embeds.
+
+- **2024/02/26:** Fixed a small but nasty bug. Results will be different and you may need to lower the CFG.
+
+- **2024/02/20:** I refactored the nodes so they are hopefully easier to use. **This is a breaking update**; the previous workflows won't work anymore.
+
+## Basic Workflow
+
+In the `examples` directory you'll find some basic workflows.
+
+![workflow](examples/instantid_basic_workflow.jpg)
+
+## Video Tutorial
+
+
+
+
+
+**:movie_camera: [Introduction to InstantID features](https://youtu.be/wMLiGhogOPE)**
+
+## Installation
+
+**Upgrade ComfyUI to the latest version!**
+
+Download or `git clone` this repository into the `ComfyUI/custom_nodes/` directory, or install it via the Manager.
+
+InstantID requires `insightface`; install it in your Python environment together with `onnxruntime` and `onnxruntime-gpu`.
+
+The InsightFace model is **antelopev2** (not the classic buffalo_l). Download the models (for example from [here](https://drive.google.com/file/d/18wEUfMNohBJ4K3Ly5wpTejPfDzp-8fI8/view?usp=sharing) or [here](https://huggingface.co/MonsterMMORPG/tools/tree/main)), unzip and place them in the `ComfyUI/models/insightface/models/antelopev2` directory.
+
+The **main model** can be downloaded from [HuggingFace](https://huggingface.co/InstantX/InstantID/resolve/main/ip-adapter.bin?download=true) and should be placed into the `ComfyUI/models/instantid` directory. (Note that the model is called *ip_adapter* as it is based on the [IPAdapter](https://github.com/tencent-ailab/IP-Adapter)).
+
+You also need a [controlnet](https://huggingface.co/InstantX/InstantID/resolve/main/ControlNetModel/diffusion_pytorch_model.safetensors?download=true); place it in the `ComfyUI/models/controlnet` directory.
+
+**Remember at the moment this is only for SDXL.**
+
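+As a quick sanity check after installation, a minimal sketch along these lines (assuming the default ComfyUI folder layout and the file names from the download links above; adjust the paths if you renamed anything) can confirm the models ended up in the expected folders:
+
+```python
+import os
+
+comfy_root = "ComfyUI"  # path to your ComfyUI installation
+expected = [
+    os.path.join(comfy_root, "models", "insightface", "models", "antelopev2"),
+    os.path.join(comfy_root, "models", "instantid", "ip-adapter.bin"),
+    os.path.join(comfy_root, "models", "controlnet", "diffusion_pytorch_model.safetensors"),
+]
+
+for path in expected:
+    print(("OK      " if os.path.exists(path) else "MISSING ") + path)
+```
+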
+## Watermarks!
+
+The training data is full of watermarks. To keep them from showing up in your generations, use a resolution slightly different from 1024×1024 (or the other standard ones); for example **1016×1016** works pretty well.
+
+## Lower the CFG!
+
+It's important to lower the CFG to about 4–5, or you can use the `RescaleCFG` node.
+
+## Face keypoints
+
+The person is posed based on the keypoints generated from the reference image. You can use a different pose by sending an image to the `image_kps` input.
+
+
+
+## Noise Injection
+
+The default InstantID implementation tends to really burn the image. I find that injecting noise into the negative embeds mitigates the effect and also increases the likeness to the reference. The default Apply InstantID node automatically injects 35% noise; if you want to fine-tune the effect you can use the Advanced InstantID node.
+
+This is still experimental and may change in the future.
+
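+For reference, the snippet below is a minimal sketch of what the node does internally to build the negative embeds (it mirrors the logic in `InstantID.py` of this repository); it is only meant to illustrate the idea:
+
+```python
+import torch
+
+def noisy_uncond_embeds(clip_embed: torch.Tensor, noise: float) -> torch.Tensor:
+    if noise > 0:
+        # deterministic seed derived from the face embedding itself
+        seed = int(torch.sum(clip_embed).item()) % 1000000007
+        torch.manual_seed(seed)
+        return noise * torch.rand_like(clip_embed)
+    return torch.zeros_like(clip_embed)
+
+# 0.35 is the amount of noise injected by the default Apply InstantID node
+uncond = noisy_uncond_embeds(torch.randn(1, 512), noise=0.35)
+```
+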
+## Additional Controlnets
+
+You can add more controlnets to the generation. An example workflow for depth controlnet is provided.
+
+## Styling with IPAdapter
+
+It's possible to style the composition with IPAdapter. An example is provided.
+
+
+
+## Multi-ID
+
+Multi-ID is supported, but the workflow is a bit complicated and generation is slower. I'll check if I can find a better way of doing it. The "hackish" workflow is provided in the `examples` directory.
+
+
+
+## Advanced Node
+
+There's an advanced InstantID node available; at the moment the only difference from the standard one is that you can set the weights for the InstantID model and the controlnet separately. It now also includes a noise injection option. It might be helpful for fine-tuning.
+
+The InstantID model accounts for roughly 25% of the influence on the composition; the rest comes from the controlnet.
+
+The noise helps reduce the "burn" effect.
+
+## Other notes
+
+It works very well with SDXL Turbo/Lightning. Best results are achieved with community checkpoints.
+
+
+## Current sponsors
+
+It's only thanks to generous sponsors that **the whole community** can enjoy open and free software. Please join me in thanking the following companies and individuals!
+
+### :trophy: Gold sponsors
+
+[![Kaiber.ai](https://f.latent.vision/imgs/kaiber.png)](https://kaiber.ai/) [![InstaSD](https://f.latent.vision/imgs/instasd.png)](https://www.instasd.com/)
+
+### :tada: Silver sponsors
+
+[![OpenArt.ai](https://f.latent.vision/imgs/openart.png?r=1)](https://openart.ai/workflows) [![Finetuners](https://f.latent.vision/imgs/finetuners.png)](https://www.finetuners.ai/) [![Comfy.ICU](https://f.latent.vision/imgs/comfyicu.png?r=1)](https://comfy.icu/)
+
+### Other companies supporting my projects
+
+- [RunComfy](https://www.runcomfy.com/) (ComfyUI Cloud)
+
+### Esteemed individuals
+
+- [Øystein Ø. Olsen](https://github.com/FireNeslo)
+- [Jack Gane](https://github.com/ganeJackS)
+- [Nathan Shipley](https://www.nathanshipley.com/)
+- [Dkdnzia](https://github.com/Dkdnzia)
+
+[And all my public and private sponsors!](https://github.com/sponsors/cubiq)
diff --git a/ComfyUI_InstantID/README.zh-CN.md b/ComfyUI_InstantID/README.zh-CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..26a13ce15c98553dc3c7de5f3d1d7b0061a63b90
--- /dev/null
+++ b/ComfyUI_InstantID/README.zh-CN.md
@@ -0,0 +1,137 @@
+# ComfyUI InstantID (原生支持)
+
+[InstantID](https://github.com/InstantID/InstantID) 的原生 [ComfyUI](https://github.com/comfyanonymous/ComfyUI) 支持。
+
+此扩展不同于许多已可用的扩展,因为它不使用 *diffusers*,而是原生实现了 InstantID,并且与 ComfyUI 完全集成。
+
+# 赞助
+
+
+
+**[:heart: Github 赞助](https://github.com/sponsors/cubiq) | [:coin: Paypal](https://paypal.me/matt3o)**
+
+
+
+如果您喜欢我的工作并希望看到更新和新功能,请考虑赞助我的项目。
+
+- [ComfyUI IPAdapter Plus](https://github.com/cubiq/ComfyUI_IPAdapter_plus)
+- [ComfyUI InstantID (原生)](https://github.com/cubiq/ComfyUI_InstantID)
+- [ComfyUI Essentials](https://github.com/cubiq/ComfyUI_essentials)
+- [ComfyUI FaceAnalysis](https://github.com/cubiq/ComfyUI_FaceAnalysis)
+
+更不用说文档和视频教程。可以查看我在 YouTube 上的 **ComfyUI 高级理解** 视频,例如 [第 1 部分](https://www.youtube.com/watch?v=_C7kR2TFIX0) 和 [第 2 部分](https://www.youtube.com/watch?v=ijqXnW_9gzc)。
+
+保持代码开源和免费的唯一方法是通过赞助其开发。赞助越多,我就能投入更多时间在我的开源项目上。
+
+请考虑 [Github 赞助](https://github.com/sponsors/cubiq) 或 [PayPal 捐赠](https://paypal.me/matt3o)(Matteo "matt3o" Spinelli)。对于赞助 $50+ 的人,请告诉我是否希望在此 README 文件中被提及,您可以在 [Discord](https://latent.vision/discord) 或通过 _matt3o :snail: gmail.com_ 联系我。
+
+## 重要更新
+
+- **2024/02/27:** 在负嵌入中添加了[噪声注入](#noise-injection)。
+
+- **2024/02/26:** 修复了一个小但讨厌的错误。结果将有所不同,您可能需要降低 CFG。
+
+- **2024/02/20:** 我重构了节点,希望它们更易于使用。**这是一次重大更新**,以前的工作流将不再可用。
+
+## 基本工作流
+
+在 `examples` 目录中,您会找到一些基本工作流。
+
+![workflow](examples/instantid_basic_workflow.jpg)
+
+## 视频教程
+
+
+
+
+
+** :movie_camera: [InstantID 功能介绍](https://youtu.be/wMLiGhogOPE)**
+
+## 安装
+
+**将 ComfyUI 升级到最新版本!**
+
+下载或 `git clone` 此仓库到 `ComfyUI/custom_nodes/` 目录或使用 Manager。
+
+InstantID 需要 `insightface`,您需要将其添加到您的库中,连同 `onnxruntime` 和 `onnxruntime-gpu`。
+
+InsightFace 模型是 **antelopev2**(不是经典的 buffalo_l)。下载模型(例如从 [这里](https://drive.google.com/file/d/18wEUfMNohBJ4K3Ly5wpTejPfDzp-8fI8/view?usp=sharing) 或 [这里](https://huggingface.co/MonsterMMORPG/tools/tree/main)),解压并将其放置在 `ComfyUI/models/insightface/models/antelopev2` 目录中。
+
+**主模型**可以从 [HuggingFace](https://huggingface.co/InstantX/InstantID/resolve/main/ip-adapter.bin?download=true) 下载,应将其放置在 `ComfyUI/models/instantid` 目录中。(请注意,该模型称为 *ip_adapter*,因为它基于 [IPAdapter](https://github.com/tencent-ailab/IP-Adapter))。
+
+您还需要一个 [controlnet](https://huggingface.co/InstantX/InstantID/resolve/main/ControlNetModel/diffusion_pytorch_model.safetensors?download=true),将其放置在 ComfyUI controlnet 目录中。
+
+**请记住,目前这仅适用于 SDXL。**
+
+## 水印!
+
+训练数据中充满了水印,为避免水印出现在您的生成中,请使用与 1024×1024(或标准尺寸)略有不同的分辨率,例如 **1016×1016** 效果很好。
+
+## 降低 CFG!
+
+重要的是将 CFG 降低到至少 4/5,或者您可以使用 `RescaleCFG` 节点。
+
+## 面部关键点
+
+人物的姿势是基于从参考图像生成的关键点。您可以通过向 `image_kps` 输入发送图像来使用不同的姿势。
+
+
+
+## 噪声注入
+
+默认的 InstantID 实现似乎真的“烧坏”了图像,我发现通过向负嵌入中注入噪声,我们可以缓解这一效果,并增加与参考的相似性。默认的 Apply InstantID 节点自动注入 35% 的噪声,如果您想微调效果,可以使用 Advanced InstantID 节点。
+
+这仍然是实验性的,可能会在未来发生变化。
+
+## 额外的 Controlnets
+
+您可以向生成中添加更多 controlnets。提供了一个用于深度 controlnet 的示例工作流。
+
+## 使用 IPAdapter 进行样式化
+
+可以使用 IPAdapter 对构图进行样式化。提供了一个示例。
+
+
+
+## 多-ID 支持
+
+支持多 ID,但工作流有点复杂,生成速度较慢。我会检查是否可以找到更好的方法。示例工作流在 examples 目录中提供。
+
+
+
+## 高级节点
+
+目前有一个高级的 InstantID 节点,当前与标准节点的唯一区别是您可以分别设置 instantID 模型和 controlnet 的权重。它现在还包括一个噪声注入选项。对于微调可能很有帮助。
+
+instantID 模型对构图的影响约为 25%,其余的是 controlnet。
+
+噪声有助于减少“燃烧”效果。
+
+## 其他注意事项
+
+它与 SDXL Turbo/Lightning 非常兼容。使用社区的检查点效果最好。
+
+## 当前赞助商
+
+正是由于慷慨的赞助商,**整个社区**才能享受开源和免费软件。请与我一起感谢以下公司和个人!
+
+### :trophy: 金牌赞助商
+
+[![Kaiber.ai](https://f.latent.vision/imgs/kaiber.png)](https://kaiber.ai/) [![InstaSD](https://f.latent.vision/imgs/instasd.png)](https://www.instasd.com/)
+
+### :tada: 银牌赞助商
+
+[![OperArt.ai](https://f.latent.vision/imgs/openart.png?r=1)](https://openart.ai/workflows) [![Finetuners](https://f.latent.vision/imgs/finetuners.png)](https://www.finetuners.ai/) [![Comfy.ICU](https://f.latent.vision/imgs/comfyicu.png?r=1)](https://comfy.icu/)
+
+### 其他支持我项目的公司
+
+- [RunComfy](https://www.runcomfy.com/) (ComfyUI 云)
+
+### 尊敬的个人
+
+- [Øystein Ø. Olsen](https://github.com/FireNeslo)
+- [Jack Gane](https://github.com/ganeJackS)
+- [Nathan Shipley](https://www.nathanshipley.com/)
+- [Dkdnzia](https://github.com/Dkdnzia)
+
+[以及所有我的公开和私密赞助商!](https://github.com/sponsors/cubiq)
diff --git a/ComfyUI_InstantID/__init__.py b/ComfyUI_InstantID/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1167f34c0bf8cd4c7812632239d733b2620625f
--- /dev/null
+++ b/ComfyUI_InstantID/__init__.py
@@ -0,0 +1,3 @@
+from .InstantID import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+
+__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
diff --git a/ComfyUI_InstantID/__pycache__/CrossAttentionPatch.cpython-312.pyc b/ComfyUI_InstantID/__pycache__/CrossAttentionPatch.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b108f710b605945d6c2d500232251a6886ed60ef
Binary files /dev/null and b/ComfyUI_InstantID/__pycache__/CrossAttentionPatch.cpython-312.pyc differ
diff --git a/ComfyUI_InstantID/__pycache__/InstantID.cpython-312.pyc b/ComfyUI_InstantID/__pycache__/InstantID.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15ea39b70d135fc58a7ecf803ffa079abcee8510
Binary files /dev/null and b/ComfyUI_InstantID/__pycache__/InstantID.cpython-312.pyc differ
diff --git a/ComfyUI_InstantID/__pycache__/__init__.cpython-312.pyc b/ComfyUI_InstantID/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a78ddbb8a63bc2e0002452e1af3ace322e50c73a
Binary files /dev/null and b/ComfyUI_InstantID/__pycache__/__init__.cpython-312.pyc differ
diff --git a/ComfyUI_InstantID/__pycache__/resampler.cpython-312.pyc b/ComfyUI_InstantID/__pycache__/resampler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef8fb503d57c9889246e6e9a974cbef7add65334
Binary files /dev/null and b/ComfyUI_InstantID/__pycache__/resampler.cpython-312.pyc differ
diff --git a/ComfyUI_InstantID/__pycache__/utils.cpython-312.pyc b/ComfyUI_InstantID/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95448ce3e3ec86115c787d9ba98916e2dd825a60
Binary files /dev/null and b/ComfyUI_InstantID/__pycache__/utils.cpython-312.pyc differ
diff --git a/ComfyUI_InstantID/examples/InstantID_IPAdapter.json b/ComfyUI_InstantID/examples/InstantID_IPAdapter.json
new file mode 100644
index 0000000000000000000000000000000000000000..1dc50a4ac48063d53c229a8dbffc273e08398b8e
--- /dev/null
+++ b/ComfyUI_InstantID/examples/InstantID_IPAdapter.json
@@ -0,0 +1,861 @@
+{
+ "last_node_id": 72,
+ "last_link_id": 231,
+ "nodes": [
+ {
+ "id": 11,
+ "type": "InstantIDModelLoader",
+ "pos": [
+ 560,
+ 70
+ ],
+ "size": {
+ "0": 238.72393798828125,
+ "1": 58
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INSTANTID",
+ "type": "INSTANTID",
+ "links": [
+ 197
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDModelLoader"
+ },
+ "widgets_values": [
+ "ip-adapter.bin"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "InstantIDFaceAnalysis",
+ "pos": [
+ 570,
+ 180
+ ],
+ "size": {
+ "0": 227.09793090820312,
+ "1": 58
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "FACEANALYSIS",
+ "type": "FACEANALYSIS",
+ "links": [
+ 198
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDFaceAnalysis"
+ },
+ "widgets_values": [
+ "CPU"
+ ]
+ },
+ {
+ "id": 16,
+ "type": "ControlNetLoader",
+ "pos": [
+ 560,
+ 290
+ ],
+ "size": {
+ "0": 250.07241821289062,
+ "1": 58
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "CONTROL_NET",
+ "type": "CONTROL_NET",
+ "links": [
+ 199
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ControlNetLoader"
+ },
+ "widgets_values": [
+ "instantid/diffusion_pytorch_model.safetensors"
+ ]
+ },
+ {
+ "id": 15,
+ "type": "PreviewImage",
+ "pos": [
+ 1910,
+ 290
+ ],
+ "size": {
+ "0": 584.0855712890625,
+ "1": 610.4592895507812
+ },
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 19
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 910,
+ 540
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 2
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 1016,
+ 1016,
+ 1
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1910,
+ 200
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 19
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 39,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 430
+ ],
+ "size": {
+ "0": 291.9967346191406,
+ "1": 128.62518310546875
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 122
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 203
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "comic character. graphic illustration, comic art, graphic novel art, vibrant, highly detailed"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 620
+ ],
+ "size": {
+ "0": 286.3603515625,
+ "1": 112.35245513916016
+ },
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 123
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 204
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "photograph, deformed, glitch, noisy, realistic, stock photo"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 70,
+ 520
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 206
+ ],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 122,
+ 123
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 8
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sdxl/AlbedoBaseXL.safetensors"
+ ]
+ },
+ {
+ "id": 13,
+ "type": "LoadImage",
+ "pos": [
+ 290,
+ 70
+ ],
+ "size": {
+ "0": 210,
+ "1": 314
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 214
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "joseph-gonzalez-iFgRcqHznqg-unsplash.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1540,
+ 200
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 231
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 200
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 201
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1631591432,
+ "fixed",
+ 30,
+ 4.5,
+ "ddpm",
+ "karras",
+ 1
+ ]
+ },
+ {
+ "id": 68,
+ "type": "IPAdapterModelLoader",
+ "pos": [
+ 830,
+ -500
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IPADAPTER",
+ "type": "IPADAPTER",
+ "links": [
+ 227
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "IPAdapterModelLoader"
+ },
+ "widgets_values": [
+ "ip-adapter-plus_sdxl_vit-h.safetensors"
+ ]
+ },
+ {
+ "id": 60,
+ "type": "ApplyInstantID",
+ "pos": [
+ 910,
+ 210
+ ],
+ "size": {
+ "0": 315,
+ "1": 266
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "instantid",
+ "type": "INSTANTID",
+ "link": 197
+ },
+ {
+ "name": "insightface",
+ "type": "FACEANALYSIS",
+ "link": 198
+ },
+ {
+ "name": "control_net",
+ "type": "CONTROL_NET",
+ "link": 199
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 214
+ },
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 206
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 203
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 204
+ },
+ {
+ "name": "image_kps",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 230
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "POSITIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 200
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "NEGATIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 201
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ApplyInstantID"
+ },
+ "widgets_values": [
+ 0.8,
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 70,
+ "type": "CLIPVisionLoader",
+ "pos": [
+ 830,
+ -390
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "CLIP_VISION",
+ "type": "CLIP_VISION",
+ "links": [
+ 228
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPVisionLoader"
+ },
+ "widgets_values": [
+ "CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors"
+ ]
+ },
+ {
+ "id": 71,
+ "type": "LoadImage",
+ "pos": [
+ 830,
+ -280
+ ],
+ "size": {
+ "0": 315,
+ "1": 314
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 229
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "anime_colorful.png",
+ "image"
+ ]
+ },
+ {
+ "id": 72,
+ "type": "IPAdapterAdvanced",
+ "pos": [
+ 1226,
+ -337
+ ],
+ "size": {
+ "0": 315,
+ "1": 278
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 230
+ },
+ {
+ "name": "ipadapter",
+ "type": "IPADAPTER",
+ "link": 227
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 229
+ },
+ {
+ "name": "image_negative",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "attn_mask",
+ "type": "MASK",
+ "link": null
+ },
+ {
+ "name": "clip_vision",
+ "type": "CLIP_VISION",
+ "link": 228
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 231
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "IPAdapterAdvanced"
+ },
+ "widgets_values": [
+ 0.5,
+ "linear",
+ "concat",
+ 0,
+ 1,
+ "V only"
+ ]
+ }
+ ],
+ "links": [
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ "LATENT"
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ "LATENT"
+ ],
+ [
+ 8,
+ 4,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 19,
+ 8,
+ 0,
+ 15,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 122,
+ 4,
+ 1,
+ 39,
+ 0,
+ "CLIP"
+ ],
+ [
+ 123,
+ 4,
+ 1,
+ 40,
+ 0,
+ "CLIP"
+ ],
+ [
+ 197,
+ 11,
+ 0,
+ 60,
+ 0,
+ "INSTANTID"
+ ],
+ [
+ 198,
+ 38,
+ 0,
+ 60,
+ 1,
+ "FACEANALYSIS"
+ ],
+ [
+ 199,
+ 16,
+ 0,
+ 60,
+ 2,
+ "CONTROL_NET"
+ ],
+ [
+ 200,
+ 60,
+ 1,
+ 3,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 201,
+ 60,
+ 2,
+ 3,
+ 2,
+ "CONDITIONING"
+ ],
+ [
+ 203,
+ 39,
+ 0,
+ 60,
+ 5,
+ "CONDITIONING"
+ ],
+ [
+ 204,
+ 40,
+ 0,
+ 60,
+ 6,
+ "CONDITIONING"
+ ],
+ [
+ 206,
+ 4,
+ 0,
+ 60,
+ 4,
+ "MODEL"
+ ],
+ [
+ 214,
+ 13,
+ 0,
+ 60,
+ 3,
+ "IMAGE"
+ ],
+ [
+ 227,
+ 68,
+ 0,
+ 72,
+ 1,
+ "IPADAPTER"
+ ],
+ [
+ 228,
+ 70,
+ 0,
+ 72,
+ 5,
+ "CLIP_VISION"
+ ],
+ [
+ 229,
+ 71,
+ 0,
+ 72,
+ 2,
+ "IMAGE"
+ ],
+ [
+ 230,
+ 60,
+ 0,
+ 72,
+ 0,
+ "MODEL"
+ ],
+ [
+ 231,
+ 72,
+ 0,
+ 3,
+ 0,
+ "MODEL"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/ComfyUI_InstantID/examples/InstantID_basic.json b/ComfyUI_InstantID/examples/InstantID_basic.json
new file mode 100644
index 0000000000000000000000000000000000000000..b5f0b7334836f501b0c32a92eb4ef003f2134527
--- /dev/null
+++ b/ComfyUI_InstantID/examples/InstantID_basic.json
@@ -0,0 +1,657 @@
+{
+ "last_node_id": 66,
+ "last_link_id": 220,
+ "nodes": [
+ {
+ "id": 11,
+ "type": "InstantIDModelLoader",
+ "pos": [
+ 560,
+ 70
+ ],
+ "size": {
+ "0": 238.72393798828125,
+ "1": 58
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INSTANTID",
+ "type": "INSTANTID",
+ "links": [
+ 197
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDModelLoader"
+ },
+ "widgets_values": [
+ "ip-adapter.bin"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "InstantIDFaceAnalysis",
+ "pos": [
+ 570,
+ 180
+ ],
+ "size": {
+ "0": 227.09793090820312,
+ "1": 58
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "FACEANALYSIS",
+ "type": "FACEANALYSIS",
+ "links": [
+ 198
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDFaceAnalysis"
+ },
+ "widgets_values": [
+ "CPU"
+ ]
+ },
+ {
+ "id": 16,
+ "type": "ControlNetLoader",
+ "pos": [
+ 560,
+ 290
+ ],
+ "size": {
+ "0": 250.07241821289062,
+ "1": 58
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "CONTROL_NET",
+ "type": "CONTROL_NET",
+ "links": [
+ 199
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ControlNetLoader"
+ },
+ "widgets_values": [
+ "instantid/diffusion_pytorch_model.safetensors"
+ ]
+ },
+ {
+ "id": 15,
+ "type": "PreviewImage",
+ "pos": [
+ 1670,
+ 300
+ ],
+ "size": {
+ "0": 584.0855712890625,
+ "1": 610.4592895507812
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 19
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 910,
+ 540
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 2
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 1016,
+ 1016,
+ 1
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1670,
+ 210
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 19
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 60,
+ "type": "ApplyInstantID",
+ "pos": [
+ 910,
+ 210
+ ],
+ "size": {
+ "0": 315,
+ "1": 266
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "instantid",
+ "type": "INSTANTID",
+ "link": 197
+ },
+ {
+ "name": "insightface",
+ "type": "FACEANALYSIS",
+ "link": 198
+ },
+ {
+ "name": "control_net",
+ "type": "CONTROL_NET",
+ "link": 199
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 214
+ },
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 206
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 203
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 204
+ },
+ {
+ "name": "image_kps",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 220
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "POSITIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 200
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "NEGATIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 201
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ApplyInstantID"
+ },
+ "widgets_values": [
+ 0.8,
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 39,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 430
+ ],
+ "size": {
+ "0": 291.9967346191406,
+ "1": 128.62518310546875
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 122
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 203
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "comic character. graphic illustration, comic art, graphic novel art, vibrant, highly detailed"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 620
+ ],
+ "size": {
+ "0": 286.3603515625,
+ "1": 112.35245513916016
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 123
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 204
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "photograph, deformed, glitch, noisy, realistic, stock photo"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 70,
+ 520
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 206
+ ],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 122,
+ 123
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 8
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sdxl/AlbedoBaseXL.safetensors"
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1300,
+ 210
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 220
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 200
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 201
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1631591050,
+ "fixed",
+ 30,
+ 4.5,
+ "ddpm",
+ "karras",
+ 1
+ ]
+ },
+ {
+ "id": 13,
+ "type": "LoadImage",
+ "pos": [
+ 290,
+ 70
+ ],
+ "size": {
+ "0": 210,
+ "1": 314
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 214
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "joseph-gonzalez-iFgRcqHznqg-unsplash.jpg",
+ "image"
+ ]
+ }
+ ],
+ "links": [
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ "LATENT"
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ "LATENT"
+ ],
+ [
+ 8,
+ 4,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 19,
+ 8,
+ 0,
+ 15,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 122,
+ 4,
+ 1,
+ 39,
+ 0,
+ "CLIP"
+ ],
+ [
+ 123,
+ 4,
+ 1,
+ 40,
+ 0,
+ "CLIP"
+ ],
+ [
+ 197,
+ 11,
+ 0,
+ 60,
+ 0,
+ "INSTANTID"
+ ],
+ [
+ 198,
+ 38,
+ 0,
+ 60,
+ 1,
+ "FACEANALYSIS"
+ ],
+ [
+ 199,
+ 16,
+ 0,
+ 60,
+ 2,
+ "CONTROL_NET"
+ ],
+ [
+ 200,
+ 60,
+ 1,
+ 3,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 201,
+ 60,
+ 2,
+ 3,
+ 2,
+ "CONDITIONING"
+ ],
+ [
+ 203,
+ 39,
+ 0,
+ 60,
+ 5,
+ "CONDITIONING"
+ ],
+ [
+ 204,
+ 40,
+ 0,
+ 60,
+ 6,
+ "CONDITIONING"
+ ],
+ [
+ 206,
+ 4,
+ 0,
+ 60,
+ 4,
+ "MODEL"
+ ],
+ [
+ 214,
+ 13,
+ 0,
+ 60,
+ 3,
+ "IMAGE"
+ ],
+ [
+ 220,
+ 60,
+ 0,
+ 3,
+ 0,
+ "MODEL"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/ComfyUI_InstantID/examples/InstantID_depth.json b/ComfyUI_InstantID/examples/InstantID_depth.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0937975a46d376b2efadef12b95bafc17aae558
--- /dev/null
+++ b/ComfyUI_InstantID/examples/InstantID_depth.json
@@ -0,0 +1,881 @@
+{
+ "last_node_id": 78,
+ "last_link_id": 239,
+ "nodes": [
+ {
+ "id": 11,
+ "type": "InstantIDModelLoader",
+ "pos": [
+ 560,
+ 70
+ ],
+ "size": {
+ "0": 238.72393798828125,
+ "1": 58
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INSTANTID",
+ "type": "INSTANTID",
+ "links": [
+ 197
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDModelLoader"
+ },
+ "widgets_values": [
+ "ip-adapter.bin"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "InstantIDFaceAnalysis",
+ "pos": [
+ 570,
+ 180
+ ],
+ "size": {
+ "0": 227.09793090820312,
+ "1": 58
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "FACEANALYSIS",
+ "type": "FACEANALYSIS",
+ "links": [
+ 198
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDFaceAnalysis"
+ },
+ "widgets_values": [
+ "CPU"
+ ]
+ },
+ {
+ "id": 16,
+ "type": "ControlNetLoader",
+ "pos": [
+ 560,
+ 290
+ ],
+ "size": {
+ "0": 250.07241821289062,
+ "1": 58
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "CONTROL_NET",
+ "type": "CONTROL_NET",
+ "links": [
+ 199
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ControlNetLoader"
+ },
+ "widgets_values": [
+ "instantid/diffusion_pytorch_model.safetensors"
+ ]
+ },
+ {
+ "id": 39,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 430
+ ],
+ "size": {
+ "0": 291.9967346191406,
+ "1": 128.62518310546875
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 122
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 203
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "comic character. graphic illustration, comic art, graphic novel art, vibrant, highly detailed"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 620
+ ],
+ "size": {
+ "0": 286.3603515625,
+ "1": 112.35245513916016
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 123
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 204
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "photograph, deformed, glitch, noisy, realistic, stock photo"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 70,
+ 520
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 206
+ ],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 122,
+ 123
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 8
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sdxl/AlbedoBaseXL.safetensors"
+ ]
+ },
+ {
+ "id": 60,
+ "type": "ApplyInstantID",
+ "pos": [
+ 910,
+ 210
+ ],
+ "size": {
+ "0": 315,
+ "1": 266
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "instantid",
+ "type": "INSTANTID",
+ "link": 197
+ },
+ {
+ "name": "insightface",
+ "type": "FACEANALYSIS",
+ "link": 198
+ },
+ {
+ "name": "control_net",
+ "type": "CONTROL_NET",
+ "link": 199
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 214
+ },
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 206
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 203
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 204
+ },
+ {
+ "name": "image_kps",
+ "type": "IMAGE",
+ "link": 236
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 227
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "POSITIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 229
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "NEGATIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 228
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ApplyInstantID"
+ },
+ "widgets_values": [
+ 0.8,
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 15,
+ "type": "PreviewImage",
+ "pos": [
+ 1937,
+ 321
+ ],
+ "size": {
+ "0": 584.0855712890625,
+ "1": 610.4592895507812
+ },
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 19
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1940,
+ 207
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 19
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 910,
+ 540
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 2
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 1016,
+ 1016,
+ 1
+ ]
+ },
+ {
+ "id": 13,
+ "type": "LoadImage",
+ "pos": [
+ 290,
+ 70
+ ],
+ "size": {
+ "0": 210,
+ "1": 314
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 214
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "face4.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 73,
+ "type": "ControlNetLoader",
+ "pos": [
+ 909,
+ 706
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "CONTROL_NET",
+ "type": "CONTROL_NET",
+ "links": [
+ 232
+ ],
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ControlNetLoader"
+ },
+ "widgets_values": [
+ "control-lora/control-lora-depth-rank256.safetensors"
+ ]
+ },
+ {
+ "id": 74,
+ "type": "LoadImage",
+ "pos": [
+ 508,
+ 816
+ ],
+ "size": {
+ "0": 315,
+ "1": 314.0000305175781
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 236,
+ 238
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "666561.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 72,
+ "type": "ControlNetApplyAdvanced",
+ "pos": [
+ 1284,
+ 416
+ ],
+ "size": {
+ "0": 226.8000030517578,
+ "1": 166
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 229
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 228
+ },
+ {
+ "name": "control_net",
+ "type": "CONTROL_NET",
+ "link": 232,
+ "slot_index": 2
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 239
+ }
+ ],
+ "outputs": [
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 230
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 231
+ ],
+ "shape": 3,
+ "slot_index": 1
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ControlNetApplyAdvanced"
+ },
+ "widgets_values": [
+ 0.65,
+ 0,
+ 0.35000000000000003
+ ]
+ },
+ {
+ "id": 77,
+ "type": "Zoe-DepthMapPreprocessor",
+ "pos": [
+ 1009,
+ 839
+ ],
+ "size": [
+ 210,
+ 58
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 238
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 239
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Zoe-DepthMapPreprocessor"
+ },
+ "widgets_values": [
+ 1024
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1570,
+ 210
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 227
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 230
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 231
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1631592172,
+ "fixed",
+ 30,
+ 4.5,
+ "ddpm",
+ "karras",
+ 1
+ ]
+ }
+ ],
+ "links": [
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ "LATENT"
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ "LATENT"
+ ],
+ [
+ 8,
+ 4,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 19,
+ 8,
+ 0,
+ 15,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 122,
+ 4,
+ 1,
+ 39,
+ 0,
+ "CLIP"
+ ],
+ [
+ 123,
+ 4,
+ 1,
+ 40,
+ 0,
+ "CLIP"
+ ],
+ [
+ 197,
+ 11,
+ 0,
+ 60,
+ 0,
+ "INSTANTID"
+ ],
+ [
+ 198,
+ 38,
+ 0,
+ 60,
+ 1,
+ "FACEANALYSIS"
+ ],
+ [
+ 199,
+ 16,
+ 0,
+ 60,
+ 2,
+ "CONTROL_NET"
+ ],
+ [
+ 203,
+ 39,
+ 0,
+ 60,
+ 5,
+ "CONDITIONING"
+ ],
+ [
+ 204,
+ 40,
+ 0,
+ 60,
+ 6,
+ "CONDITIONING"
+ ],
+ [
+ 206,
+ 4,
+ 0,
+ 60,
+ 4,
+ "MODEL"
+ ],
+ [
+ 214,
+ 13,
+ 0,
+ 60,
+ 3,
+ "IMAGE"
+ ],
+ [
+ 227,
+ 60,
+ 0,
+ 3,
+ 0,
+ "MODEL"
+ ],
+ [
+ 228,
+ 60,
+ 2,
+ 72,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 229,
+ 60,
+ 1,
+ 72,
+ 0,
+ "CONDITIONING"
+ ],
+ [
+ 230,
+ 72,
+ 0,
+ 3,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 231,
+ 72,
+ 1,
+ 3,
+ 2,
+ "CONDITIONING"
+ ],
+ [
+ 232,
+ 73,
+ 0,
+ 72,
+ 2,
+ "CONTROL_NET"
+ ],
+ [
+ 236,
+ 74,
+ 0,
+ 60,
+ 7,
+ "IMAGE"
+ ],
+ [
+ 238,
+ 74,
+ 0,
+ 77,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 239,
+ 77,
+ 0,
+ 72,
+ 3,
+ "IMAGE"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/ComfyUI_InstantID/examples/InstantID_multi_id.json b/ComfyUI_InstantID/examples/InstantID_multi_id.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b870baafced0fa94f13b1f1e7efa43c1d2af67a
--- /dev/null
+++ b/ComfyUI_InstantID/examples/InstantID_multi_id.json
@@ -0,0 +1,1364 @@
+{
+ "last_node_id": 92,
+ "last_link_id": 290,
+ "nodes": [
+ {
+ "id": 15,
+ "type": "PreviewImage",
+ "pos": [
+ 2160,
+ -150
+ ],
+ "size": {
+ "0": 584.0855712890625,
+ "1": 610.4592895507812
+ },
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 19
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 2170,
+ -270
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 254
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 19
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 81,
+ "type": "Reroute",
+ "pos": [
+ 1980,
+ 120
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 253
+ }
+ ],
+ "outputs": [
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 254
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": true,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 38,
+ "type": "InstantIDFaceAnalysis",
+ "pos": [
+ -210,
+ -40
+ ],
+ "size": [
+ 210,
+ 60
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "FACEANALYSIS",
+ "type": "FACEANALYSIS",
+ "links": [
+ 198,
+ 239
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDFaceAnalysis"
+ },
+ "widgets_values": [
+ "CPU"
+ ]
+ },
+ {
+ "id": 16,
+ "type": "ControlNetLoader",
+ "pos": [
+ -210,
+ 70
+ ],
+ "size": [
+ 210,
+ 60
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "CONTROL_NET",
+ "type": "CONTROL_NET",
+ "links": [
+ 199,
+ 240
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ControlNetLoader"
+ },
+ "widgets_values": [
+ "instantid/diffusion_pytorch_model.safetensors"
+ ]
+ },
+ {
+ "id": 79,
+ "type": "ConditioningCombine",
+ "pos": [
+ 1410,
+ -190
+ ],
+ "size": [
+ 228.39999389648438,
+ 46
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "conditioning_1",
+ "type": "CONDITIONING",
+ "link": 247
+ },
+ {
+ "name": "conditioning_2",
+ "type": "CONDITIONING",
+ "link": 248
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 249
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ConditioningCombine"
+ }
+ },
+ {
+ "id": 84,
+ "type": "ImageFlip+",
+ "pos": [
+ 990,
+ -210
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 258
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 259
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageFlip+"
+ },
+ "widgets_values": [
+ "x"
+ ]
+ },
+ {
+ "id": 13,
+ "type": "LoadImage",
+ "pos": [
+ 715,
+ 35
+ ],
+ "size": [
+ 213.36950471073226,
+ 296.38119750842566
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 214
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "face4.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 88,
+ "type": "MaskFlip+",
+ "pos": [
+ 990,
+ -110
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 263
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 264
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskFlip+"
+ },
+ "widgets_values": [
+ "x"
+ ]
+ },
+ {
+ "id": 78,
+ "type": "LoadImage",
+ "pos": [
+ 714,
+ -512
+ ],
+ "size": [
+ 210,
+ 314
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 246
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "joseph-gonzalez-iFgRcqHznqg-unsplash.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 85,
+ "type": "SolidMask",
+ "pos": [
+ 970,
+ 510
+ ],
+ "size": [
+ 210,
+ 106
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 260
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SolidMask"
+ },
+ "widgets_values": [
+ 0,
+ 1280,
+ 960
+ ]
+ },
+ {
+ "id": 11,
+ "type": "InstantIDModelLoader",
+ "pos": [
+ -210,
+ -150
+ ],
+ "size": [
+ 210,
+ 60
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INSTANTID",
+ "type": "INSTANTID",
+ "links": [
+ 197,
+ 238
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDModelLoader"
+ },
+ "widgets_values": [
+ "ip-adapter.bin"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -312,
+ 198
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 206
+ ],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 122,
+ 123,
+ 266
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 253
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sdxl/AlbedoBaseXL.safetensors"
+ ]
+ },
+ {
+ "id": 87,
+ "type": "MaskComposite",
+ "pos": [
+ 1232,
+ 583
+ ],
+ "size": [
+ 210,
+ 126
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "destination",
+ "type": "MASK",
+ "link": 260
+ },
+ {
+ "name": "source",
+ "type": "MASK",
+ "link": 261
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 262,
+ 263
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskComposite"
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ "add"
+ ]
+ },
+ {
+ "id": 86,
+ "type": "SolidMask",
+ "pos": [
+ 970,
+ 660
+ ],
+ "size": {
+ "0": 210,
+ "1": 106
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 261
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SolidMask"
+ },
+ "widgets_values": [
+ 1,
+ 640,
+ 960
+ ]
+ },
+ {
+ "id": 82,
+ "type": "LoadImage",
+ "pos": [
+ 591,
+ 511
+ ],
+ "size": [
+ 315,
+ 314.0000190734863
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 257,
+ 258
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "pose (1).jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 146,
+ 487
+ ],
+ "size": {
+ "0": 286.3603515625,
+ "1": 112.35245513916016
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 123
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 204,
+ 278
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "photograph, deformed, glitch, noisy, realistic, stock photo, naked"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 1431,
+ 20
+ ],
+ "size": [
+ 210,
+ 106
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 2
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 1280,
+ 960,
+ 1
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1730,
+ -180
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 256
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 249
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 288
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1631594039,
+ "fixed",
+ 30,
+ 4.5,
+ "ddpm",
+ "normal",
+ 1
+ ]
+ },
+ {
+ "id": 80,
+ "type": "ConditioningCombine",
+ "pos": [
+ 1410,
+ -90
+ ],
+ "size": {
+ "0": 228.39999389648438,
+ "1": 46
+ },
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "conditioning_1",
+ "type": "CONDITIONING",
+ "link": 290
+ },
+ {
+ "name": "conditioning_2",
+ "type": "CONDITIONING",
+ "link": 287
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 288
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ConditioningCombine"
+ }
+ },
+ {
+ "id": 77,
+ "type": "ApplyInstantID",
+ "pos": [
+ 990,
+ -528
+ ],
+ "size": {
+ "0": 315,
+ "1": 266
+ },
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "instantid",
+ "type": "INSTANTID",
+ "link": 238
+ },
+ {
+ "name": "insightface",
+ "type": "FACEANALYSIS",
+ "link": 239
+ },
+ {
+ "name": "control_net",
+ "type": "CONTROL_NET",
+ "link": 240
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 246
+ },
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 255
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 272
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 278
+ },
+ {
+ "name": "image_kps",
+ "type": "IMAGE",
+ "link": 259
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 264
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 256
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "POSITIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 247
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "NEGATIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 290
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ApplyInstantID"
+ },
+ "widgets_values": [
+ 0.8,
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 60,
+ "type": "ApplyInstantID",
+ "pos": [
+ 991,
+ 73
+ ],
+ "size": {
+ "0": 315,
+ "1": 266
+ },
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "instantid",
+ "type": "INSTANTID",
+ "link": 197
+ },
+ {
+ "name": "insightface",
+ "type": "FACEANALYSIS",
+ "link": 198
+ },
+ {
+ "name": "control_net",
+ "type": "CONTROL_NET",
+ "link": 199
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 214
+ },
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 206
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 203
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 204
+ },
+ {
+ "name": "image_kps",
+ "type": "IMAGE",
+ "link": 257
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 262
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 255
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "POSITIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 248
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "NEGATIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 287
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ApplyInstantID"
+ },
+ "widgets_values": [
+ 0.9,
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 89,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 314,
+ -421
+ ],
+ "size": {
+ "0": 291.9967346191406,
+ "1": 128.62518310546875
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 266
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 272
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "comic male character. graphic illustration, comic art, graphic novel art, vibrant, highly detailed. New York background"
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 39,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 309,
+ 171
+ ],
+ "size": {
+ "0": 291.9967346191406,
+ "1": 128.62518310546875
+ },
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 122
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 203
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "comic female character. graphic illustration, comic art, graphic novel art, vibrant, highly detailed. New York background"
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ }
+ ],
+ "links": [
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ "LATENT"
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ "LATENT"
+ ],
+ [
+ 19,
+ 8,
+ 0,
+ 15,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 122,
+ 4,
+ 1,
+ 39,
+ 0,
+ "CLIP"
+ ],
+ [
+ 123,
+ 4,
+ 1,
+ 40,
+ 0,
+ "CLIP"
+ ],
+ [
+ 197,
+ 11,
+ 0,
+ 60,
+ 0,
+ "INSTANTID"
+ ],
+ [
+ 198,
+ 38,
+ 0,
+ 60,
+ 1,
+ "FACEANALYSIS"
+ ],
+ [
+ 199,
+ 16,
+ 0,
+ 60,
+ 2,
+ "CONTROL_NET"
+ ],
+ [
+ 203,
+ 39,
+ 0,
+ 60,
+ 5,
+ "CONDITIONING"
+ ],
+ [
+ 204,
+ 40,
+ 0,
+ 60,
+ 6,
+ "CONDITIONING"
+ ],
+ [
+ 206,
+ 4,
+ 0,
+ 60,
+ 4,
+ "MODEL"
+ ],
+ [
+ 214,
+ 13,
+ 0,
+ 60,
+ 3,
+ "IMAGE"
+ ],
+ [
+ 238,
+ 11,
+ 0,
+ 77,
+ 0,
+ "INSTANTID"
+ ],
+ [
+ 239,
+ 38,
+ 0,
+ 77,
+ 1,
+ "FACEANALYSIS"
+ ],
+ [
+ 240,
+ 16,
+ 0,
+ 77,
+ 2,
+ "CONTROL_NET"
+ ],
+ [
+ 246,
+ 78,
+ 0,
+ 77,
+ 3,
+ "IMAGE"
+ ],
+ [
+ 247,
+ 77,
+ 1,
+ 79,
+ 0,
+ "CONDITIONING"
+ ],
+ [
+ 248,
+ 60,
+ 1,
+ 79,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 249,
+ 79,
+ 0,
+ 3,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 253,
+ 4,
+ 2,
+ 81,
+ 0,
+ "*"
+ ],
+ [
+ 254,
+ 81,
+ 0,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 255,
+ 60,
+ 0,
+ 77,
+ 4,
+ "MODEL"
+ ],
+ [
+ 256,
+ 77,
+ 0,
+ 3,
+ 0,
+ "MODEL"
+ ],
+ [
+ 257,
+ 82,
+ 0,
+ 60,
+ 7,
+ "IMAGE"
+ ],
+ [
+ 258,
+ 82,
+ 0,
+ 84,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 259,
+ 84,
+ 0,
+ 77,
+ 7,
+ "IMAGE"
+ ],
+ [
+ 260,
+ 85,
+ 0,
+ 87,
+ 0,
+ "MASK"
+ ],
+ [
+ 261,
+ 86,
+ 0,
+ 87,
+ 1,
+ "MASK"
+ ],
+ [
+ 262,
+ 87,
+ 0,
+ 60,
+ 8,
+ "MASK"
+ ],
+ [
+ 263,
+ 87,
+ 0,
+ 88,
+ 0,
+ "MASK"
+ ],
+ [
+ 264,
+ 88,
+ 0,
+ 77,
+ 8,
+ "MASK"
+ ],
+ [
+ 266,
+ 4,
+ 1,
+ 89,
+ 0,
+ "CLIP"
+ ],
+ [
+ 272,
+ 89,
+ 0,
+ 77,
+ 5,
+ "CONDITIONING"
+ ],
+ [
+ 278,
+ 40,
+ 0,
+ 77,
+ 6,
+ "CONDITIONING"
+ ],
+ [
+ 287,
+ 60,
+ 2,
+ 80,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 288,
+ 80,
+ 0,
+ 3,
+ 2,
+ "CONDITIONING"
+ ],
+ [
+ 290,
+ 77,
+ 2,
+ 80,
+ 0,
+ "CONDITIONING"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/ComfyUI_InstantID/examples/InstantID_posed.json b/ComfyUI_InstantID/examples/InstantID_posed.json
new file mode 100644
index 0000000000000000000000000000000000000000..e060254b7b820da9df9a8e04d44d09e678e78a34
--- /dev/null
+++ b/ComfyUI_InstantID/examples/InstantID_posed.json
@@ -0,0 +1,704 @@
+{
+ "last_node_id": 67,
+ "last_link_id": 221,
+ "nodes": [
+ {
+ "id": 11,
+ "type": "InstantIDModelLoader",
+ "pos": [
+ 560,
+ 70
+ ],
+ "size": {
+ "0": 238.72393798828125,
+ "1": 58
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INSTANTID",
+ "type": "INSTANTID",
+ "links": [
+ 197
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDModelLoader"
+ },
+ "widgets_values": [
+ "ip-adapter.bin"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "InstantIDFaceAnalysis",
+ "pos": [
+ 570,
+ 180
+ ],
+ "size": {
+ "0": 227.09793090820312,
+ "1": 58
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "FACEANALYSIS",
+ "type": "FACEANALYSIS",
+ "links": [
+ 198
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InstantIDFaceAnalysis"
+ },
+ "widgets_values": [
+ "CPU"
+ ]
+ },
+ {
+ "id": 16,
+ "type": "ControlNetLoader",
+ "pos": [
+ 560,
+ 290
+ ],
+ "size": {
+ "0": 250.07241821289062,
+ "1": 58
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "CONTROL_NET",
+ "type": "CONTROL_NET",
+ "links": [
+ 199
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ControlNetLoader"
+ },
+ "widgets_values": [
+ "instantid/diffusion_pytorch_model.safetensors"
+ ]
+ },
+ {
+ "id": 15,
+ "type": "PreviewImage",
+ "pos": [
+ 1670,
+ 300
+ ],
+ "size": {
+ "0": 584.0855712890625,
+ "1": 610.4592895507812
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 19
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 910,
+ 540
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 2
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 1016,
+ 1016,
+ 1
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1670,
+ 210
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 19
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 60,
+ "type": "ApplyInstantID",
+ "pos": [
+ 910,
+ 210
+ ],
+ "size": {
+ "0": 315,
+ "1": 266
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "instantid",
+ "type": "INSTANTID",
+ "link": 197
+ },
+ {
+ "name": "insightface",
+ "type": "FACEANALYSIS",
+ "link": 198
+ },
+ {
+ "name": "control_net",
+ "type": "CONTROL_NET",
+ "link": 199
+ },
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 214
+ },
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 206
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 203
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 204
+ },
+ {
+ "name": "image_kps",
+ "type": "IMAGE",
+ "link": 221
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 220
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "POSITIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 200
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "NEGATIVE",
+ "type": "CONDITIONING",
+ "links": [
+ 201
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ApplyInstantID"
+ },
+ "widgets_values": [
+ 0.8,
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 39,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 430
+ ],
+ "size": {
+ "0": 291.9967346191406,
+ "1": 128.62518310546875
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 122
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 203
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "comic character. graphic illustration, comic art, graphic novel art, vibrant, highly detailed"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 520,
+ 620
+ ],
+ "size": {
+ "0": 286.3603515625,
+ "1": 112.35245513916016
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 123
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 204
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "photograph, deformed, glitch, noisy, realistic, stock photo"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 70,
+ 520
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 206
+ ],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 122,
+ 123
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 8
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sdxl/AlbedoBaseXL.safetensors"
+ ]
+ },
+ {
+ "id": 13,
+ "type": "LoadImage",
+ "pos": [
+ 290,
+ 70
+ ],
+ "size": {
+ "0": 210,
+ "1": 314
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 214
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "joseph-gonzalez-iFgRcqHznqg-unsplash.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 67,
+ "type": "LoadImage",
+ "pos": [
+ 592,
+ 781
+ ],
+ "size": {
+ "0": 210,
+ "1": 314
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 221
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "miranda.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1300,
+ 210
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 220
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 200
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 201
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1631591431,
+ "fixed",
+ 30,
+ 4.5,
+ "ddpm",
+ "karras",
+ 1
+ ]
+ }
+ ],
+ "links": [
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ "LATENT"
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ "LATENT"
+ ],
+ [
+ 8,
+ 4,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 19,
+ 8,
+ 0,
+ 15,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 122,
+ 4,
+ 1,
+ 39,
+ 0,
+ "CLIP"
+ ],
+ [
+ 123,
+ 4,
+ 1,
+ 40,
+ 0,
+ "CLIP"
+ ],
+ [
+ 197,
+ 11,
+ 0,
+ 60,
+ 0,
+ "INSTANTID"
+ ],
+ [
+ 198,
+ 38,
+ 0,
+ 60,
+ 1,
+ "FACEANALYSIS"
+ ],
+ [
+ 199,
+ 16,
+ 0,
+ 60,
+ 2,
+ "CONTROL_NET"
+ ],
+ [
+ 200,
+ 60,
+ 1,
+ 3,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 201,
+ 60,
+ 2,
+ 3,
+ 2,
+ "CONDITIONING"
+ ],
+ [
+ 203,
+ 39,
+ 0,
+ 60,
+ 5,
+ "CONDITIONING"
+ ],
+ [
+ 204,
+ 40,
+ 0,
+ 60,
+ 6,
+ "CONDITIONING"
+ ],
+ [
+ 206,
+ 4,
+ 0,
+ 60,
+ 4,
+ "MODEL"
+ ],
+ [
+ 214,
+ 13,
+ 0,
+ 60,
+ 3,
+ "IMAGE"
+ ],
+ [
+ 220,
+ 60,
+ 0,
+ 3,
+ 0,
+ "MODEL"
+ ],
+ [
+ 221,
+ 67,
+ 0,
+ 60,
+ 7,
+ "IMAGE"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/ComfyUI_InstantID/examples/daydreaming.jpg b/ComfyUI_InstantID/examples/daydreaming.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ab76903e79691c9b9730a0ac24fae694e72a3f34
Binary files /dev/null and b/ComfyUI_InstantID/examples/daydreaming.jpg differ
diff --git a/ComfyUI_InstantID/examples/instant_id_ipadapter.jpg b/ComfyUI_InstantID/examples/instant_id_ipadapter.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..583eb0d4049955cff0923e9d2d35d75a76708449
Binary files /dev/null and b/ComfyUI_InstantID/examples/instant_id_ipadapter.jpg differ
diff --git a/ComfyUI_InstantID/examples/instantid_basic_workflow.jpg b/ComfyUI_InstantID/examples/instantid_basic_workflow.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c21bcb93cbe479821e8b50a3a8345864b4957e56
Binary files /dev/null and b/ComfyUI_InstantID/examples/instantid_basic_workflow.jpg differ
diff --git a/ComfyUI_InstantID/examples/instantid_multi_id.jpg b/ComfyUI_InstantID/examples/instantid_multi_id.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3326c88cd406310bbfc031584f8c3b0d2729857f
Binary files /dev/null and b/ComfyUI_InstantID/examples/instantid_multi_id.jpg differ
diff --git a/ComfyUI_InstantID/pyproject.toml b/ComfyUI_InstantID/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..97ed0596ec568a3523439d2a90814b5a122ce728
--- /dev/null
+++ b/ComfyUI_InstantID/pyproject.toml
@@ -0,0 +1,15 @@
+[project]
+name = "comfyui_instantid"
+description = "Native InstantID support for ComfyUI. This extension differs from the many already available as it doesn't use diffusers but instead implements InstantID natively and it fully integrates with ComfyUI."
+version = "1.0.0"
+license = "LICENSE"
+dependencies = ["insightface", "onnxruntime", "onnxruntime-gpu"]
+
+[project.urls]
+Repository = "https://github.com/cubiq/ComfyUI_InstantID"
+# Used by Comfy Registry https://comfyregistry.org
+
+[tool.comfy]
+PublisherId = "matteo"
+DisplayName = "ComfyUI_InstantID"
+Icon = ""
diff --git a/ComfyUI_InstantID/requirements.txt b/ComfyUI_InstantID/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be801c35bf66d094d40392b0bbdfa8bfed9f332b
--- /dev/null
+++ b/ComfyUI_InstantID/requirements.txt
@@ -0,0 +1,3 @@
+insightface
+onnxruntime
+onnxruntime-gpu; sys_platform != 'darwin' and platform_machine == 'x86_64'
diff --git a/ComfyUI_InstantID/resampler.py b/ComfyUI_InstantID/resampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b1e84d81b9a52d0180a987585ae6c39ec38c0ba
--- /dev/null
+++ b/ComfyUI_InstantID/resampler.py
@@ -0,0 +1,121 @@
+# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
+import math
+
+import torch
+import torch.nn as nn
+
+
+# FFN
+def FeedForward(dim, mult=4):
+ inner_dim = int(dim * mult)
+ return nn.Sequential(
+ nn.LayerNorm(dim),
+ nn.Linear(dim, inner_dim, bias=False),
+ nn.GELU(),
+ nn.Linear(inner_dim, dim, bias=False),
+ )
+
+
+def reshape_tensor(x, heads):
+ bs, length, width = x.shape
+ #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
+ x = x.view(bs, length, heads, -1)
+ # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+ x = x.transpose(1, 2)
+    # note: this reshape keeps the layout (bs, n_heads, length, dim_per_head); heads are not merged
+ x = x.reshape(bs, heads, length, -1)
+ return x
+
+
+class PerceiverAttention(nn.Module):
+ def __init__(self, *, dim, dim_head=64, heads=8):
+ super().__init__()
+ self.scale = dim_head**-0.5
+ self.dim_head = dim_head
+ self.heads = heads
+ inner_dim = dim_head * heads
+
+ self.norm1 = nn.LayerNorm(dim)
+ self.norm2 = nn.LayerNorm(dim)
+
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+
+ def forward(self, x, latents):
+ """
+ Args:
+ x (torch.Tensor): image features
+ shape (b, n1, D)
+ latent (torch.Tensor): latent features
+ shape (b, n2, D)
+ """
+ x = self.norm1(x)
+ latents = self.norm2(latents)
+
+ b, l, _ = latents.shape
+
+ q = self.to_q(latents)
+ kv_input = torch.cat((x, latents), dim=-2)
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+ q = reshape_tensor(q, self.heads)
+ k = reshape_tensor(k, self.heads)
+ v = reshape_tensor(v, self.heads)
+
+ # attention
+ scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+ weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+ out = weight @ v
+
+ out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+ return self.to_out(out)
+
+
+class Resampler(nn.Module):
+ def __init__(
+ self,
+ dim=1024,
+ depth=8,
+ dim_head=64,
+ heads=16,
+ num_queries=8,
+ embedding_dim=768,
+ output_dim=1024,
+ ff_mult=4,
+ ):
+ super().__init__()
+
+ self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+
+ self.proj_in = nn.Linear(embedding_dim, dim)
+
+ self.proj_out = nn.Linear(dim, output_dim)
+ self.norm_out = nn.LayerNorm(output_dim)
+
+ self.layers = nn.ModuleList([])
+ for _ in range(depth):
+ self.layers.append(
+ nn.ModuleList(
+ [
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+ FeedForward(dim=dim, mult=ff_mult),
+ ]
+ )
+ )
+
+ def forward(self, x):
+
+ latents = self.latents.repeat(x.size(0), 1, 1)
+
+ x = self.proj_in(x)
+
+ for attn, ff in self.layers:
+ latents = attn(x, latents) + latents
+ latents = ff(latents) + latents
+
+ latents = self.proj_out(latents)
+ return self.norm_out(latents)
\ No newline at end of file
diff --git a/ComfyUI_InstantID/utils.py b/ComfyUI_InstantID/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5bd25db984c1bf86f96de0a5d93a8c71f8b8e40
--- /dev/null
+++ b/ComfyUI_InstantID/utils.py
@@ -0,0 +1,24 @@
+import torch
+
+def tensor_to_size(source, dest_size):
+ if isinstance(dest_size, torch.Tensor):
+ dest_size = dest_size.shape[0]
+ source_size = source.shape[0]
+
+ if source_size < dest_size:
+ shape = [dest_size - source_size] + [1]*(source.dim()-1)
+ source = torch.cat((source, source[-1:].repeat(shape)), dim=0)
+ elif source_size > dest_size:
+ source = source[:dest_size]
+
+ return source
+
+def tensor_to_image(tensor):
+    # scale 0..1 floats to 0..255 uint8 and reverse the channel order (RGB <-> BGR)
+    image = tensor.mul(255).clamp(0, 255).byte().cpu()
+    image = image[..., [2, 1, 0]].numpy()
+    return image
+
+def image_to_tensor(image):
+    # inverse of tensor_to_image: back to 0..1 floats with the channel order reversed again
+    tensor = torch.clamp(torch.from_numpy(image).float() / 255., 0, 1)
+    tensor = tensor[..., [2, 1, 0]]
+    return tensor
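+
+# A minimal round-trip sketch (illustrative only; it assumes the ComfyUI convention of RGB
+# float tensors shaped [..., H, W, C] with values in 0..1):
+if __name__ == "__main__":
+    rgb = torch.rand(512, 512, 3)    # hypothetical RGB input
+    bgr = tensor_to_image(rgb)       # uint8 numpy array with the channel order reversed
+    restored = image_to_tensor(bgr)  # back to a 0..1 float tensor in the original order
+    print(bgr.dtype, restored.shape)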
diff --git a/ComfyUI_essentials/.github/workflows/publish.yml b/ComfyUI_essentials/.github/workflows/publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ed74f76326434cd00098ec30d1d74c4c0a6e0533
--- /dev/null
+++ b/ComfyUI_essentials/.github/workflows/publish.yml
@@ -0,0 +1,22 @@
+name: Publish to Comfy registry
+on:
+ workflow_dispatch:
+ push:
+ branches:
+ - main
+ - master
+ paths:
+ - "pyproject.toml"
+
+jobs:
+ publish-node:
+ name: Publish Custom Node to registry
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out code
+ uses: actions/checkout@v4
+ - name: Publish Custom Node
+ uses: Comfy-Org/publish-node-action@main
+ with:
+ ## Add your own personal access token to your Github Repository secrets and reference it here.
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
diff --git a/ComfyUI_essentials/.gitignore b/ComfyUI_essentials/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cfa8e47419e74febb8473096c3f70d20007d9606
--- /dev/null
+++ b/ComfyUI_essentials/.gitignore
@@ -0,0 +1,6 @@
+/__pycache__/
+/luts/*.cube
+/luts/*.CUBE
+/fonts/*.ttf
+/fonts/*.otf
+!/fonts/ShareTechMono-Regular.ttf
\ No newline at end of file
diff --git a/ComfyUI_essentials/LICENSE b/ComfyUI_essentials/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..2beba248f4c5767434b76b9e0b2f9d63eb073623
--- /dev/null
+++ b/ComfyUI_essentials/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Matteo Spinelli
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/ComfyUI_essentials/README.md b/ComfyUI_essentials/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..056f999eeb15a3ef1dabf2cf1e2aa0da411c9fdb
--- /dev/null
+++ b/ComfyUI_essentials/README.md
@@ -0,0 +1,49 @@
+# :wrench: ComfyUI Essentials
+
+Essential nodes that are weirdly missing from ComfyUI core. With few exceptions, they are new features rather than commodity re-implementations of existing nodes. I hope this will be just a temporary repository until the nodes are included in ComfyUI core.
+
+# Sponsorship
+
+
+
+**[:heart: Github Sponsor](https://github.com/sponsors/cubiq) | [:coin: Paypal](https://paypal.me/matt3o)**
+
+
+
+If you like my work and wish to see updates and new features please consider sponsoring my projects.
+
+- [ComfyUI IPAdapter Plus](https://github.com/cubiq/ComfyUI_IPAdapter_plus)
+- [ComfyUI InstantID (Native)](https://github.com/cubiq/ComfyUI_InstantID)
+- [ComfyUI Essentials](https://github.com/cubiq/ComfyUI_essentials)
+- [ComfyUI FaceAnalysis](https://github.com/cubiq/ComfyUI_FaceAnalysis)
+
+Not to mention the documentation and video tutorials. See, for example, my **ComfyUI Advanced Understanding** videos on YouTube: [part 1](https://www.youtube.com/watch?v=_C7kR2TFIX0) and [part 2](https://www.youtube.com/watch?v=ijqXnW_9gzc).
+
+The only way to keep the code open and free is by sponsoring its development. The more sponsorships, the more time I can dedicate to my open-source projects.
+
+Please consider a [Github Sponsorship](https://github.com/sponsors/cubiq) or [PayPal donation](https://paypal.me/matt3o) (Matteo "matt3o" Spinelli). For sponsorships of $50+, let me know if you'd like to be mentioned in this readme file; you can find me on [Discord](https://latent.vision/discord) or at _matt3o :snail: gmail.com_.
+
+## Current sponsors
+
+It's only thanks to generous sponsors that **the whole community** can enjoy open and free software. Please join me in thanking the following companies and individuals!
+
+### :trophy: Gold sponsors
+
+[![Kaiber.ai](https://f.latent.vision/imgs/kaiber.png)](https://kaiber.ai/) [![InstaSD](https://f.latent.vision/imgs/instasd.png)](https://www.instasd.com/)
+
+### :tada: Silver sponsors
+
+[![OpenArt.ai](https://f.latent.vision/imgs/openart.png?r=1)](https://openart.ai/workflows) [![Finetuners](https://f.latent.vision/imgs/finetuners.png)](https://www.finetuners.ai/) [![Comfy.ICU](https://f.latent.vision/imgs/comfyicu.png?r=1)](https://comfy.icu/)
+
+### Other companies supporting my projects
+
+- [RunComfy](https://www.runcomfy.com/) (ComfyUI Cloud)
+
+### Esteemed individuals
+
+- [Øystein Ø. Olsen](https://github.com/FireNeslo)
+- [Jack Gane](https://github.com/ganeJackS)
+- [Nathan Shipley](https://www.nathanshipley.com/)
+- [Dkdnzia](https://github.com/Dkdnzia)
+
+[And all my public and private sponsors!](https://github.com/sponsors/cubiq)
\ No newline at end of file
diff --git a/ComfyUI_essentials/__init__.py b/ComfyUI_essentials/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..af0658b4e7f8ed43d62cb3a4d81052e821bb7e06
--- /dev/null
+++ b/ComfyUI_essentials/__init__.py
@@ -0,0 +1,36 @@
+#from .essentials import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+from .image import IMAGE_CLASS_MAPPINGS, IMAGE_NAME_MAPPINGS
+from .mask import MASK_CLASS_MAPPINGS, MASK_NAME_MAPPINGS
+from .sampling import SAMPLING_CLASS_MAPPINGS, SAMPLING_NAME_MAPPINGS
+from .segmentation import SEG_CLASS_MAPPINGS, SEG_NAME_MAPPINGS
+from .misc import MISC_CLASS_MAPPINGS, MISC_NAME_MAPPINGS
+from .conditioning import COND_CLASS_MAPPINGS, COND_NAME_MAPPINGS
+from .text import TEXT_CLASS_MAPPINGS, TEXT_NAME_MAPPINGS
+
+WEB_DIRECTORY = "./js"
+
+NODE_CLASS_MAPPINGS = {}
+NODE_DISPLAY_NAME_MAPPINGS = {}
+
+NODE_CLASS_MAPPINGS.update(COND_CLASS_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(COND_NAME_MAPPINGS)
+
+NODE_CLASS_MAPPINGS.update(IMAGE_CLASS_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(IMAGE_NAME_MAPPINGS)
+
+NODE_CLASS_MAPPINGS.update(MASK_CLASS_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(MASK_NAME_MAPPINGS)
+
+NODE_CLASS_MAPPINGS.update(SAMPLING_CLASS_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(SAMPLING_NAME_MAPPINGS)
+
+NODE_CLASS_MAPPINGS.update(SEG_CLASS_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(SEG_NAME_MAPPINGS)
+
+NODE_CLASS_MAPPINGS.update(TEXT_CLASS_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(TEXT_NAME_MAPPINGS)
+
+NODE_CLASS_MAPPINGS.update(MISC_CLASS_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(MISC_NAME_MAPPINGS)
+
+__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS', "WEB_DIRECTORY"]
diff --git a/ComfyUI_essentials/__pycache__/__init__.cpython-312.pyc b/ComfyUI_essentials/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2bcd08874ee92c4d23112498b8ae223aad11161
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/__init__.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/conditioning.cpython-312.pyc b/ComfyUI_essentials/__pycache__/conditioning.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ba4d4cdc1438c1e93b35bd68f085bbffa0d72df
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/conditioning.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/image.cpython-312.pyc b/ComfyUI_essentials/__pycache__/image.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71d200288e397912a86a27ff5c577d9ad3fd1b32
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/image.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/mask.cpython-312.pyc b/ComfyUI_essentials/__pycache__/mask.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..397b2b51852fe111084d1e9047eb7d1ec03666ae
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/mask.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/misc.cpython-312.pyc b/ComfyUI_essentials/__pycache__/misc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c4f3c402becb9a0a99a94e60d0ffdccd570884b
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/misc.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/sampling.cpython-312.pyc b/ComfyUI_essentials/__pycache__/sampling.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74f7133b627a80ec9a07574f6c1baeb69ecbef91
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/sampling.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/segmentation.cpython-312.pyc b/ComfyUI_essentials/__pycache__/segmentation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9be7c39820faba7adb6ea9dd7ba3a2eae324e53d
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/segmentation.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/text.cpython-312.pyc b/ComfyUI_essentials/__pycache__/text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef0ee384ea681d2c77ed1d1609621d1095316664
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/text.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/__pycache__/utils.cpython-312.pyc b/ComfyUI_essentials/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd68b24711027462708061098a8590c8f04be687
Binary files /dev/null and b/ComfyUI_essentials/__pycache__/utils.cpython-312.pyc differ
diff --git a/ComfyUI_essentials/carve.py b/ComfyUI_essentials/carve.py
new file mode 100644
index 0000000000000000000000000000000000000000..ead61a0661cc608f420ab74753d6da9f53461d06
--- /dev/null
+++ b/ComfyUI_essentials/carve.py
@@ -0,0 +1,454 @@
+# MIT licensed code from https://github.com/li-plus/seam-carving/
+
+from enum import Enum
+from typing import Optional, Tuple
+
+import numba as nb
+import numpy as np
+from scipy.ndimage import sobel
+
+DROP_MASK_ENERGY = 1e5
+KEEP_MASK_ENERGY = 1e3
+
+
+class OrderMode(str, Enum):
+ WIDTH_FIRST = "width-first"
+ HEIGHT_FIRST = "height-first"
+
+
+class EnergyMode(str, Enum):
+ FORWARD = "forward"
+ BACKWARD = "backward"
+
+
+def _list_enum(enum_class) -> Tuple:
+ return tuple(x.value for x in enum_class)
+
+
+def _rgb2gray(rgb: np.ndarray) -> np.ndarray:
+ """Convert an RGB image to a grayscale image"""
+ coeffs = np.array([0.2125, 0.7154, 0.0721], dtype=np.float32)
+ return (rgb @ coeffs).astype(rgb.dtype)
+
+
+def _get_seam_mask(src: np.ndarray, seam: np.ndarray) -> np.ndarray:
+ """Convert a list of seam column indices to a mask"""
+ return np.eye(src.shape[1], dtype=bool)[seam]
+
+
+def _remove_seam_mask(src: np.ndarray, seam_mask: np.ndarray) -> np.ndarray:
+ """Remove a seam from the source image according to the given seam_mask"""
+ if src.ndim == 3:
+ h, w, c = src.shape
+ seam_mask = np.broadcast_to(seam_mask[:, :, None], src.shape)
+ dst = src[~seam_mask].reshape((h, w - 1, c))
+ else:
+ h, w = src.shape
+ dst = src[~seam_mask].reshape((h, w - 1))
+ return dst
+
+
+def _get_energy(gray: np.ndarray) -> np.ndarray:
+ """Get backward energy map from the source image"""
+ assert gray.ndim == 2
+
+ gray = gray.astype(np.float32)
+ grad_x = sobel(gray, axis=1)
+ grad_y = sobel(gray, axis=0)
+ energy = np.abs(grad_x) + np.abs(grad_y)
+ return energy
+
+
+@nb.njit(nb.int32[:](nb.float32[:, :]), cache=True)
+def _get_backward_seam(energy: np.ndarray) -> np.ndarray:
+ """Compute the minimum vertical seam from the backward energy map"""
+ h, w = energy.shape
+ inf = np.array([np.inf], dtype=np.float32)
+ cost = np.concatenate((inf, energy[0], inf))
+ parent = np.empty((h, w), dtype=np.int32)
+ base_idx = np.arange(-1, w - 1, dtype=np.int32)
+
+ for r in range(1, h):
+ choices = np.vstack((cost[:-2], cost[1:-1], cost[2:]))
+ min_idx = np.argmin(choices, axis=0) + base_idx
+ parent[r] = min_idx
+ cost[1:-1] = cost[1:-1][min_idx] + energy[r]
+
+ c = np.argmin(cost[1:-1])
+ seam = np.empty(h, dtype=np.int32)
+ for r in range(h - 1, -1, -1):
+ seam[r] = c
+ c = parent[r, c]
+
+ return seam
+
+
+def _get_backward_seams(
+ gray: np.ndarray, num_seams: int, aux_energy: Optional[np.ndarray]
+) -> np.ndarray:
+ """Compute the minimum N vertical seams using backward energy"""
+ h, w = gray.shape
+ seams = np.zeros((h, w), dtype=bool)
+ rows = np.arange(h, dtype=np.int32)
+ idx_map = np.broadcast_to(np.arange(w, dtype=np.int32), (h, w))
+ energy = _get_energy(gray)
+ if aux_energy is not None:
+ energy += aux_energy
+ for _ in range(num_seams):
+ seam = _get_backward_seam(energy)
+ seams[rows, idx_map[rows, seam]] = True
+
+ seam_mask = _get_seam_mask(gray, seam)
+ gray = _remove_seam_mask(gray, seam_mask)
+ idx_map = _remove_seam_mask(idx_map, seam_mask)
+ if aux_energy is not None:
+ aux_energy = _remove_seam_mask(aux_energy, seam_mask)
+
+ # Only need to re-compute the energy in the bounding box of the seam
+ _, cur_w = energy.shape
+ lo = max(0, np.min(seam) - 1)
+ hi = min(cur_w, np.max(seam) + 1)
+ pad_lo = 1 if lo > 0 else 0
+ pad_hi = 1 if hi < cur_w - 1 else 0
+ mid_block = gray[:, lo - pad_lo : hi + pad_hi]
+ _, mid_w = mid_block.shape
+ mid_energy = _get_energy(mid_block)[:, pad_lo : mid_w - pad_hi]
+ if aux_energy is not None:
+ mid_energy += aux_energy[:, lo:hi]
+ energy = np.hstack((energy[:, :lo], mid_energy, energy[:, hi + 1 :]))
+
+ return seams
+
+
+@nb.njit(
+ [
+ nb.int32[:](nb.float32[:, :], nb.none),
+ nb.int32[:](nb.float32[:, :], nb.float32[:, :]),
+ ],
+ cache=True,
+)
+def _get_forward_seam(gray: np.ndarray, aux_energy: Optional[np.ndarray]) -> np.ndarray:
+ """Compute the minimum vertical seam using forward energy"""
+ h, w = gray.shape
+
+ gray = np.hstack((gray[:, :1], gray, gray[:, -1:]))
+
+ inf = np.array([np.inf], dtype=np.float32)
+ dp = np.concatenate((inf, np.abs(gray[0, 2:] - gray[0, :-2]), inf))
+
+ parent = np.empty((h, w), dtype=np.int32)
+ base_idx = np.arange(-1, w - 1, dtype=np.int32)
+
+ inf = np.array([np.inf], dtype=np.float32)
+ for r in range(1, h):
+ curr_shl = gray[r, 2:]
+ curr_shr = gray[r, :-2]
+ cost_mid = np.abs(curr_shl - curr_shr)
+ if aux_energy is not None:
+ cost_mid += aux_energy[r]
+
+ prev_mid = gray[r - 1, 1:-1]
+ cost_left = cost_mid + np.abs(prev_mid - curr_shr)
+ cost_right = cost_mid + np.abs(prev_mid - curr_shl)
+
+ dp_mid = dp[1:-1]
+ dp_left = dp[:-2]
+ dp_right = dp[2:]
+
+ choices = np.vstack(
+ (cost_left + dp_left, cost_mid + dp_mid, cost_right + dp_right)
+ )
+ min_idx = np.argmin(choices, axis=0)
+ parent[r] = min_idx + base_idx
+ # numba does not support specifying axis in np.min, below loop is equivalent to:
+ # `dp_mid[:] = np.min(choices, axis=0)` or `dp_mid[:] = choices[min_idx, np.arange(w)]`
+ for j, i in enumerate(min_idx):
+ dp_mid[j] = choices[i, j]
+
+ c = np.argmin(dp[1:-1])
+ seam = np.empty(h, dtype=np.int32)
+ for r in range(h - 1, -1, -1):
+ seam[r] = c
+ c = parent[r, c]
+
+ return seam
+
+
+def _get_forward_seams(
+ gray: np.ndarray, num_seams: int, aux_energy: Optional[np.ndarray]
+) -> np.ndarray:
+ """Compute minimum N vertical seams using forward energy"""
+ h, w = gray.shape
+ seams = np.zeros((h, w), dtype=bool)
+ rows = np.arange(h, dtype=np.int32)
+ idx_map = np.broadcast_to(np.arange(w, dtype=np.int32), (h, w))
+ for _ in range(num_seams):
+ seam = _get_forward_seam(gray, aux_energy)
+ seams[rows, idx_map[rows, seam]] = True
+ seam_mask = _get_seam_mask(gray, seam)
+ gray = _remove_seam_mask(gray, seam_mask)
+ idx_map = _remove_seam_mask(idx_map, seam_mask)
+ if aux_energy is not None:
+ aux_energy = _remove_seam_mask(aux_energy, seam_mask)
+
+ return seams
+
+
+def _get_seams(
+ gray: np.ndarray, num_seams: int, energy_mode: str, aux_energy: Optional[np.ndarray]
+) -> np.ndarray:
+ """Get the minimum N seams from the grayscale image"""
+ gray = np.asarray(gray, dtype=np.float32)
+ if energy_mode == EnergyMode.BACKWARD:
+ return _get_backward_seams(gray, num_seams, aux_energy)
+ elif energy_mode == EnergyMode.FORWARD:
+ return _get_forward_seams(gray, num_seams, aux_energy)
+ else:
+ raise ValueError(
+ f"expect energy_mode to be one of {_list_enum(EnergyMode)}, got {energy_mode}"
+ )
+
+
+def _reduce_width(
+ src: np.ndarray,
+ delta_width: int,
+ energy_mode: str,
+ aux_energy: Optional[np.ndarray],
+) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+ """Reduce the width of image by delta_width pixels"""
+ assert src.ndim in (2, 3) and delta_width >= 0
+ if src.ndim == 2:
+ gray = src
+ src_h, src_w = src.shape
+ dst_shape: Tuple[int, ...] = (src_h, src_w - delta_width)
+ else:
+ gray = _rgb2gray(src)
+ src_h, src_w, src_c = src.shape
+ dst_shape = (src_h, src_w - delta_width, src_c)
+
+ to_keep = ~_get_seams(gray, delta_width, energy_mode, aux_energy)
+ dst = src[to_keep].reshape(dst_shape)
+ if aux_energy is not None:
+ aux_energy = aux_energy[to_keep].reshape(dst_shape[:2])
+ return dst, aux_energy
+
+
+@nb.njit(
+ nb.float32[:, :, :](nb.float32[:, :, :], nb.boolean[:, :], nb.int32), cache=True
+)
+def _insert_seams_kernel(
+ src: np.ndarray, seams: np.ndarray, delta_width: int
+) -> np.ndarray:
+ """The numba kernel for inserting seams"""
+ src_h, src_w, src_c = src.shape
+ dst = np.empty((src_h, src_w + delta_width, src_c), dtype=src.dtype)
+ for row in range(src_h):
+ dst_col = 0
+ for src_col in range(src_w):
+ if seams[row, src_col]:
+ left = src[row, max(src_col - 1, 0)]
+ right = src[row, src_col]
+ dst[row, dst_col] = (left + right) / 2
+ dst_col += 1
+ dst[row, dst_col] = src[row, src_col]
+ dst_col += 1
+ return dst
+
+
+def _insert_seams(src: np.ndarray, seams: np.ndarray, delta_width: int) -> np.ndarray:
+ """Insert multiple seams into the source image"""
+ dst = src.astype(np.float32)
+ if dst.ndim == 2:
+ dst = dst[:, :, None]
+ dst = _insert_seams_kernel(dst, seams, delta_width).astype(src.dtype)
+ if src.ndim == 2:
+ dst = dst.squeeze(-1)
+ return dst
+
+
+def _expand_width(
+ src: np.ndarray,
+ delta_width: int,
+ energy_mode: str,
+ aux_energy: Optional[np.ndarray],
+ step_ratio: float,
+) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+ """Expand the width of image by delta_width pixels"""
+ assert src.ndim in (2, 3) and delta_width >= 0
+ if not 0 < step_ratio <= 1:
+ raise ValueError(f"expect `step_ratio` to be between (0,1], got {step_ratio}")
+
+ dst = src
+ while delta_width > 0:
+ max_step_size = max(1, round(step_ratio * dst.shape[1]))
+ step_size = min(max_step_size, delta_width)
+ gray = dst if dst.ndim == 2 else _rgb2gray(dst)
+ seams = _get_seams(gray, step_size, energy_mode, aux_energy)
+ dst = _insert_seams(dst, seams, step_size)
+ if aux_energy is not None:
+ aux_energy = _insert_seams(aux_energy, seams, step_size)
+ delta_width -= step_size
+
+ return dst, aux_energy
+
+
+def _resize_width(
+ src: np.ndarray,
+ width: int,
+ energy_mode: str,
+ aux_energy: Optional[np.ndarray],
+ step_ratio: float,
+) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+ """Resize the width of image by removing vertical seams"""
+ assert src.size > 0 and src.ndim in (2, 3)
+ assert width > 0
+
+ src_w = src.shape[1]
+ if src_w < width:
+ dst, aux_energy = _expand_width(
+ src, width - src_w, energy_mode, aux_energy, step_ratio
+ )
+ else:
+ dst, aux_energy = _reduce_width(src, src_w - width, energy_mode, aux_energy)
+ return dst, aux_energy
+
+
+def _transpose_image(src: np.ndarray) -> np.ndarray:
+ """Transpose a source image in rgb or grayscale format"""
+ if src.ndim == 3:
+ dst = src.transpose((1, 0, 2))
+ else:
+ dst = src.T
+ return dst
+
+
+def _resize_height(
+ src: np.ndarray,
+ height: int,
+ energy_mode: str,
+ aux_energy: Optional[np.ndarray],
+ step_ratio: float,
+) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+ """Resize the height of image by removing horizontal seams"""
+ assert src.ndim in (2, 3) and height > 0
+ if aux_energy is not None:
+ aux_energy = aux_energy.T
+ src = _transpose_image(src)
+ src, aux_energy = _resize_width(src, height, energy_mode, aux_energy, step_ratio)
+ src = _transpose_image(src)
+ if aux_energy is not None:
+ aux_energy = aux_energy.T
+ return src, aux_energy
+
+
+def _check_mask(mask: np.ndarray, shape: Tuple[int, ...]) -> np.ndarray:
+ """Ensure the mask to be a 2D grayscale map of specific shape"""
+ mask = np.asarray(mask, dtype=bool)
+ if mask.ndim != 2:
+ raise ValueError(f"expect mask to be a 2d binary map, got shape {mask.shape}")
+ if mask.shape != shape:
+ raise ValueError(
+ f"expect the shape of mask to match the image, got {mask.shape} vs {shape}"
+ )
+ return mask
+
+
+def _check_src(src: np.ndarray) -> np.ndarray:
+ """Ensure the source to be RGB or grayscale"""
+ src = np.asarray(src)
+ if src.size == 0 or src.ndim not in (2, 3):
+ raise ValueError(
+ f"expect a 3d rgb image or a 2d grayscale image, got image in shape {src.shape}"
+ )
+ return src
+
+
+def seam_carving(
+ src: np.ndarray,
+ size: Optional[Tuple[int, int]] = None,
+ energy_mode: str = "backward",
+ order: str = "width-first",
+ keep_mask: Optional[np.ndarray] = None,
+ drop_mask: Optional[np.ndarray] = None,
+ step_ratio: float = 0.5,
+) -> np.ndarray:
+ """Resize the image using the content-aware seam-carving algorithm.
+
+ :param src: A source image in RGB or grayscale format.
+ :param size: The target size in pixels, as a 2-tuple (width, height).
+ :param energy_mode: Policy to compute energy for the source image. Could be
+ one of ``backward`` or ``forward``. If ``backward``, compute the energy
+ as the gradient at each pixel. If ``forward``, compute the energy as the
+ distances between adjacent pixels after each pixel is removed.
+ :param order: The order to remove horizontal and vertical seams. Could be
+ one of ``width-first`` or ``height-first``. In ``width-first`` mode, we
+ remove or insert all vertical seams first, then the horizontal ones,
+ while ``height-first`` is the opposite.
+ :param keep_mask: An optional mask where the foreground is protected from
+ seam removal. If not specified, no area will be protected.
+ :param drop_mask: An optional binary object mask to remove. If given, the
+ object will be removed before resizing the image to the target size.
+ :param step_ratio: The maximum size expansion ratio in one seam carving step.
+ The image will be expanded in multiple steps if target size is too large.
+ :return: A resized copy of the source image.
+ """
+ src = _check_src(src)
+
+ if order not in _list_enum(OrderMode):
+ raise ValueError(
+ f"expect order to be one of {_list_enum(OrderMode)}, got {order}"
+ )
+
+ aux_energy = None
+
+ if keep_mask is not None:
+ keep_mask = _check_mask(keep_mask, src.shape[:2])
+
+ aux_energy = np.zeros(src.shape[:2], dtype=np.float32)
+ aux_energy[keep_mask] += KEEP_MASK_ENERGY
+
+ # remove object if `drop_mask` is given
+ if drop_mask is not None:
+ drop_mask = _check_mask(drop_mask, src.shape[:2])
+
+ if aux_energy is None:
+ aux_energy = np.zeros(src.shape[:2], dtype=np.float32)
+ aux_energy[drop_mask] -= DROP_MASK_ENERGY
+
+ if order == OrderMode.HEIGHT_FIRST:
+ src = _transpose_image(src)
+ aux_energy = aux_energy.T
+
+ num_seams = (aux_energy < 0).sum(1).max()
+ while num_seams > 0:
+ src, aux_energy = _reduce_width(src, num_seams, energy_mode, aux_energy)
+ num_seams = (aux_energy < 0).sum(1).max()
+
+ if order == OrderMode.HEIGHT_FIRST:
+ src = _transpose_image(src)
+ aux_energy = aux_energy.T
+
+ # resize image if `size` is given
+ if size is not None:
+ width, height = size
+ width = round(width)
+ height = round(height)
+ if width <= 0 or height <= 0:
+ raise ValueError(f"expect target size to be positive, got {size}")
+
+ if order == OrderMode.WIDTH_FIRST:
+ src, aux_energy = _resize_width(
+ src, width, energy_mode, aux_energy, step_ratio
+ )
+ src, aux_energy = _resize_height(
+ src, height, energy_mode, aux_energy, step_ratio
+ )
+ else:
+ src, aux_energy = _resize_height(
+ src, height, energy_mode, aux_energy, step_ratio
+ )
+ src, aux_energy = _resize_width(
+ src, width, energy_mode, aux_energy, step_ratio
+ )
+
+ return src
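+
+
+# A minimal usage sketch (not part of the upstream seam-carving code; the file names are
+# hypothetical, only the `seam_carving` signature defined above is assumed):
+if __name__ == "__main__":
+    from PIL import Image
+
+    img = np.array(Image.open("example.jpg"))  # hypothetical input, shape (H, W, 3), uint8
+    h, w = img.shape[:2]
+    # shrink the width by 20% with forward energy while keeping the original height
+    carved = seam_carving(img, size=(int(w * 0.8), h), energy_mode="forward")
+    Image.fromarray(carved).save("example_carved.jpg")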
diff --git a/ComfyUI_essentials/conditioning.py b/ComfyUI_essentials/conditioning.py
new file mode 100644
index 0000000000000000000000000000000000000000..67cc356e944b30f52d237b40dce3bc9b99c1384f
--- /dev/null
+++ b/ComfyUI_essentials/conditioning.py
@@ -0,0 +1,280 @@
+from nodes import MAX_RESOLUTION, ConditioningZeroOut, ConditioningSetTimestepRange, ConditioningCombine
+import re
+
+class CLIPTextEncodeSDXLSimplified:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}),
+ "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}),
+ "size_cond_factor": ("INT", {"default": 4, "min": 1, "max": 16 }),
+ "text": ("STRING", {"multiline": True, "dynamicPrompts": True, "default": ""}),
+ "clip": ("CLIP", ),
+ }}
+ RETURN_TYPES = ("CONDITIONING",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/conditioning"
+
+ def execute(self, clip, width, height, size_cond_factor, text):
+ crop_w = 0
+ crop_h = 0
+ width = width*size_cond_factor
+ height = height*size_cond_factor
+ target_width = width
+ target_height = height
+ text_g = text_l = text
+
+ tokens = clip.tokenize(text_g)
+ tokens["l"] = clip.tokenize(text_l)["l"]
+ if len(tokens["l"]) != len(tokens["g"]):
+ empty = clip.tokenize("")
+ while len(tokens["l"]) < len(tokens["g"]):
+ tokens["l"] += empty["l"]
+ while len(tokens["l"]) > len(tokens["g"]):
+ tokens["g"] += empty["g"]
+ cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
+ return ([[cond, {"pooled_output": pooled, "width": width, "height": height, "crop_w": crop_w, "crop_h": crop_h, "target_width": target_width, "target_height": target_height}]], )
+
+class ConditioningCombineMultiple:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "conditioning_1": ("CONDITIONING",),
+ "conditioning_2": ("CONDITIONING",),
+ }, "optional": {
+ "conditioning_3": ("CONDITIONING",),
+ "conditioning_4": ("CONDITIONING",),
+ "conditioning_5": ("CONDITIONING",),
+ },
+ }
+ RETURN_TYPES = ("CONDITIONING",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/conditioning"
+
+ def execute(self, conditioning_1, conditioning_2, conditioning_3=None, conditioning_4=None, conditioning_5=None):
+ c = conditioning_1 + conditioning_2
+
+ if conditioning_3 is not None:
+ c += conditioning_3
+ if conditioning_4 is not None:
+ c += conditioning_4
+ if conditioning_5 is not None:
+ c += conditioning_5
+
+ return (c,)
+
+class SD3NegativeConditioning:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "conditioning": ("CONDITIONING",),
+ "end": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 1.0, "step": 0.001 }),
+ }}
+ RETURN_TYPES = ("CONDITIONING",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/conditioning"
+
+ def execute(self, conditioning, end):
+ zero_c = ConditioningZeroOut().zero_out(conditioning)[0]
+
+ if end == 0:
+ return (zero_c, )
+
+ c = ConditioningSetTimestepRange().set_range(conditioning, 0, end)[0]
+ zero_c = ConditioningSetTimestepRange().set_range(zero_c, end, 1.0)[0]
+ c = ConditioningCombine().combine(zero_c, c)[0]
+
+ return (c, )
+
+class FluxAttentionSeeker:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "clip": ("CLIP",),
+ "apply_to_query": ("BOOLEAN", { "default": True }),
+ "apply_to_key": ("BOOLEAN", { "default": True }),
+ "apply_to_value": ("BOOLEAN", { "default": True }),
+ "apply_to_out": ("BOOLEAN", { "default": True }),
+ **{f"clip_l_{s}": ("FLOAT", { "display": "slider", "default": 1.0, "min": 0, "max": 5, "step": 0.05 }) for s in range(12)},
+ **{f"t5xxl_{s}": ("FLOAT", { "display": "slider", "default": 1.0, "min": 0, "max": 5, "step": 0.05 }) for s in range(24)},
+ }}
+
+ RETURN_TYPES = ("CLIP",)
+ FUNCTION = "execute"
+
+ CATEGORY = "essentials/conditioning"
+
+ def execute(self, clip, apply_to_query, apply_to_key, apply_to_value, apply_to_out, **values):
+ if not apply_to_key and not apply_to_query and not apply_to_value and not apply_to_out:
+ return (clip, )
+
+ m = clip.clone()
+ sd = m.patcher.model_state_dict()
+
+ for k in sd:
+ if "self_attn" in k:
+ layer = re.search(r"\.layers\.(\d+)\.", k)
+ layer = int(layer.group(1)) if layer else None
+
+ if layer is not None and values[f"clip_l_{layer}"] != 1.0:
+ if (apply_to_query and "q_proj" in k) or (apply_to_key and "k_proj" in k) or (apply_to_value and "v_proj" in k) or (apply_to_out and "out_proj" in k):
+ m.add_patches({k: (None,)}, 0.0, values[f"clip_l_{layer}"])
+ elif "SelfAttention" in k:
+ block = re.search(r"\.block\.(\d+)\.", k)
+ block = int(block.group(1)) if block else None
+
+ if block is not None and values[f"t5xxl_{block}"] != 1.0:
+ if (apply_to_query and ".q." in k) or (apply_to_key and ".k." in k) or (apply_to_value and ".v." in k) or (apply_to_out and ".o." in k):
+ m.add_patches({k: (None,)}, 0.0, values[f"t5xxl_{block}"])
+
+ return (m, )
+
+class SD3AttentionSeekerLG:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "clip": ("CLIP",),
+ "apply_to_query": ("BOOLEAN", { "default": True }),
+ "apply_to_key": ("BOOLEAN", { "default": True }),
+ "apply_to_value": ("BOOLEAN", { "default": True }),
+ "apply_to_out": ("BOOLEAN", { "default": True }),
+ **{f"clip_l_{s}": ("FLOAT", { "display": "slider", "default": 1.0, "min": 0, "max": 5, "step": 0.05 }) for s in range(12)},
+ **{f"clip_g_{s}": ("FLOAT", { "display": "slider", "default": 1.0, "min": 0, "max": 5, "step": 0.05 }) for s in range(32)},
+ }}
+
+ RETURN_TYPES = ("CLIP",)
+ FUNCTION = "execute"
+
+ CATEGORY = "essentials/conditioning"
+
+ def execute(self, clip, apply_to_query, apply_to_key, apply_to_value, apply_to_out, **values):
+ if not apply_to_key and not apply_to_query and not apply_to_value and not apply_to_out:
+ return (clip, )
+
+ m = clip.clone()
+ sd = m.patcher.model_state_dict()
+
+ for k in sd:
+ if "self_attn" in k:
+ layer = re.search(r"\.layers\.(\d+)\.", k)
+ layer = int(layer.group(1)) if layer else None
+
+ if layer is not None:
+ if "clip_l" in k and values[f"clip_l_{layer}"] != 1.0:
+ if (apply_to_query and "q_proj" in k) or (apply_to_key and "k_proj" in k) or (apply_to_value and "v_proj" in k) or (apply_to_out and "out_proj" in k):
+ m.add_patches({k: (None,)}, 0.0, values[f"clip_l_{layer}"])
+ elif "clip_g" in k and values[f"clip_g_{layer}"] != 1.0:
+ if (apply_to_query and "q_proj" in k) or (apply_to_key and "k_proj" in k) or (apply_to_value and "v_proj" in k) or (apply_to_out and "out_proj" in k):
+ m.add_patches({k: (None,)}, 0.0, values[f"clip_g_{layer}"])
+
+ return (m, )
+
+class SD3AttentionSeekerT5:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "clip": ("CLIP",),
+ "apply_to_query": ("BOOLEAN", { "default": True }),
+ "apply_to_key": ("BOOLEAN", { "default": True }),
+ "apply_to_value": ("BOOLEAN", { "default": True }),
+ "apply_to_out": ("BOOLEAN", { "default": True }),
+ **{f"t5xxl_{s}": ("FLOAT", { "display": "slider", "default": 1.0, "min": 0, "max": 5, "step": 0.05 }) for s in range(24)},
+ }}
+
+ RETURN_TYPES = ("CLIP",)
+ FUNCTION = "execute"
+
+ CATEGORY = "essentials/conditioning"
+
+ def execute(self, clip, apply_to_query, apply_to_key, apply_to_value, apply_to_out, **values):
+ if not apply_to_key and not apply_to_query and not apply_to_value and not apply_to_out:
+ return (clip, )
+
+ m = clip.clone()
+ sd = m.patcher.model_state_dict()
+
+ for k in sd:
+ if "SelfAttention" in k:
+ block = re.search(r"\.block\.(\d+)\.", k)
+ block = int(block.group(1)) if block else None
+
+ if block is not None and values[f"t5xxl_{block}"] != 1.0:
+ if (apply_to_query and ".q." in k) or (apply_to_key and ".k." in k) or (apply_to_value and ".v." in k) or (apply_to_out and ".o." in k):
+ m.add_patches({k: (None,)}, 0.0, values[f"t5xxl_{block}"])
+
+ return (m, )
+
+class FluxBlocksBuster:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "model": ("MODEL",),
+ "blocks": ("STRING", {"default": "## 0 = 1.0\n## 1 = 1.0\n## 2 = 1.0\n## 3 = 1.0\n## 4 = 1.0\n## 5 = 1.0\n## 6 = 1.0\n## 7 = 1.0\n## 8 = 1.0\n## 9 = 1.0\n## 10 = 1.0\n## 11 = 1.0\n## 12 = 1.0\n## 13 = 1.0\n## 14 = 1.0\n## 15 = 1.0\n## 16 = 1.0\n## 17 = 1.0\n## 18 = 1.0\n# 0 = 1.0\n# 1 = 1.0\n# 2 = 1.0\n# 3 = 1.0\n# 4 = 1.0\n# 5 = 1.0\n# 6 = 1.0\n# 7 = 1.0\n# 8 = 1.0\n# 9 = 1.0\n# 10 = 1.0\n# 11 = 1.0\n# 12 = 1.0\n# 13 = 1.0\n# 14 = 1.0\n# 15 = 1.0\n# 16 = 1.0\n# 17 = 1.0\n# 18 = 1.0\n# 19 = 1.0\n# 20 = 1.0\n# 21 = 1.0\n# 22 = 1.0\n# 23 = 1.0\n# 24 = 1.0\n# 25 = 1.0\n# 26 = 1.0\n# 27 = 1.0\n# 28 = 1.0\n# 29 = 1.0\n# 30 = 1.0\n# 31 = 1.0\n# 32 = 1.0\n# 33 = 1.0\n# 34 = 1.0\n# 35 = 1.0\n# 36 = 1.0\n# 37 = 1.0", "multiline": True, "dynamicPrompts": True}),
+ #**{f"double_block_{s}": ("FLOAT", { "display": "slider", "default": 1.0, "min": 0, "max": 5, "step": 0.05 }) for s in range(19)},
+ #**{f"single_block_{s}": ("FLOAT", { "display": "slider", "default": 1.0, "min": 0, "max": 5, "step": 0.05 }) for s in range(38)},
+ }}
+ RETURN_TYPES = ("MODEL", "STRING")
+ RETURN_NAMES = ("MODEL", "patched_blocks")
+ FUNCTION = "patch"
+
+ CATEGORY = "essentials/conditioning"
+
+ def patch(self, model, blocks):
+ if blocks == "":
+        return (model, "")
+
+ m = model.clone()
+ sd = model.model_state_dict()
+ patched_blocks = []
+
+ """
+ Also compatible with the following format:
+
+ double_blocks\.0\.(img|txt)_(mod|attn|mlp)\.(lin|qkv|proj|0|2)\.(weight|bias)=1.1
+ single_blocks\.0\.(linear[12]|modulation\.lin)\.(weight|bias)=1.1
+
+ The regex is used to match the block names
+ """
+
+ blocks = blocks.split("\n")
+ blocks = [b.strip() for b in blocks if b.strip()]
+
+ for k in sd:
+ for block in blocks:
+ block = block.split("=")
+ value = float(block[1].strip()) if len(block) > 1 else 1.0
+ block = block[0].strip()
+ if block.startswith("##"):
+ block = r"double_blocks\." + block[2:].strip() + r"\.(img|txt)_(mod|attn|mlp)\.(lin|qkv|proj|0|2)\.(weight|bias)"
+ elif block.startswith("#"):
+ block = r"single_blocks\." + block[1:].strip() + r"\.(linear[12]|modulation\.lin)\.(weight|bias)"
+
+ if value != 1.0 and re.search(block, k):
+ m.add_patches({k: (None,)}, 0.0, value)
+ patched_blocks.append(f"{k}: {value}")
+
+ patched_blocks = "\n".join(patched_blocks)
+
+ return (m, patched_blocks,)
+
+
+COND_CLASS_MAPPINGS = {
+ "CLIPTextEncodeSDXL+": CLIPTextEncodeSDXLSimplified,
+ "ConditioningCombineMultiple+": ConditioningCombineMultiple,
+ "SD3NegativeConditioning+": SD3NegativeConditioning,
+ "FluxAttentionSeeker+": FluxAttentionSeeker,
+ "SD3AttentionSeekerLG+": SD3AttentionSeekerLG,
+ "SD3AttentionSeekerT5+": SD3AttentionSeekerT5,
+ "FluxBlocksBuster+": FluxBlocksBuster,
+}
+
+COND_NAME_MAPPINGS = {
+ "CLIPTextEncodeSDXL+": "🔧 SDXL CLIPTextEncode",
+ "ConditioningCombineMultiple+": "🔧 Cond Combine Multiple",
+ "SD3NegativeConditioning+": "🔧 SD3 Negative Conditioning",
+ "FluxAttentionSeeker+": "🔧 Flux Attention Seeker",
+ "SD3AttentionSeekerLG+": "🔧 SD3 Attention Seeker L/G",
+ "SD3AttentionSeekerT5+": "🔧 SD3 Attention Seeker T5",
+ "FluxBlocksBuster+": "🔧 Flux Model Blocks Buster",
+}
\ No newline at end of file
diff --git a/ComfyUI_essentials/fonts/ShareTechMono-Regular.ttf b/ComfyUI_essentials/fonts/ShareTechMono-Regular.ttf
new file mode 100644
index 0000000000000000000000000000000000000000..0ae0b19750c51a751bc45f54622443d55d643999
Binary files /dev/null and b/ComfyUI_essentials/fonts/ShareTechMono-Regular.ttf differ
diff --git a/ComfyUI_essentials/fonts/put_font_files_here.txt b/ComfyUI_essentials/fonts/put_font_files_here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ComfyUI_essentials/histogram_matching.py b/ComfyUI_essentials/histogram_matching.py
new file mode 100644
index 0000000000000000000000000000000000000000..9118b0001a2a82f8b42fe38e81e4a3cb9a84998b
--- /dev/null
+++ b/ComfyUI_essentials/histogram_matching.py
@@ -0,0 +1,87 @@
+# from MIT licensed https://github.com/nemodleo/pytorch-histogram-matching
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Histogram_Matching(nn.Module):
+ def __init__(self, differentiable=False):
+ super(Histogram_Matching, self).__init__()
+ self.differentiable = differentiable
+
+ def forward(self, dst, ref):
+ # B C
+ B, C, H, W = dst.size()
+ # assertion
+ assert dst.device == ref.device
+ # [B*C 256]
+ hist_dst = self.cal_hist(dst)
+ hist_ref = self.cal_hist(ref)
+ # [B*C 256]
+ tables = self.cal_trans_batch(hist_dst, hist_ref)
+ # [B C H W]
+ rst = dst.clone()
+ for b in range(B):
+ for c in range(C):
+                rst[b,c] = tables[b*C+c, (dst[b,c] * 255).long()]
+ # [B C H W]
+ rst /= 255.
+ return rst
+
+ def cal_hist(self, img):
+ B, C, H, W = img.size()
+ # [B*C 256]
+ if self.differentiable:
+ hists = self.soft_histc_batch(img * 255, bins=256, min=0, max=256, sigma=3*25)
+ else:
+ hists = torch.stack([torch.histc(img[b,c] * 255, bins=256, min=0, max=255) for b in range(B) for c in range(C)])
+ hists = hists.float()
+ hists = F.normalize(hists, p=1)
+ # BC 256
+ bc, n = hists.size()
+ # [B*C 256 256]
+ triu = torch.ones(bc, n, n, device=hists.device).triu()
+ # [B*C 256]
+ hists = torch.bmm(hists[:,None,:], triu)[:,0,:]
+ return hists
+
+ def soft_histc_batch(self, x, bins=256, min=0, max=256, sigma=3*25):
+ # B C H W
+ B, C, H, W = x.size()
+ # [B*C H*W]
+ x = x.view(B*C, -1)
+ # 1
+ delta = float(max - min) / float(bins)
+ # [256]
+ centers = float(min) + delta * (torch.arange(bins, device=x.device, dtype=torch.bfloat16) + 0.5)
+ # [B*C 1 H*W]
+ x = torch.unsqueeze(x, 1)
+ # [1 256 1]
+ centers = centers[None,:,None]
+ # [B*C 256 H*W]
+ x = x - centers
+ # [B*C 256 H*W]
+ x = x.type(torch.bfloat16)
+ # [B*C 256 H*W]
+ x = torch.sigmoid(sigma * (x + delta/2)) - torch.sigmoid(sigma * (x - delta/2))
+ # [B*C 256]
+ x = x.sum(dim=2)
+ # [B*C 256]
+ x = x.type(torch.float32)
+ # prevent oom
+ # torch.cuda.empty_cache()
+ return x
+
+ def cal_trans_batch(self, hist_dst, hist_ref):
+ # [B*C 256 256]
+ hist_dst = hist_dst[:,None,:].repeat(1,256,1)
+ # [B*C 256 256]
+ hist_ref = hist_ref[:,:,None].repeat(1,1,256)
+ # [B*C 256 256]
+ table = hist_dst - hist_ref
+ # [B*C 256 256]
+ table = torch.where(table>=0, 1., 0.)
+ # [B*C 256]
+ table = torch.sum(table, dim=1) - 1
+ # [B*C 256]
+ table = torch.clamp(table, min=0, max=255)
+ return table
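+
+
+# A minimal usage sketch (illustrative only; it assumes the [B, C, H, W] float tensors in the
+# 0..1 range that forward() above expects):
+if __name__ == "__main__":
+    hm = Histogram_Matching(differentiable=False)
+    dst = torch.rand(1, 3, 64, 64)  # image whose histogram will be remapped
+    ref = torch.rand(1, 3, 64, 64)  # reference image providing the target histogram
+    matched = hm(dst, ref)          # same shape as dst, values still in the 0..1 range
+    print(matched.shape)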
diff --git a/ComfyUI_essentials/image.py b/ComfyUI_essentials/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..48c8cc619a197728ff66a37f7e9b047775712950
--- /dev/null
+++ b/ComfyUI_essentials/image.py
@@ -0,0 +1,1770 @@
+from .utils import max_, min_
+from nodes import MAX_RESOLUTION
+import comfy.utils
+from nodes import SaveImage
+from node_helpers import pillow
+from PIL import Image, ImageOps
+
+import kornia
+import torch
+import torch.nn.functional as F
+import torchvision.transforms.v2 as T
+
+#import warnings
+#warnings.filterwarnings('ignore', module="torchvision")
+import math
+import os
+import numpy as np
+import folder_paths
+from pathlib import Path
+import random
+
+"""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Image analysis
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+class ImageEnhanceDifference:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image1": ("IMAGE",),
+ "image2": ("IMAGE",),
+ "exponent": ("FLOAT", { "default": 0.75, "min": 0.00, "max": 1.00, "step": 0.05, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image analysis"
+
+ def execute(self, image1, image2, exponent):
+ if image1.shape[1:] != image2.shape[1:]:
+ image2 = comfy.utils.common_upscale(image2.permute([0,3,1,2]), image1.shape[2], image1.shape[1], upscale_method='bicubic', crop='center').permute([0,2,3,1])
+
+ diff_image = image1 - image2
+ diff_image = torch.pow(diff_image, exponent)
+ diff_image = torch.clamp(diff_image, 0, 1)
+
+ return(diff_image,)
+
+"""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Batch tools
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+class ImageBatchMultiple:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image_1": ("IMAGE",),
+ "method": (["nearest-exact", "bilinear", "area", "bicubic", "lanczos"], { "default": "lanczos" }),
+ }, "optional": {
+ "image_2": ("IMAGE",),
+ "image_3": ("IMAGE",),
+ "image_4": ("IMAGE",),
+ "image_5": ("IMAGE",),
+ },
+ }
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image batch"
+
+ def execute(self, image_1, method, image_2=None, image_3=None, image_4=None, image_5=None):
+ out = image_1
+
+ if image_2 is not None:
+ if image_1.shape[1:] != image_2.shape[1:]:
+ image_2 = comfy.utils.common_upscale(image_2.movedim(-1,1), image_1.shape[2], image_1.shape[1], method, "center").movedim(1,-1)
+ out = torch.cat((image_1, image_2), dim=0)
+ if image_3 is not None:
+ if image_1.shape[1:] != image_3.shape[1:]:
+ image_3 = comfy.utils.common_upscale(image_3.movedim(-1,1), image_1.shape[2], image_1.shape[1], method, "center").movedim(1,-1)
+ out = torch.cat((out, image_3), dim=0)
+ if image_4 is not None:
+ if image_1.shape[1:] != image_4.shape[1:]:
+ image_4 = comfy.utils.common_upscale(image_4.movedim(-1,1), image_1.shape[2], image_1.shape[1], method, "center").movedim(1,-1)
+ out = torch.cat((out, image_4), dim=0)
+ if image_5 is not None:
+ if image_1.shape[1:] != image_5.shape[1:]:
+ image_5 = comfy.utils.common_upscale(image_5.movedim(-1,1), image_1.shape[2], image_1.shape[1], method, "center").movedim(1,-1)
+ out = torch.cat((out, image_5), dim=0)
+
+ return (out,)
+
+
+class ImageExpandBatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "size": ("INT", { "default": 16, "min": 1, "step": 1, }),
+ "method": (["expand", "repeat all", "repeat first", "repeat last"],)
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image batch"
+
+ def execute(self, image, size, method):
+ orig_size = image.shape[0]
+
+ if orig_size == size:
+ return (image,)
+
+ if size <= 1:
+ return (image[:size],)
+
+ if 'expand' in method:
+ out = torch.empty([size] + list(image.shape)[1:], dtype=image.dtype, device=image.device)
+ if size < orig_size:
+ scale = (orig_size - 1) / (size - 1)
+ for i in range(size):
+ out[i] = image[min(round(i * scale), orig_size - 1)]
+ else:
+ scale = orig_size / size
+ for i in range(size):
+ out[i] = image[min(math.floor((i + 0.5) * scale), orig_size - 1)]
+ elif 'all' in method:
+ out = image.repeat([math.ceil(size / image.shape[0])] + [1] * (len(image.shape) - 1))[:size]
+ elif 'first' in method:
+ if size < image.shape[0]:
+ out = image[:size]
+ else:
+ out = torch.cat([image[:1].repeat(size-image.shape[0], 1, 1, 1), image], dim=0)
+ elif 'last' in method:
+ if size < image.shape[0]:
+ out = image[:size]
+ else:
+ out = torch.cat((image, image[-1:].repeat((size-image.shape[0], 1, 1, 1))), dim=0)
+
+ return (out,)
+
+class ImageFromBatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE", ),
+ "start": ("INT", { "default": 0, "min": 0, "step": 1, }),
+ "length": ("INT", { "default": -1, "min": -1, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image batch"
+
+ def execute(self, image, start, length):
+ if length<0:
+ length = image.shape[0]
+ start = min(start, image.shape[0]-1)
+ length = min(image.shape[0]-start, length)
+ return (image[start:start + length], )
+
+
+class ImageListToBatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ INPUT_IS_LIST = True
+ CATEGORY = "essentials/image batch"
+
+ def execute(self, image):
+ shape = image[0].shape[1:3]
+ out = []
+
+ for i in range(len(image)):
+ img = image[i]
+ if image[i].shape[1:3] != shape:
+ img = comfy.utils.common_upscale(img.permute([0,3,1,2]), shape[1], shape[0], upscale_method='bicubic', crop='center').permute([0,2,3,1])
+ out.append(img)
+
+ out = torch.cat(out, dim=0)
+
+ return (out,)
+
+class ImageBatchToList:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ OUTPUT_IS_LIST = (True,)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image batch"
+
+ def execute(self, image):
+ return ([image[i].unsqueeze(0) for i in range(image.shape[0])], )
+
+
+"""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Image manipulation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+class ImageCompositeFromMaskBatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image_from": ("IMAGE", ),
+ "image_to": ("IMAGE", ),
+ "mask": ("MASK", )
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, image_from, image_to, mask):
+ frames = mask.shape[0]
+
+ if image_from.shape[1] != image_to.shape[1] or image_from.shape[2] != image_to.shape[2]:
+ image_to = comfy.utils.common_upscale(image_to.permute([0,3,1,2]), image_from.shape[2], image_from.shape[1], upscale_method='bicubic', crop='center').permute([0,2,3,1])
+
+ if frames < image_from.shape[0]:
+ image_from = image_from[:frames]
+ elif frames > image_from.shape[0]:
+ image_from = torch.cat((image_from, image_from[-1].unsqueeze(0).repeat(frames-image_from.shape[0], 1, 1, 1)), dim=0)
+
+ mask = mask.unsqueeze(3).repeat(1, 1, 1, 3)
+
+ if image_from.shape[1] != mask.shape[1] or image_from.shape[2] != mask.shape[2]:
+ mask = comfy.utils.common_upscale(mask.permute([0,3,1,2]), image_from.shape[2], image_from.shape[1], upscale_method='bicubic', crop='center').permute([0,2,3,1])
+
+ out = mask * image_to + (1 - mask) * image_from
+
+ return (out, )
+
+class ImageComposite:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "destination": ("IMAGE",),
+ "source": ("IMAGE",),
+ "x": ("INT", { "default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1 }),
+ "y": ("INT", { "default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1 }),
+ "offset_x": ("INT", { "default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1 }),
+ "offset_y": ("INT", { "default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1 }),
+ },
+ "optional": {
+ "mask": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, destination, source, x, y, offset_x, offset_y, mask=None):
+ if mask is None:
+ mask = torch.ones_like(source)[:,:,:,0]
+
+ mask = mask.unsqueeze(-1).repeat(1, 1, 1, 3)
+
+ if mask.shape[1:3] != source.shape[1:3]:
+ mask = F.interpolate(mask.permute([0, 3, 1, 2]), size=(source.shape[1], source.shape[2]), mode='bicubic')
+ mask = mask.permute([0, 2, 3, 1])
+
+ if mask.shape[0] > source.shape[0]:
+ mask = mask[:source.shape[0]]
+ elif mask.shape[0] < source.shape[0]:
+ mask = torch.cat((mask, mask[-1:].repeat((source.shape[0]-mask.shape[0], 1, 1, 1))), dim=0)
+
+ if destination.shape[0] > source.shape[0]:
+ destination = destination[:source.shape[0]]
+ elif destination.shape[0] < source.shape[0]:
+ destination = torch.cat((destination, destination[-1:].repeat((source.shape[0]-destination.shape[0], 1, 1, 1))), dim=0)
+
+ if not isinstance(x, list):
+ x = [x]
+ if not isinstance(y, list):
+ y = [y]
+
+ if len(x) < destination.shape[0]:
+ x = x + [x[-1]] * (destination.shape[0] - len(x))
+ if len(y) < destination.shape[0]:
+ y = y + [y[-1]] * (destination.shape[0] - len(y))
+
+ x = [i + offset_x for i in x]
+ y = [i + offset_y for i in y]
+
+ output = []
+ for i in range(destination.shape[0]):
+ d = destination[i].clone()
+ s = source[i]
+ m = mask[i]
+
+            # s and m are single frames shaped (H, W, C); clip them so they fit inside the destination
+            if x[i]+source.shape[2] > destination.shape[2]:
+                s = s[:, :destination.shape[2]-x[i], :]
+                m = m[:, :destination.shape[2]-x[i], :]
+            if y[i]+source.shape[1] > destination.shape[1]:
+                s = s[:destination.shape[1]-y[i], :, :]
+                m = m[:destination.shape[1]-y[i], :, :]
+
+ #output.append(s * m + d[y[i]:y[i]+s.shape[0], x[i]:x[i]+s.shape[1], :] * (1 - m))
+ d[y[i]:y[i]+s.shape[0], x[i]:x[i]+s.shape[1], :] = s * m + d[y[i]:y[i]+s.shape[0], x[i]:x[i]+s.shape[1], :] * (1 - m)
+ output.append(d)
+
+ output = torch.stack(output)
+
+ # apply the source to the destination at XY position using the mask
+ #for i in range(destination.shape[0]):
+ # output[i, y[i]:y[i]+source.shape[1], x[i]:x[i]+source.shape[2], :] = source * mask + destination[i, y[i]:y[i]+source.shape[1], x[i]:x[i]+source.shape[2], :] * (1 - mask)
+
+ #for x_, y_ in zip(x, y):
+ # output[:, y_:y_+source.shape[1], x_:x_+source.shape[2], :] = source * mask + destination[:, y_:y_+source.shape[1], x_:x_+source.shape[2], :] * (1 - mask)
+
+ #output[:, y:y+source.shape[1], x:x+source.shape[2], :] = source * mask + destination[:, y:y+source.shape[1], x:x+source.shape[2], :] * (1 - mask)
+ #output = destination * (1 - mask) + source * mask
+
+ return (output,)
+
+class ImageResize:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "width": ("INT", { "default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 1, }),
+ "height": ("INT", { "default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 1, }),
+ "interpolation": (["nearest", "bilinear", "bicubic", "area", "nearest-exact", "lanczos"],),
+ "method": (["stretch", "keep proportion", "fill / crop", "pad"],),
+ "condition": (["always", "downscale if bigger", "upscale if smaller", "if bigger area", "if smaller area"],),
+ "multiple_of": ("INT", { "default": 0, "min": 0, "max": 512, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE", "INT", "INT",)
+ RETURN_NAMES = ("IMAGE", "width", "height",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, image, width, height, method="stretch", interpolation="nearest", condition="always", multiple_of=0, keep_proportion=False):
+ _, oh, ow, _ = image.shape
+ x = y = x2 = y2 = 0
+ pad_left = pad_right = pad_top = pad_bottom = 0
+
+ if keep_proportion:
+ method = "keep proportion"
+
+ if multiple_of > 1:
+ width = width - (width % multiple_of)
+ height = height - (height % multiple_of)
+
+ if method == 'keep proportion' or method == 'pad':
+ if width == 0 and oh < height:
+ width = MAX_RESOLUTION
+ elif width == 0 and oh >= height:
+ width = ow
+
+ if height == 0 and ow < width:
+ height = MAX_RESOLUTION
+ elif height == 0 and ow >= width:
+ height = oh
+
+ ratio = min(width / ow, height / oh)
+ new_width = round(ow*ratio)
+ new_height = round(oh*ratio)
+
+ if method == 'pad':
+ pad_left = (width - new_width) // 2
+ pad_right = width - new_width - pad_left
+ pad_top = (height - new_height) // 2
+ pad_bottom = height - new_height - pad_top
+
+ width = new_width
+ height = new_height
+ elif method.startswith('fill'):
+ width = width if width > 0 else ow
+ height = height if height > 0 else oh
+
+ ratio = max(width / ow, height / oh)
+ new_width = round(ow*ratio)
+ new_height = round(oh*ratio)
+ x = (new_width - width) // 2
+ y = (new_height - height) // 2
+ x2 = x + width
+ y2 = y + height
+ if x2 > new_width:
+ x -= (x2 - new_width)
+ if x < 0:
+ x = 0
+ if y2 > new_height:
+ y -= (y2 - new_height)
+ if y < 0:
+ y = 0
+ width = new_width
+ height = new_height
+ else:
+ width = width if width > 0 else ow
+ height = height if height > 0 else oh
+
+ if "always" in condition \
+ or ("downscale if bigger" == condition and (oh > height or ow > width)) or ("upscale if smaller" == condition and (oh < height or ow < width)) \
+ or ("bigger area" in condition and (oh * ow > height * width)) or ("smaller area" in condition and (oh * ow < height * width)):
+
+ outputs = image.permute(0,3,1,2)
+
+ if interpolation == "lanczos":
+ outputs = comfy.utils.lanczos(outputs, width, height)
+ else:
+ outputs = F.interpolate(outputs, size=(height, width), mode=interpolation)
+
+ if method == 'pad':
+ if pad_left > 0 or pad_right > 0 or pad_top > 0 or pad_bottom > 0:
+ outputs = F.pad(outputs, (pad_left, pad_right, pad_top, pad_bottom), value=0)
+
+ outputs = outputs.permute(0,2,3,1)
+
+ if method.startswith('fill'):
+ if x > 0 or y > 0 or x2 > 0 or y2 > 0:
+ outputs = outputs[:, y:y2, x:x2, :]
+ else:
+ outputs = image
+
+ if multiple_of > 1 and (outputs.shape[2] % multiple_of != 0 or outputs.shape[1] % multiple_of != 0):
+ width = outputs.shape[2]
+ height = outputs.shape[1]
+ x = (width % multiple_of) // 2
+ y = (height % multiple_of) // 2
+ x2 = width - ((width % multiple_of) - x)
+ y2 = height - ((height % multiple_of) - y)
+ outputs = outputs[:, y:y2, x:x2, :]
+
+ outputs = torch.clamp(outputs, 0, 1)
+
+ return(outputs, outputs.shape[2], outputs.shape[1],)
+
+class ImageFlip:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "axis": (["x", "y", "xy"],),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, image, axis):
+ dim = ()
+ if "y" in axis:
+ dim += (1,)
+ if "x" in axis:
+ dim += (2,)
+ image = torch.flip(image, dim)
+
+ return(image,)
+
+class ImageCrop:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "width": ("INT", { "default": 256, "min": 0, "max": MAX_RESOLUTION, "step": 8, }),
+ "height": ("INT", { "default": 256, "min": 0, "max": MAX_RESOLUTION, "step": 8, }),
+ "position": (["top-left", "top-center", "top-right", "right-center", "bottom-right", "bottom-center", "bottom-left", "left-center", "center"],),
+ "x_offset": ("INT", { "default": 0, "min": -99999, "step": 1, }),
+ "y_offset": ("INT", { "default": 0, "min": -99999, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE","INT","INT",)
+ RETURN_NAMES = ("IMAGE","x","y",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, image, width, height, position, x_offset, y_offset):
+ _, oh, ow, _ = image.shape
+
+ width = min(ow, width)
+ height = min(oh, height)
+
+ if "center" in position:
+ x = round((ow-width) / 2)
+ y = round((oh-height) / 2)
+ if "top" in position:
+ y = 0
+ if "bottom" in position:
+ y = oh-height
+ if "left" in position:
+ x = 0
+ if "right" in position:
+ x = ow-width
+
+ x += x_offset
+ y += y_offset
+
+ x2 = x+width
+ y2 = y+height
+
+ if x2 > ow:
+ x2 = ow
+ if x < 0:
+ x = 0
+ if y2 > oh:
+ y2 = oh
+ if y < 0:
+ y = 0
+
+ image = image[:, y:y2, x:x2, :]
+
+ return(image, x, y, )
+
+class ImageTile:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "rows": ("INT", { "default": 2, "min": 1, "max": 256, "step": 1, }),
+ "cols": ("INT", { "default": 2, "min": 1, "max": 256, "step": 1, }),
+ "overlap": ("FLOAT", { "default": 0, "min": 0, "max": 0.5, "step": 0.01, }),
+ "overlap_x": ("INT", { "default": 0, "min": 0, "max": MAX_RESOLUTION//2, "step": 1, }),
+ "overlap_y": ("INT", { "default": 0, "min": 0, "max": MAX_RESOLUTION//2, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE", "INT", "INT", "INT", "INT")
+ RETURN_NAMES = ("IMAGE", "tile_width", "tile_height", "overlap_x", "overlap_y",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, image, rows, cols, overlap, overlap_x, overlap_y):
+ h, w = image.shape[1:3]
+ tile_h = h // rows
+ tile_w = w // cols
+ h = tile_h * rows
+ w = tile_w * cols
+ overlap_h = int(tile_h * overlap) + overlap_y
+ overlap_w = int(tile_w * overlap) + overlap_x
+
+ # max overlap is half of the tile size
+ overlap_h = min(tile_h // 2, overlap_h)
+ overlap_w = min(tile_w // 2, overlap_w)
+
+ if rows == 1:
+ overlap_h = 0
+ if cols == 1:
+ overlap_w = 0
+
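+ # slice the image into overlapping tiles, clamping each tile to the image border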
+ tiles = []
+ for i in range(rows):
+ for j in range(cols):
+ y1 = i * tile_h
+ x1 = j * tile_w
+
+ if i > 0:
+ y1 -= overlap_h
+ if j > 0:
+ x1 -= overlap_w
+
+ y2 = y1 + tile_h + overlap_h
+ x2 = x1 + tile_w + overlap_w
+
+ if y2 > h:
+ y2 = h
+ y1 = y2 - tile_h - overlap_h
+ if x2 > w:
+ x2 = w
+ x1 = x2 - tile_w - overlap_w
+
+ tiles.append(image[:, y1:y2, x1:x2, :])
+ tiles = torch.cat(tiles, dim=0)
+
+ return(tiles, tile_w+overlap_w, tile_h+overlap_h, overlap_w, overlap_h,)
+
+class ImageUntile:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "tiles": ("IMAGE",),
+ "overlap_x": ("INT", { "default": 0, "min": 0, "max": MAX_RESOLUTION//2, "step": 1, }),
+ "overlap_y": ("INT", { "default": 0, "min": 0, "max": MAX_RESOLUTION//2, "step": 1, }),
+ "rows": ("INT", { "default": 2, "min": 1, "max": 256, "step": 1, }),
+ "cols": ("INT", { "default": 2, "min": 1, "max": 256, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, tiles, overlap_x, overlap_y, rows, cols):
+ tile_h, tile_w = tiles.shape[1:3]
+ tile_h -= overlap_y
+ tile_w -= overlap_x
+ out_w = cols * tile_w
+ out_h = rows * tile_h
+
+ out = torch.zeros((1, out_h, out_w, tiles.shape[3]), device=tiles.device, dtype=tiles.dtype)
+
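+ # paste the tiles back, blending the overlapping regions with a linear feather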
+ for i in range(rows):
+ for j in range(cols):
+ y1 = i * tile_h
+ x1 = j * tile_w
+
+ if i > 0:
+ y1 -= overlap_y
+ if j > 0:
+ x1 -= overlap_x
+
+ y2 = y1 + tile_h + overlap_y
+ x2 = x1 + tile_w + overlap_x
+
+ if y2 > out_h:
+ y2 = out_h
+ y1 = y2 - tile_h - overlap_y
+ if x2 > out_w:
+ x2 = out_w
+ x1 = x2 - tile_w - overlap_x
+
+ mask = torch.ones((1, tile_h+overlap_y, tile_w+overlap_x), device=tiles.device, dtype=tiles.dtype)
+
+ # feather the overlap on top
+ if i > 0 and overlap_y > 0:
+ mask[:, :overlap_y, :] *= torch.linspace(0, 1, overlap_y, device=tiles.device, dtype=tiles.dtype).unsqueeze(1)
+ # feather the overlap on bottom
+ #if i < rows - 1:
+ # mask[:, -overlap_y:, :] *= torch.linspace(1, 0, overlap_y, device=tiles.device, dtype=tiles.dtype).unsqueeze(1)
+ # feather the overlap on left
+ if j > 0 and overlap_x > 0:
+ mask[:, :, :overlap_x] *= torch.linspace(0, 1, overlap_x, device=tiles.device, dtype=tiles.dtype).unsqueeze(0)
+ # feather the overlap on right
+ #if j < cols - 1:
+ # mask[:, :, -overlap_x:] *= torch.linspace(1, 0, overlap_x, device=tiles.device, dtype=tiles.dtype).unsqueeze(0)
+
+ mask = mask.unsqueeze(-1).repeat(1, 1, 1, tiles.shape[3])
+ tile = tiles[i * cols + j] * mask
+ out[:, y1:y2, x1:x2, :] = out[:, y1:y2, x1:x2, :] * (1 - mask) + tile
+ return(out, )
+
+class ImageSeamCarving:
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "width": ("INT", { "default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1, }),
+ "height": ("INT", { "default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1, }),
+ "energy": (["backward", "forward"],),
+ "order": (["width-first", "height-first"],),
+ },
+ "optional": {
+ "keep_mask": ("MASK",),
+ "drop_mask": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ CATEGORY = "essentials/image manipulation"
+ FUNCTION = "execute"
+
+ def execute(self, image, width, height, energy, order, keep_mask=None, drop_mask=None):
+ from .carve import seam_carving
+
+ img = image.permute([0, 3, 1, 2])
+
+ if keep_mask is not None:
+ #keep_mask = keep_mask.reshape((-1, 1, keep_mask.shape[-2], keep_mask.shape[-1])).movedim(1, -1)
+ keep_mask = keep_mask.unsqueeze(1)
+
+ if keep_mask.shape[2] != img.shape[2] or keep_mask.shape[3] != img.shape[3]:
+ keep_mask = F.interpolate(keep_mask, size=(img.shape[2], img.shape[3]), mode="bilinear")
+ if drop_mask is not None:
+ drop_mask = drop_mask.unsqueeze(1)
+
+ if drop_mask.shape[2] != img.shape[2] or drop_mask.shape[3] != img.shape[3]:
+ drop_mask = F.interpolate(drop_mask, size=(img.shape[2], img.shape[3]), mode="bilinear")
+
+ out = []
+ for i in range(img.shape[0]):
+ resized = seam_carving(
+ T.ToPILImage()(img[i]),
+ size=(width, height),
+ energy_mode=energy,
+ order=order,
+ keep_mask=T.ToPILImage()(keep_mask[i]) if keep_mask is not None else None,
+ drop_mask=T.ToPILImage()(drop_mask[i]) if drop_mask is not None else None,
+ )
+ out.append(T.ToTensor()(resized))
+
+ out = torch.stack(out).permute([0, 2, 3, 1])
+
+ return(out, )
+
+class ImageRandomTransform:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+ "repeat": ("INT", { "default": 1, "min": 1, "max": 256, "step": 1, }),
+ "variation": ("FLOAT", { "default": 0.1, "min": 0.0, "max": 1.0, "step": 0.05, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, image, seed, repeat, variation):
+ h, w = image.shape[1:3]
+ image = image.repeat(repeat, 1, 1, 1).permute([0, 3, 1, 2])
+
+ distortion = 0.2 * variation
+ rotation = 5 * variation
+ brightness = 0.5 * variation
+ contrast = 0.5 * variation
+ saturation = 0.5 * variation
+ hue = 0.2 * variation
+ scale = 0.5 * variation
+
+ torch.manual_seed(seed)
+
+ out = []
+ for i in image:
+ transforms = T.Compose([
+ T.RandomPerspective(distortion_scale=distortion, p=0.5),
+ T.RandomRotation(degrees=rotation, interpolation=T.InterpolationMode.BILINEAR, expand=True),
+ T.ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=(-hue, hue)),
+ T.RandomHorizontalFlip(p=0.5),
+ T.RandomResizedCrop((h, w), scale=(1-scale, 1+scale), ratio=(w/h, w/h), interpolation=T.InterpolationMode.BICUBIC),
+ ])
+ out.append(transforms(i.unsqueeze(0)))
+
+ out = torch.cat(out, dim=0).permute([0, 2, 3, 1]).clamp(0, 1)
+
+ return (out,)
+
+class RemBGSession:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "model": (["u2net: general purpose", "u2netp: lightweight general purpose", "u2net_human_seg: human segmentation", "u2net_cloth_seg: cloths Parsing", "silueta: very small u2net", "isnet-general-use: general purpose", "isnet-anime: anime illustrations", "sam: general purpose"],),
+ "providers": (['CPU', 'CUDA', 'ROCM', 'DirectML', 'OpenVINO', 'CoreML', 'Tensorrt', 'Azure'],),
+ },
+ }
+
+ RETURN_TYPES = ("REMBG_SESSION",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, model, providers):
+ from rembg import new_session, remove
+
+ model = model.split(":")[0]
+
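+ # small wrapper exposing a uniform .process() API; the provider string becomes an ONNX Runtime execution provider (e.g. "CUDA" -> "CUDAExecutionProvider")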
+ class Session:
+ def __init__(self, model, providers):
+ self.session = new_session(model, providers=[providers+"ExecutionProvider"])
+ def process(self, image):
+ return remove(image, session=self.session)
+
+ return (Session(model, providers),)
+
+class TransparentBGSession:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mode": (["base", "fast", "base-nightly"],),
+ "use_jit": ("BOOLEAN", { "default": True }),
+ },
+ }
+
+ RETURN_TYPES = ("REMBG_SESSION",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, mode, use_jit):
+ from transparent_background import Remover
+
+ class Session:
+ def __init__(self, mode, use_jit):
+ self.session = Remover(mode=mode, jit=use_jit)
+ def process(self, image):
+ return self.session.process(image)
+
+ return (Session(mode, use_jit),)
+
+class ImageRemoveBackground:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "rembg_session": ("REMBG_SESSION",),
+ "image": ("IMAGE",),
+ },
+ }
+
+ RETURN_TYPES = ("IMAGE", "MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image manipulation"
+
+ def execute(self, rembg_session, image):
+ image = image.permute([0, 3, 1, 2])
+ output = []
+ for img in image:
+ img = T.ToPILImage()(img)
+ img = rembg_session.process(img)
+ output.append(T.ToTensor()(img))
+
+ output = torch.stack(output, dim=0)
+ output = output.permute([0, 2, 3, 1])
+ mask = output[:, :, :, 3] if output.shape[3] == 4 else torch.ones_like(output[:, :, :, 0])
+ # output = output[:, :, :, :3]
+
+ return(output, mask,)
+
+"""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Image processing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+class ImageDesaturate:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "factor": ("FLOAT", { "default": 1.00, "min": 0.00, "max": 1.00, "step": 0.05, }),
+ "method": (["luminance (Rec.709)", "luminance (Rec.601)", "average", "lightness"],),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image processing"
+
+ def execute(self, image, factor, method):
+ if method == "luminance (Rec.709)":
+ grayscale = 0.2126 * image[..., 0] + 0.7152 * image[..., 1] + 0.0722 * image[..., 2]
+ elif method == "luminance (Rec.601)":
+ grayscale = 0.299 * image[..., 0] + 0.587 * image[..., 1] + 0.114 * image[..., 2]
+ elif method == "average":
+ grayscale = image.mean(dim=3)
+ elif method == "lightness":
+ grayscale = (torch.max(image, dim=3)[0] + torch.min(image, dim=3)[0]) / 2
+
+ grayscale = (1.0 - factor) * image + factor * grayscale.unsqueeze(-1).repeat(1, 1, 1, 3)
+ grayscale = torch.clamp(grayscale, 0, 1)
+
+ return(grayscale,)
+
+class PixelOEPixelize:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "downscale_mode": (["contrast", "bicubic", "nearest", "center", "k-centroid"],),
+ "target_size": ("INT", { "default": 128, "min": 0, "max": MAX_RESOLUTION, "step": 8 }),
+ "patch_size": ("INT", { "default": 16, "min": 4, "max": 32, "step": 2 }),
+ "thickness": ("INT", { "default": 2, "min": 1, "max": 16, "step": 1 }),
+ "color_matching": ("BOOLEAN", { "default": True }),
+ "upscale": ("BOOLEAN", { "default": True }),
+ #"contrast": ("FLOAT", { "default": 1.0, "min": 0.0, "max": 100.0, "step": 0.1 }),
+ #"saturation": ("FLOAT", { "default": 1.0, "min": 0.0, "max": 100.0, "step": 0.1 }),
+ },
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image processing"
+
+ def execute(self, image, downscale_mode, target_size, patch_size, thickness, color_matching, upscale):
+ from pixeloe.pixelize import pixelize
+
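+ # convert the 0-1 float tensor to 8-bit numpy arrays before handing the frames to pixeloe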
+ image = image.clone().mul(255).clamp(0, 255).byte().cpu().numpy()
+ output = []
+ for img in image:
+ img = pixelize(img,
+ mode=downscale_mode,
+ target_size=target_size,
+ patch_size=patch_size,
+ thickness=thickness,
+ contrast=1.0,
+ saturation=1.0,
+ color_matching=color_matching,
+ no_upscale=not upscale)
+ output.append(T.ToTensor()(img))
+
+ output = torch.stack(output, dim=0).permute([0, 2, 3, 1])
+
+ return(output,)
+
+class ImagePosterize:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "threshold": ("FLOAT", { "default": 0.50, "min": 0.00, "max": 1.00, "step": 0.05, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image processing"
+
+ def execute(self, image, threshold):
+ image = image.mean(dim=3, keepdim=True)
+ image = (image > threshold).float()
+ image = image.repeat(1, 1, 1, 3)
+
+ return(image,)
+
+# From https://github.com/yoonsikp/pycubelut/blob/master/pycubelut.py (MIT license)
+class ImageApplyLUT:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "lut_file": (folder_paths.get_filename_list("luts"),),
+ "gamma_correction": ("BOOLEAN", { "default": True }),
+ "clip_values": ("BOOLEAN", { "default": True }),
+ "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.1 }),
+ }}
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image processing"
+
+ # TODO: check if we can do without numpy
+ def execute(self, image, lut_file, gamma_correction, clip_values, strength):
+ lut_file_path = folder_paths.get_full_path("luts", lut_file)
+ if not lut_file_path or not Path(lut_file_path).exists():
+ print(f"Could not find LUT file: {lut_file_path}")
+ return (image,)
+
+ from colour.io.luts.iridas_cube import read_LUT_IridasCube
+
+ device = image.device
+ lut = read_LUT_IridasCube(lut_file_path)
+ lut.name = lut_file
+
+ if clip_values:
+ if lut.domain[0].max() == lut.domain[0].min() and lut.domain[1].max() == lut.domain[1].min():
+ lut.table = np.clip(lut.table, lut.domain[0, 0], lut.domain[1, 0])
+ else:
+ if len(lut.table.shape) == 2: # 3x1D
+ for dim in range(3):
+ lut.table[:, dim] = np.clip(lut.table[:, dim], lut.domain[0, dim], lut.domain[1, dim])
+ else: # 3D
+ for dim in range(3):
+ lut.table[:, :, :, dim] = np.clip(lut.table[:, :, :, dim], lut.domain[0, dim], lut.domain[1, dim])
+
+ out = []
+ for img in image: # TODO: is this more resource efficient? should we use a batch instead?
+ lut_img = img.cpu().numpy().copy()
+
+ is_non_default_domain = not np.array_equal(lut.domain, np.array([[0., 0., 0.], [1., 1., 1.]]))
+ dom_scale = None
+ if is_non_default_domain:
+ dom_scale = lut.domain[1] - lut.domain[0]
+ lut_img = lut_img * dom_scale + lut.domain[0]
+ if gamma_correction:
+ lut_img = lut_img ** (1/2.2)
+ lut_img = lut.apply(lut_img)
+ if gamma_correction:
+ lut_img = lut_img ** (2.2)
+ if is_non_default_domain:
+ lut_img = (lut_img - lut.domain[0]) / dom_scale
+
+ lut_img = torch.from_numpy(lut_img).to(device)
+ if strength < 1.0:
+ lut_img = strength * lut_img + (1 - strength) * img
+ out.append(lut_img)
+
+ out = torch.stack(out)
+
+ return (out, )
+
+# From https://github.com/Jamy-L/Pytorch-Contrast-Adaptive-Sharpening/
+class ImageCAS:
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "amount": ("FLOAT", {"default": 0.8, "min": 0, "max": 1, "step": 0.05}),
+ },
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ CATEGORY = "essentials/image processing"
+ FUNCTION = "execute"
+
+ def execute(self, image, amount):
+ epsilon = 1e-5
+ img = F.pad(image.permute([0,3,1,2]), pad=(1, 1, 1, 1))
+
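+ # sample the 3x3 neighborhood (a..i) of every pixel from the padded image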
+ a = img[..., :-2, :-2]
+ b = img[..., :-2, 1:-1]
+ c = img[..., :-2, 2:]
+ d = img[..., 1:-1, :-2]
+ e = img[..., 1:-1, 1:-1]
+ f = img[..., 1:-1, 2:]
+ g = img[..., 2:, :-2]
+ h = img[..., 2:, 1:-1]
+ i = img[..., 2:, 2:]
+
+ # Computing contrast
+ cross = (b, d, e, f, h)
+ mn = min_(cross)
+ mx = max_(cross)
+
+ diag = (a, c, g, i)
+ mn2 = min_(diag)
+ mx2 = max_(diag)
+ mx = mx + mx2
+ mn = mn + mn2
+
+ # Computing local weight
+ inv_mx = torch.reciprocal(mx + epsilon)
+ amp = inv_mx * torch.minimum(mn, (2 - mx))
+
+ # scaling
+ amp = torch.sqrt(amp)
+ w = - amp * (amount * (1/5 - 1/8) + 1/8)
+ div = torch.reciprocal(1 + 4*w)
+
+ output = ((b + d + f + h)*w + e) * div
+ output = output.clamp(0, 1)
+ #output = torch.nan_to_num(output)
+
+ output = output.permute([0,2,3,1])
+
+ return (output,)
+
+class ImageSmartSharpen:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "noise_radius": ("INT", { "default": 7, "min": 1, "max": 25, "step": 1, }),
+ "preserve_edges": ("FLOAT", { "default": 0.75, "min": 0.0, "max": 1.0, "step": 0.05 }),
+ "sharpen": ("FLOAT", { "default": 5.0, "min": 0.0, "max": 25.0, "step": 0.5 }),
+ "ratio": ("FLOAT", { "default": 0.5, "min": 0.0, "max": 1.0, "step": 0.1 }),
+ }}
+
+ RETURN_TYPES = ("IMAGE",)
+ CATEGORY = "essentials/image processing"
+ FUNCTION = "execute"
+
+ def execute(self, image, noise_radius, preserve_edges, sharpen, ratio):
+ import cv2
+
+ output = []
+ #diagonal = np.sqrt(image.shape[1]**2 + image.shape[2]**2)
+ if preserve_edges > 0:
+ preserve_edges = max(1 - preserve_edges, 0.05)
+
+ for img in image:
+ if noise_radius > 1:
+ sigma = 0.3 * ((noise_radius - 1) * 0.5 - 1) + 0.8 # this is what pytorch uses for blur
+ #sigma_color = preserve_edges * (diagonal / 2048)
+ blurred = cv2.bilateralFilter(img.cpu().numpy(), noise_radius, preserve_edges, sigma)
+ blurred = torch.from_numpy(blurred)
+ else:
+ blurred = img
+
+ if sharpen > 0:
+ sharpened = kornia.enhance.sharpness(img.permute(2,0,1), sharpen).permute(1,2,0)
+ else:
+ sharpened = img
+
+ img = ratio * sharpened + (1 - ratio) * blurred
+ img = torch.clamp(img, 0, 1)
+ output.append(img)
+
+ del blurred, sharpened
+ output = torch.stack(output)
+
+ return (output,)
+
+
+class ExtractKeyframes:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "threshold": ("FLOAT", { "default": 0.85, "min": 0.00, "max": 1.00, "step": 0.01, }),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE", "STRING")
+ RETURN_NAMES = ("KEYFRAMES", "indexes")
+
+ FUNCTION = "execute"
+ CATEGORY = "essentials"
+
+ def execute(self, image, threshold):
+ window_size = 2
+
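+ # frame-to-frame variation is the sum of absolute pixel differences; frames above the requested quantile are kept as keyframes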
+ variations = torch.sum(torch.abs(image[1:] - image[:-1]), dim=[1, 2, 3])
+ #variations = torch.sum((image[1:] - image[:-1]) ** 2, dim=[1, 2, 3])
+ threshold = torch.quantile(variations.float(), threshold).item()
+
+ keyframes = []
+ for i in range(image.shape[0] - window_size + 1):
+ window = image[i:i + window_size]
+ variation = torch.sum(torch.abs(window[-1] - window[0])).item()
+
+ if variation > threshold:
+ keyframes.append(i + window_size - 1)
+
+ return (image[keyframes], ','.join(map(str, keyframes)),)
+
+class ImageColorMatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "reference": ("IMAGE",),
+ "color_space": (["LAB", "YCbCr", "RGB", "LUV", "YUV", "XYZ"],),
+ "factor": ("FLOAT", { "default": 1.0, "min": 0.0, "max": 1.0, "step": 0.05, }),
+ "device": (["auto", "cpu", "gpu"],),
+ "batch_size": ("INT", { "default": 0, "min": 0, "max": 1024, "step": 1, }),
+ },
+ "optional": {
+ "reference_mask": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image processing"
+
+ def execute(self, image, reference, color_space, factor, device, batch_size, reference_mask=None):
+ if "gpu" == device:
+ device = comfy.model_management.get_torch_device()
+ elif "auto" == device:
+ device = comfy.model_management.intermediate_device()
+ else:
+ device = 'cpu'
+
+ image = image.permute([0, 3, 1, 2])
+ reference = reference.permute([0, 3, 1, 2]).to(device)
+
+ # Ensure reference_mask is in the correct format and on the right device
+ if reference_mask is not None:
+ assert reference_mask.ndim == 3, f"Expected reference_mask to have 3 dimensions, but got {reference_mask.ndim}"
+ assert reference_mask.shape[0] == reference.shape[0], f"Frame count mismatch: reference_mask has {reference_mask.shape[0]} frames, but reference has {reference.shape[0]}"
+
+ # Reshape mask to (batch, 1, height, width)
+ reference_mask = reference_mask.unsqueeze(1).to(device)
+
+ # Ensure the mask is binary (0 or 1)
+ reference_mask = (reference_mask > 0.5).float()
+
+ # Ensure spatial dimensions match
+ if reference_mask.shape[2:] != reference.shape[2:]:
+ reference_mask = comfy.utils.common_upscale(
+ reference_mask,
+ reference.shape[3], reference.shape[2],
+ upscale_method='bicubic',
+ crop='center'
+ )
+
+ if batch_size == 0 or batch_size > image.shape[0]:
+ batch_size = image.shape[0]
+
+ if "LAB" == color_space:
+ reference = kornia.color.rgb_to_lab(reference)
+ elif "YCbCr" == color_space:
+ reference = kornia.color.rgb_to_ycbcr(reference)
+ elif "LUV" == color_space:
+ reference = kornia.color.rgb_to_luv(reference)
+ elif "YUV" == color_space:
+ reference = kornia.color.rgb_to_yuv(reference)
+ elif "XYZ" == color_space:
+ reference = kornia.color.rgb_to_xyz(reference)
+
+ reference_mean, reference_std = self.compute_mean_std(reference, reference_mask)
+
+ image_batch = torch.split(image, batch_size, dim=0)
+ output = []
+
+ for image in image_batch:
+ image = image.to(device)
+
+ if color_space == "LAB":
+ image = kornia.color.rgb_to_lab(image)
+ elif color_space == "YCbCr":
+ image = kornia.color.rgb_to_ycbcr(image)
+ elif color_space == "LUV":
+ image = kornia.color.rgb_to_luv(image)
+ elif color_space == "YUV":
+ image = kornia.color.rgb_to_yuv(image)
+ elif color_space == "XYZ":
+ image = kornia.color.rgb_to_xyz(image)
+
+ image_mean, image_std = self.compute_mean_std(image)
+
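+ # channel-wise statistics transfer: normalize by the image's mean/std, then rescale to the reference's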
+ matched = torch.nan_to_num((image - image_mean) / image_std) * torch.nan_to_num(reference_std) + reference_mean
+ matched = factor * matched + (1 - factor) * image
+
+ if color_space == "LAB":
+ matched = kornia.color.lab_to_rgb(matched)
+ elif color_space == "YCbCr":
+ matched = kornia.color.ycbcr_to_rgb(matched)
+ elif color_space == "LUV":
+ matched = kornia.color.luv_to_rgb(matched)
+ elif color_space == "YUV":
+ matched = kornia.color.yuv_to_rgb(matched)
+ elif color_space == "XYZ":
+ matched = kornia.color.xyz_to_rgb(matched)
+
+ out = matched.permute([0, 2, 3, 1]).clamp(0, 1).to(comfy.model_management.intermediate_device())
+ output.append(out)
+
+ out = None
+ output = torch.cat(output, dim=0)
+ return (output,)
+
+ def compute_mean_std(self, tensor, mask=None):
+ if mask is not None:
+ # Apply mask to the tensor
+ masked_tensor = tensor * mask
+
+ # Calculate the sum of the mask for each channel
+ mask_sum = mask.sum(dim=[2, 3], keepdim=True)
+
+ # Avoid division by zero
+ mask_sum = torch.clamp(mask_sum, min=1e-6)
+
+ # Calculate mean and std only for masked area
+ mean = torch.nan_to_num(masked_tensor.sum(dim=[2, 3], keepdim=True) / mask_sum)
+ std = torch.sqrt(torch.nan_to_num(((masked_tensor - mean) ** 2 * mask).sum(dim=[2, 3], keepdim=True) / mask_sum))
+ else:
+ mean = tensor.mean(dim=[2, 3], keepdim=True)
+ std = tensor.std(dim=[2, 3], keepdim=True)
+ return mean, std
+
+class ImageColorMatchAdobe(ImageColorMatch):
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "reference": ("IMAGE",),
+ "color_space": (["RGB", "LAB"],),
+ "luminance_factor": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 2.0, "step": 0.05}),
+ "color_intensity_factor": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 2.0, "step": 0.05}),
+ "fade_factor": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.05}),
+ "neutralization_factor": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.05}),
+ "device": (["auto", "cpu", "gpu"],),
+ },
+ "optional": {
+ "reference_mask": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image processing"
+
+ def analyze_color_statistics(self, image, mask=None):
+ # Assuming image is in RGB format
+ l, a, b = kornia.color.rgb_to_lab(image).chunk(3, dim=1)
+
+ if mask is not None:
+ # Ensure mask is binary and has the same spatial dimensions as the image
+ mask = F.interpolate(mask, size=image.shape[2:], mode='nearest')
+ mask = (mask > 0.5).float()
+
+ # Apply mask to each channel
+ l = l * mask
+ a = a * mask
+ b = b * mask
+
+ # Compute masked mean and std
+ num_pixels = mask.sum()
+ mean_l = (l * mask).sum() / num_pixels
+ mean_a = (a * mask).sum() / num_pixels
+ mean_b = (b * mask).sum() / num_pixels
+ std_l = torch.sqrt(((l - mean_l)**2 * mask).sum() / num_pixels)
+ var_ab = ((a - mean_a)**2 + (b - mean_b)**2) * mask
+ std_ab = torch.sqrt(var_ab.sum() / num_pixels)
+ else:
+ mean_l = l.mean()
+ std_l = l.std()
+ mean_a = a.mean()
+ mean_b = b.mean()
+ std_ab = torch.sqrt(a.var() + b.var())
+
+ return mean_l, std_l, mean_a, mean_b, std_ab
+
+ def apply_color_transformation(self, image, source_stats, dest_stats, L, C, N):
+ l, a, b = kornia.color.rgb_to_lab(image).chunk(3, dim=1)
+
+ # Unpack statistics
+ src_mean_l, src_std_l, src_mean_a, src_mean_b, src_std_ab = source_stats
+ dest_mean_l, dest_std_l, dest_mean_a, dest_mean_b, dest_std_ab = dest_stats
+
+ # Adjust luminance
+ l_new = (l - dest_mean_l) * (src_std_l / dest_std_l) * L + src_mean_l
+
+ # Neutralize color cast
+ a = a - N * dest_mean_a
+ b = b - N * dest_mean_b
+
+ # Adjust color intensity
+ a_new = a * (src_std_ab / dest_std_ab) * C
+ b_new = b * (src_std_ab / dest_std_ab) * C
+
+ # Combine channels
+ lab_new = torch.cat([l_new, a_new, b_new], dim=1)
+
+ # Convert back to RGB
+ rgb_new = kornia.color.lab_to_rgb(lab_new)
+
+ return rgb_new
+
+ def execute(self, image, reference, color_space, luminance_factor, color_intensity_factor, fade_factor, neutralization_factor, device, reference_mask=None):
+ if "gpu" == device:
+ device = comfy.model_management.get_torch_device()
+ elif "auto" == device:
+ device = comfy.model_management.intermediate_device()
+ else:
+ device = 'cpu'
+
+ # Ensure image and reference are in the correct shape (B, C, H, W)
+ image = image.permute(0, 3, 1, 2).to(device)
+ reference = reference.permute(0, 3, 1, 2).to(device)
+
+ # Handle reference_mask (if provided)
+ if reference_mask is not None:
+ # Ensure reference_mask is 4D (B, 1, H, W)
+ if reference_mask.ndim == 2:
+ reference_mask = reference_mask.unsqueeze(0).unsqueeze(0)
+ elif reference_mask.ndim == 3:
+ reference_mask = reference_mask.unsqueeze(1)
+ reference_mask = reference_mask.to(device)
+
+ # Analyze color statistics
+ source_stats = self.analyze_color_statistics(reference, reference_mask)
+ dest_stats = self.analyze_color_statistics(image)
+
+ # Apply color transformation
+ transformed = self.apply_color_transformation(
+ image, source_stats, dest_stats,
+ luminance_factor, color_intensity_factor, neutralization_factor
+ )
+
+ # Apply fade factor
+ result = fade_factor * transformed + (1 - fade_factor) * image
+
+ # Convert back to (B, H, W, C) format and ensure values are in [0, 1] range
+ result = result.permute(0, 2, 3, 1).clamp(0, 1).to(comfy.model_management.intermediate_device())
+
+ return (result,)
+
+
+class ImageHistogramMatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "reference": ("IMAGE",),
+ "method": (["pytorch", "skimage"],),
+ "factor": ("FLOAT", { "default": 1.0, "min": 0.0, "max": 1.0, "step": 0.05, }),
+ "device": (["auto", "cpu", "gpu"],),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image processing"
+
+ def execute(self, image, reference, method, factor, device):
+ if "gpu" == device:
+ device = comfy.model_management.get_torch_device()
+ elif "auto" == device:
+ device = comfy.model_management.intermediate_device()
+ else:
+ device = 'cpu'
+
+ if "pytorch" in method:
+ from .histogram_matching import Histogram_Matching
+
+ image = image.permute([0, 3, 1, 2]).to(device)
+ reference = reference.permute([0, 3, 1, 2]).to(device)[0].unsqueeze(0)
+ image.requires_grad = True
+ reference.requires_grad = True
+
+ out = []
+
+ for i in image:
+ i = i.unsqueeze(0)
+ hm = Histogram_Matching(differentiable=True)
+ out.append(hm(i, reference))
+ out = torch.cat(out, dim=0)
+ out = factor * out + (1 - factor) * image
+ out = out.permute([0, 2, 3, 1]).clamp(0, 1)
+ else:
+ from skimage.exposure import match_histograms
+
+ out = torch.from_numpy(match_histograms(image.cpu().numpy(), reference.cpu().numpy(), channel_axis=3)).to(device)
+ out = factor * out + (1 - factor) * image.to(device)
+
+ return (out.to(comfy.model_management.intermediate_device()),)
+
+"""
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+class ImageToDevice:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "device": (["auto", "cpu", "gpu"],),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image utils"
+
+ def execute(self, image, device):
+ if "gpu" == device:
+ device = comfy.model_management.get_torch_device()
+ elif "auto" == device:
+ device = comfy.model_management.intermediate_device()
+ else:
+ device = 'cpu'
+
+ image = image.clone().to(device)
+ torch.cuda.empty_cache()
+
+ return (image,)
+
+class GetImageSize:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ }
+ }
+
+ RETURN_TYPES = ("INT", "INT", "INT",)
+ RETURN_NAMES = ("width", "height", "count")
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image utils"
+
+ def execute(self, image):
+ return (image.shape[2], image.shape[1], image.shape[0])
+
+class ImageRemoveAlpha:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ },
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image utils"
+
+ def execute(self, image):
+ if image.shape[3] == 4:
+ image = image[..., :3]
+ return (image,)
+
+class ImagePreviewFromLatent(SaveImage):
+ def __init__(self):
+ self.output_dir = folder_paths.get_temp_directory()
+ self.type = "temp"
+ self.prefix_append = "_temp_" + ''.join(random.choice("abcdefghijklmnopqrstuvwxyz") for x in range(5))
+ self.compress_level = 1
+
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "latent": ("LATENT",),
+ "vae": ("VAE", ),
+ "tile_size": ("INT", {"default": 0, "min": 0, "max": 4096, "step": 64})
+ }, "optional": {
+ "image": (["none"], {"image_upload": False}),
+ }, "hidden": {
+ "prompt": "PROMPT",
+ "extra_pnginfo": "EXTRA_PNGINFO",
+ },
+ }
+
+ RETURN_TYPES = ("IMAGE", "MASK", "INT", "INT",)
+ RETURN_NAMES = ("IMAGE", "MASK", "width", "height",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image utils"
+
+ def execute(self, latent, vae, tile_size, prompt=None, extra_pnginfo=None, image=None, filename_prefix="ComfyUI"):
+ mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
+ ui = None
+
+ if image is not None and image.startswith("clipspace"):
+ image_path = folder_paths.get_annotated_filepath(image)
+ if not os.path.exists(image_path):
+ raise ValueError(f"Clipspace image does not exist anymore, select 'none' in the image field.")
+
+ img = pillow(Image.open, image_path)
+ img = pillow(ImageOps.exif_transpose, img)
+ if img.mode == "I":
+ img = img.point(lambda i: i * (1 / 255))
+ image = img.convert("RGB")
+ image = np.array(image).astype(np.float32) / 255.0
+ image = torch.from_numpy(image)[None,]
+ if "A" in img.getbands():
+ mask = np.array(img.getchannel('A')).astype(np.float32) / 255.0
+ mask = 1. - torch.from_numpy(mask)
+ ui = {
+ "filename": os.path.basename(image_path),
+ "subfolder": os.path.dirname(image_path),
+ "type": "temp",
+ }
+ else:
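+ # no clipspace image selected: decode the latent (tiled decoding lowers VRAM usage for large latents)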
+ if tile_size > 0:
+ tile_size = max(tile_size, 320)
+ image = vae.decode_tiled(latent["samples"], tile_x=tile_size // 8, tile_y=tile_size // 8, )
+ else:
+ image = vae.decode(latent["samples"])
+ ui = self.save_images(image, filename_prefix, prompt, extra_pnginfo)
+
+ out = {**ui, "result": (image, mask, image.shape[2], image.shape[1],)}
+ return out
+
+class NoiseFromImage:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE",),
+ "noise_strenght": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01 }),
+ "noise_size": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01 }),
+ "color_noise": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.01 }),
+ "mask_strength": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01 }),
+ "mask_scale_diff": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01 }),
+ "mask_contrast": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.1 }),
+ "saturation": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 100.0, "step": 0.1 }),
+ "contrast": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.1 }),
+ "blur": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.1 }),
+ },
+ "optional": {
+ "noise_mask": ("IMAGE",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/image utils"
+
+ def execute(self, image, noise_size, color_noise, mask_strength, mask_scale_diff, mask_contrast, noise_strenght, saturation, contrast, blur, noise_mask=None):
+ torch.manual_seed(0)
+
+ elastic_alpha = max(image.shape[1], image.shape[2])# * noise_size
+ elastic_sigma = elastic_alpha / 400 * noise_size
+
+ blur_size = int(6 * blur+1)
+ if blur_size % 2 == 0:
+ blur_size+= 1
+
+ if noise_mask is None:
+ noise_mask = image
+
+ # increase contrast of the mask
+ if mask_contrast != 1:
+ noise_mask = T.ColorJitter(contrast=(mask_contrast,mask_contrast))(noise_mask.permute([0, 3, 1, 2])).permute([0, 2, 3, 1])
+
+ # Ensure noise mask is the same size as the image
+ if noise_mask.shape[1:] != image.shape[1:]:
+ noise_mask = F.interpolate(noise_mask.permute([0, 3, 1, 2]), size=(image.shape[1], image.shape[2]), mode='bicubic', align_corners=False)
+ noise_mask = noise_mask.permute([0, 2, 3, 1])
+ # Ensure we have the same number of masks and images
+ if noise_mask.shape[0] > image.shape[0]:
+ noise_mask = noise_mask[:image.shape[0]]
+ else:
+ noise_mask = torch.cat((noise_mask, noise_mask[-1:].repeat((image.shape[0]-noise_mask.shape[0], 1, 1, 1))), dim=0)
+
+ # Convert mask to grayscale mask
+ noise_mask = noise_mask.mean(dim=3).unsqueeze(-1)
+
+ # add color noise
+ imgs = image.clone().permute([0, 3, 1, 2])
+ if color_noise > 0:
+ color_noise = torch.normal(torch.zeros_like(imgs), std=color_noise)
+ color_noise *= (imgs - imgs.min()) / (imgs.max() - imgs.min())
+
+ imgs = imgs + color_noise
+ imgs = imgs.clamp(0, 1)
+
+ # create fine and coarse noise
+ fine_noise = []
+ for n in imgs:
+ avg_color = n.mean(dim=[1,2])
+
+ tmp_noise = T.ElasticTransform(alpha=elastic_alpha, sigma=elastic_sigma, fill=avg_color.tolist())(n)
+ if blur > 0:
+ tmp_noise = T.GaussianBlur(blur_size, blur)(tmp_noise)
+ tmp_noise = T.ColorJitter(contrast=(contrast,contrast), saturation=(saturation,saturation))(tmp_noise)
+ fine_noise.append(tmp_noise)
+
+ del imgs
+
+ fine_noise = torch.stack(fine_noise, dim=0)
+ fine_noise = fine_noise.permute([0, 2, 3, 1])
+ #fine_noise = torch.stack(fine_noise, dim=0)
+ #fine_noise = pb(fine_noise)
+ mask_scale_diff = min(mask_scale_diff, 0.99)
+ if mask_scale_diff > 0:
+ coarse_noise = F.interpolate(fine_noise.permute([0, 3, 1, 2]), scale_factor=1-mask_scale_diff, mode='area')
+ coarse_noise = F.interpolate(coarse_noise, size=(fine_noise.shape[1], fine_noise.shape[2]), mode='bilinear', align_corners=False)
+ coarse_noise = coarse_noise.permute([0, 2, 3, 1])
+ else:
+ coarse_noise = fine_noise
+
+ output = (1 - noise_mask) * coarse_noise + noise_mask * fine_noise
+
+ if mask_strength < 1:
+ noise_mask = noise_mask.pow(mask_strength)
+ noise_mask = torch.nan_to_num(noise_mask).clamp(0, 1)
+ output = noise_mask * output + (1 - noise_mask) * image
+
+ # apply noise to image
+ output = output * noise_strenght + image * (1 - noise_strenght)
+ output = output.clamp(0, 1)
+
+ return (output, )
+
+IMAGE_CLASS_MAPPINGS = {
+ # Image analysis
+ "ImageEnhanceDifference+": ImageEnhanceDifference,
+
+ # Image batch
+ "ImageBatchMultiple+": ImageBatchMultiple,
+ "ImageExpandBatch+": ImageExpandBatch,
+ "ImageFromBatch+": ImageFromBatch,
+ "ImageListToBatch+": ImageListToBatch,
+ "ImageBatchToList+": ImageBatchToList,
+
+ # Image manipulation
+ "ImageCompositeFromMaskBatch+": ImageCompositeFromMaskBatch,
+ "ImageComposite+": ImageComposite,
+ "ImageCrop+": ImageCrop,
+ "ImageFlip+": ImageFlip,
+ "ImageRandomTransform+": ImageRandomTransform,
+ "ImageRemoveAlpha+": ImageRemoveAlpha,
+ "ImageRemoveBackground+": ImageRemoveBackground,
+ "ImageResize+": ImageResize,
+ "ImageSeamCarving+": ImageSeamCarving,
+ "ImageTile+": ImageTile,
+ "ImageUntile+": ImageUntile,
+ "RemBGSession+": RemBGSession,
+ "TransparentBGSession+": TransparentBGSession,
+
+ # Image processing
+ "ImageApplyLUT+": ImageApplyLUT,
+ "ImageCASharpening+": ImageCAS,
+ "ImageDesaturate+": ImageDesaturate,
+ "PixelOEPixelize+": PixelOEPixelize,
+ "ImagePosterize+": ImagePosterize,
+ "ImageColorMatch+": ImageColorMatch,
+ "ImageColorMatchAdobe+": ImageColorMatchAdobe,
+ "ImageHistogramMatch+": ImageHistogramMatch,
+ "ImageSmartSharpen+": ImageSmartSharpen,
+
+ # Utilities
+ "GetImageSize+": GetImageSize,
+ "ImageToDevice+": ImageToDevice,
+ "ImagePreviewFromLatent+": ImagePreviewFromLatent,
+ "NoiseFromImage+": NoiseFromImage,
+ #"ExtractKeyframes+": ExtractKeyframes,
+}
+
+IMAGE_NAME_MAPPINGS = {
+ # Image analysis
+ "ImageEnhanceDifference+": "🔧 Image Enhance Difference",
+
+ # Image batch
+ "ImageBatchMultiple+": "🔧 Images Batch Multiple",
+ "ImageExpandBatch+": "🔧 Image Expand Batch",
+ "ImageFromBatch+": "🔧 Image From Batch",
+ "ImageListToBatch+": "🔧 Image List To Batch",
+ "ImageBatchToList+": "🔧 Image Batch To List",
+
+ # Image manipulation
+ "ImageCompositeFromMaskBatch+": "🔧 Image Composite From Mask Batch",
+ "ImageComposite+": "🔧 Image Composite",
+ "ImageCrop+": "🔧 Image Crop",
+ "ImageFlip+": "🔧 Image Flip",
+ "ImageRandomTransform+": "🔧 Image Random Transform",
+ "ImageRemoveAlpha+": "🔧 Image Remove Alpha",
+ "ImageRemoveBackground+": "🔧 Image Remove Background",
+ "ImageResize+": "🔧 Image Resize",
+ "ImageSeamCarving+": "🔧 Image Seam Carving",
+ "ImageTile+": "🔧 Image Tile",
+ "ImageUntile+": "🔧 Image Untile",
+ "RemBGSession+": "🔧 RemBG Session",
+ "TransparentBGSession+": "🔧 InSPyReNet TransparentBG",
+
+ # Image processing
+ "ImageApplyLUT+": "🔧 Image Apply LUT",
+ "ImageCASharpening+": "🔧 Image Contrast Adaptive Sharpening",
+ "ImageDesaturate+": "🔧 Image Desaturate",
+ "PixelOEPixelize+": "🔧 Pixelize",
+ "ImagePosterize+": "🔧 Image Posterize",
+ "ImageColorMatch+": "🔧 Image Color Match",
+ "ImageColorMatchAdobe+": "🔧 Image Color Match Adobe",
+ "ImageHistogramMatch+": "🔧 Image Histogram Match",
+ "ImageSmartSharpen+": "🔧 Image Smart Sharpen",
+
+ # Utilities
+ "GetImageSize+": "🔧 Get Image Size",
+ "ImageToDevice+": "🔧 Image To Device",
+ "ImagePreviewFromLatent+": "🔧 Image Preview From Latent",
+ "NoiseFromImage+": "🔧 Noise From Image",
+}
diff --git a/ComfyUI_essentials/js/DisplayAny.js b/ComfyUI_essentials/js/DisplayAny.js
new file mode 100644
index 0000000000000000000000000000000000000000..ae7445ff56184a6156d5012522ecad650e62aee0
--- /dev/null
+++ b/ComfyUI_essentials/js/DisplayAny.js
@@ -0,0 +1,36 @@
+import { app } from "../../scripts/app.js";
+import { ComfyWidgets } from "../../scripts/widgets.js";
+
+app.registerExtension({
+ name: "essentials.DisplayAny",
+ async beforeRegisterNodeDef(nodeType, nodeData, app) {
+ if (!nodeData?.category?.startsWith("essentials")) {
+ return;
+ }
+
+ if (nodeData.name === "DisplayAny") {
+ const onExecuted = nodeType.prototype.onExecuted;
+
+ nodeType.prototype.onExecuted = function (message) {
+ onExecuted?.apply(this, arguments);
+
+ if (this.widgets) {
+ for (let i = 1; i < this.widgets.length; i++) {
+ this.widgets[i].onRemove?.();
+ }
+ this.widgets.length = 1;
+ }
+
+ // Check if the "text" widget already exists.
+ let textWidget = this.widgets && this.widgets.find(w => w.name === "displaytext");
+ if (!textWidget) {
+ textWidget = ComfyWidgets["STRING"](this, "displaytext", ["STRING", { multiline: true }], app).widget;
+ textWidget.inputEl.readOnly = true;
+ textWidget.inputEl.style.border = "none";
+ textWidget.inputEl.style.backgroundColor = "transparent";
+ }
+ textWidget.value = message["text"].join("");
+ };
+ }
+ },
+});
\ No newline at end of file
diff --git a/ComfyUI_essentials/js/FluxAttentionSeeker.js b/ComfyUI_essentials/js/FluxAttentionSeeker.js
new file mode 100644
index 0000000000000000000000000000000000000000..d8829001db6f2c677e81d2a19ab2220b79eb2662
--- /dev/null
+++ b/ComfyUI_essentials/js/FluxAttentionSeeker.js
@@ -0,0 +1,133 @@
+import { app } from "../../scripts/app.js";
+
+app.registerExtension({
+ name: "essentials.FluxAttentionSeeker",
+ async beforeRegisterNodeDef(nodeType, nodeData, app) {
+ if (!nodeData?.category?.startsWith("essentials")) {
+ return;
+ }
+
+ if (nodeData.name === "FluxAttentionSeeker+") {
+ const onCreated = nodeType.prototype.onNodeCreated;
+
+ nodeType.prototype.onNodeCreated = function () {
+ this.addWidget("button", "RESET ALL", null, () => {
+ this.widgets.forEach(w => {
+ if (w.type === "slider") {
+ w.value = 1.0;
+ }
+ });
+ });
+
+ this.addWidget("button", "ZERO ALL", null, () => {
+ this.widgets.forEach(w => {
+ if (w.type === "slider") {
+ w.value = 0.0;
+ }
+ });
+ });
+
+ this.addWidget("button", "REPEAT FIRST", null, () => {
+ var clip_value = undefined;
+ var t5_value = undefined;
+ this.widgets.forEach(w => {
+ if (w.name.startsWith('clip_l')) {
+ if (clip_value === undefined) {
+ clip_value = w.value;
+ }
+ w.value = clip_value;
+ } else if (w.name.startsWith('t5')) {
+ if (t5_value === undefined) {
+ t5_value = w.value;
+ }
+ w.value = t5_value;
+ }
+ });
+ });
+ };
+ }
+ },
+});
+
+app.registerExtension({
+ name: "essentials.SD3AttentionSeekerLG",
+ async beforeRegisterNodeDef(nodeType, nodeData, app) {
+ if (!nodeData?.category?.startsWith("essentials")) {
+ return;
+ }
+
+ if (nodeData.name === "SD3AttentionSeekerLG+") {
+ const onCreated = nodeType.prototype.onNodeCreated;
+
+ nodeType.prototype.onNodeCreated = function () {
+ this.addWidget("button", "RESET L", null, () => {
+ this.widgets.forEach(w => {
+ if (w.type === "slider" && w.name.startsWith('clip_l')) {
+ w.value = 1.0;
+ }
+ });
+ });
+ this.addWidget("button", "RESET G", null, () => {
+ this.widgets.forEach(w => {
+ if (w.type === "slider" && w.name.startsWith('clip_g')) {
+ w.value = 1.0;
+ }
+ });
+ });
+
+ this.addWidget("button", "REPEAT FIRST", null, () => {
+ var clip_l_value = undefined;
+ var clip_g_value = undefined;
+ this.widgets.forEach(w => {
+ if (w.name.startsWith('clip_l')) {
+ if (clip_l_value === undefined) {
+ clip_l_value = w.value;
+ }
+ w.value = clip_l_value;
+ } else if (w.name.startsWith('clip_g')) {
+ if (clip_g_value === undefined) {
+ clip_g_value = w.value;
+ }
+ w.value = clip_g_value;
+ }
+ });
+ });
+ };
+ }
+ },
+});
+
+app.registerExtension({
+ name: "essentials.SD3AttentionSeekerT5",
+ async beforeRegisterNodeDef(nodeType, nodeData, app) {
+ if (!nodeData?.category?.startsWith("essentials")) {
+ return;
+ }
+
+ if (nodeData.name === "SD3AttentionSeekerT5+") {
+ const onCreated = nodeType.prototype.onNodeCreated;
+
+ nodeType.prototype.onNodeCreated = function () {
+ this.addWidget("button", "RESET ALL", null, () => {
+ this.widgets.forEach(w => {
+ if (w.type === "slider") {
+ w.value = 1.0;
+ }
+ });
+ });
+
+ this.addWidget("button", "REPEAT FIRST", null, () => {
+ var t5_value = undefined;
+ this.widgets.forEach(w => {
+ if (w.name.startsWith('t5')) {
+ if (t5_value === undefined) {
+ t5_value = w.value;
+ }
+ w.value = t5_value;
+ }
+ });
+ });
+ };
+ }
+ },
+});
\ No newline at end of file
diff --git a/ComfyUI_essentials/luts/put_luts_files_here.txt b/ComfyUI_essentials/luts/put_luts_files_here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ComfyUI_essentials/mask.py b/ComfyUI_essentials/mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..48f92436a193bb3784a68e743b90d25ffa436eb8
--- /dev/null
+++ b/ComfyUI_essentials/mask.py
@@ -0,0 +1,596 @@
+from nodes import SaveImage
+import torch
+import torchvision.transforms.v2 as T
+import random
+import folder_paths
+import comfy.utils
+from .image import ImageExpandBatch
+from .utils import AnyType
+import numpy as np
+import scipy
+from PIL import Image
+from nodes import MAX_RESOLUTION
+import math
+
+any = AnyType("*")
+
+class MaskBlur:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask": ("MASK",),
+ "amount": ("INT", { "default": 6, "min": 0, "max": 256, "step": 1, }),
+ "device": (["auto", "cpu", "gpu"],),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, mask, amount, device):
+ if amount == 0:
+ return (mask,)
+
+ if "gpu" == device:
+ mask = mask.to(comfy.model_management.get_torch_device())
+ elif "cpu" == device:
+ mask = mask.to('cpu')
+
+ if amount % 2 == 0:
+ amount+= 1
+
+ if mask.dim() == 2:
+ mask = mask.unsqueeze(0)
+
+ mask = T.functional.gaussian_blur(mask.unsqueeze(1), amount).squeeze(1)
+
+ if "gpu" == device or "cpu" == device:
+ mask = mask.to(comfy.model_management.intermediate_device())
+
+ return(mask,)
+
+class MaskFlip:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask": ("MASK",),
+ "axis": (["x", "y", "xy"],),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, mask, axis):
+ if mask.dim() == 2:
+ mask = mask.unsqueeze(0)
+
+ dim = ()
+ if "y" in axis:
+ dim += (1,)
+ if "x" in axis:
+ dim += (2,)
+ mask = torch.flip(mask, dims=dim)
+
+ return(mask,)
+
+class MaskPreview(SaveImage):
+ def __init__(self):
+ self.output_dir = folder_paths.get_temp_directory()
+ self.type = "temp"
+ self.prefix_append = "_temp_" + ''.join(random.choice("abcdefghijklmnopqrstuvwxyz") for x in range(5))
+ self.compress_level = 4
+
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {"mask": ("MASK",), },
+ "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"},
+ }
+
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, mask, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
+ preview = mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])).movedim(1, -1).expand(-1, -1, -1, 3)
+ return self.save_images(preview, filename_prefix, prompt, extra_pnginfo)
+
+class MaskBatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask1": ("MASK",),
+ "mask2": ("MASK",),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask batch"
+
+ def execute(self, mask1, mask2):
+ if mask1.shape[1:] != mask2.shape[1:]:
+ mask2 = comfy.utils.common_upscale(mask2.unsqueeze(1).expand(-1,3,-1,-1), mask1.shape[2], mask1.shape[1], upscale_method='bicubic', crop='center')[:,0,:,:]
+
+ return (torch.cat((mask1, mask2), dim=0),)
+
+class MaskExpandBatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask": ("MASK",),
+ "size": ("INT", { "default": 16, "min": 1, "step": 1, }),
+ "method": (["expand", "repeat all", "repeat first", "repeat last"],)
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask batch"
+
+ def execute(self, mask, size, method):
+ return (ImageExpandBatch().execute(mask.unsqueeze(1).expand(-1,3,-1,-1), size, method)[0][:,0,:,:],)
+
+
+class MaskBoundingBox:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask": ("MASK",),
+ "padding": ("INT", { "default": 0, "min": 0, "max": 4096, "step": 1, }),
+ "blur": ("INT", { "default": 0, "min": 0, "max": 256, "step": 1, }),
+ },
+ "optional": {
+ "image_optional": ("IMAGE",),
+ }
+ }
+
+ RETURN_TYPES = ("MASK", "IMAGE", "INT", "INT", "INT", "INT")
+ RETURN_NAMES = ("MASK", "IMAGE", "x", "y", "width", "height")
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, mask, padding, blur, image_optional=None):
+ if mask.dim() == 2:
+ mask = mask.unsqueeze(0)
+
+ if image_optional is None:
+ image_optional = mask.unsqueeze(3).repeat(1, 1, 1, 3)
+
+ # resize the image if it's not the same size as the mask
+ if image_optional.shape[1:] != mask.shape[1:]:
+ image_optional = comfy.utils.common_upscale(image_optional.permute([0,3,1,2]), mask.shape[2], mask.shape[1], upscale_method='bicubic', crop='center').permute([0,2,3,1])
+
+ # match batch size
+ if image_optional.shape[0] < mask.shape[0]:
+ image_optional = torch.cat((image_optional, image_optional[-1].unsqueeze(0).repeat(mask.shape[0]-image_optional.shape[0], 1, 1, 1)), dim=0)
+ elif image_optional.shape[0] > mask.shape[0]:
+ image_optional = image_optional[:mask.shape[0]]
+
+ # blur the mask
+ if blur > 0:
+ if blur % 2 == 0:
+ blur += 1
+ mask = T.functional.gaussian_blur(mask.unsqueeze(1), blur).squeeze(1)
+
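+ # bounding box of all non-zero mask pixels, expanded by the padding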
+ _, y, x = torch.where(mask)
+ x1 = max(0, x.min().item() - padding)
+ x2 = min(mask.shape[2], x.max().item() + 1 + padding)
+ y1 = max(0, y.min().item() - padding)
+ y2 = min(mask.shape[1], y.max().item() + 1 + padding)
+
+ # crop the mask
+ mask = mask[:, y1:y2, x1:x2]
+ image_optional = image_optional[:, y1:y2, x1:x2, :]
+
+ return (mask, image_optional, x1, y1, x2 - x1, y2 - y1)
+
+
+class MaskFromColor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE", ),
+ "red": ("INT", { "default": 255, "min": 0, "max": 255, "step": 1, }),
+ "green": ("INT", { "default": 255, "min": 0, "max": 255, "step": 1, }),
+ "blue": ("INT", { "default": 255, "min": 0, "max": 255, "step": 1, }),
+ "threshold": ("INT", { "default": 0, "min": 0, "max": 127, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, image, red, green, blue, threshold):
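+ # keep pixels whose RGB values fall within +/- threshold of the target color on every channel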
+ temp = (torch.clamp(image, 0, 1.0) * 255.0).round().to(torch.int)
+ color = torch.tensor([red, green, blue])
+ lower_bound = (color - threshold).clamp(min=0)
+ upper_bound = (color + threshold).clamp(max=255)
+ lower_bound = lower_bound.view(1, 1, 1, 3)
+ upper_bound = upper_bound.view(1, 1, 1, 3)
+ mask = (temp >= lower_bound) & (temp <= upper_bound)
+ mask = mask.all(dim=-1)
+ mask = mask.float()
+
+ return (mask, )
+
+
+class MaskFromSegmentation:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE", ),
+ "segments": ("INT", { "default": 6, "min": 1, "max": 16, "step": 1, }),
+ "remove_isolated_pixels": ("INT", { "default": 0, "min": 0, "max": 32, "step": 1, }),
+ "remove_small_masks": ("FLOAT", { "default": 0.0, "min": 0., "max": 1., "step": 0.01, }),
+ "fill_holes": ("BOOLEAN", { "default": False }),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, image, segments, remove_isolated_pixels, fill_holes, remove_small_masks):
+ im = image[0] # we only work on the first image in the batch
+ im = Image.fromarray((im * 255).to(torch.uint8).cpu().numpy(), mode="RGB")
+ im = im.quantize(palette=im.quantize(colors=segments), dither=Image.Dither.NONE)
+ im = torch.tensor(np.array(im.convert("RGB"))).float() / 255.0
+
+ colors = im.reshape(-1, im.shape[-1])
+ colors = torch.unique(colors, dim=0)
+
+ masks = []
+ for color in colors:
+ mask = (im == color).all(dim=-1).float()
+ # remove isolated pixels
+ if remove_isolated_pixels > 0:
+ mask = torch.from_numpy(scipy.ndimage.binary_opening(mask.cpu().numpy(), structure=np.ones((remove_isolated_pixels, remove_isolated_pixels))))
+
+ # fill holes
+ if fill_holes:
+ mask = torch.from_numpy(scipy.ndimage.binary_fill_holes(mask.cpu().numpy()))
+
+ # if the mask is too small, it's probably noise
+ if mask.sum() / (mask.shape[0]*mask.shape[1]) > remove_small_masks:
+ masks.append(mask)
+
+ if masks == []:
+ masks.append(torch.zeros_like(im)[:,:,0]) # return an empty mask if no masks were found, prevents errors
+
+ mask = torch.stack(masks, dim=0).float()
+
+ return (mask, )
+
+
+class MaskFix:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask": ("MASK",),
+ "erode_dilate": ("INT", { "default": 0, "min": -256, "max": 256, "step": 1, }),
+ "fill_holes": ("INT", { "default": 0, "min": 0, "max": 128, "step": 1, }),
+ "remove_isolated_pixels": ("INT", { "default": 0, "min": 0, "max": 32, "step": 1, }),
+ "smooth": ("INT", { "default": 0, "min": 0, "max": 256, "step": 1, }),
+ "blur": ("INT", { "default": 0, "min": 0, "max": 256, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, mask, erode_dilate, smooth, remove_isolated_pixels, blur, fill_holes):
+ masks = []
+ for m in mask:
+ # erode and dilate
+ if erode_dilate != 0:
+ if erode_dilate < 0:
+ m = torch.from_numpy(scipy.ndimage.grey_erosion(m.cpu().numpy(), size=(-erode_dilate, -erode_dilate)))
+ else:
+ m = torch.from_numpy(scipy.ndimage.grey_dilation(m.cpu().numpy(), size=(erode_dilate, erode_dilate)))
+
+ # fill holes
+ if fill_holes > 0:
+ #m = torch.from_numpy(scipy.ndimage.binary_fill_holes(m.cpu().numpy(), structure=np.ones((fill_holes,fill_holes)))).float()
+ m = torch.from_numpy(scipy.ndimage.grey_closing(m.cpu().numpy(), size=(fill_holes, fill_holes)))
+
+ # remove isolated pixels
+ if remove_isolated_pixels > 0:
+ m = torch.from_numpy(scipy.ndimage.grey_opening(m.cpu().numpy(), size=(remove_isolated_pixels, remove_isolated_pixels)))
+
+ # smooth the mask
+ if smooth > 0:
+ if smooth % 2 == 0:
+ smooth += 1
+ m = T.functional.gaussian_blur((m > 0.5).unsqueeze(0), smooth).squeeze(0)
+
+ # blur the mask
+ if blur > 0:
+ if blur % 2 == 0:
+ blur += 1
+ m = T.functional.gaussian_blur(m.float().unsqueeze(0), blur).squeeze(0)
+
+ masks.append(m.float())
+
+ masks = torch.stack(masks, dim=0).float()
+
+ return (masks, )
+
+class MaskSmooth:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask": ("MASK",),
+ "amount": ("INT", { "default": 0, "min": 0, "max": 127, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, mask, amount):
+ if amount == 0:
+ return (mask,)
+
+ if amount % 2 == 0:
+ amount += 1
+
+ mask = mask > 0.5
+ mask = T.functional.gaussian_blur(mask.unsqueeze(1), amount).squeeze(1).float()
+
+ return (mask,)
+
+class MaskFromBatch:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "mask": ("MASK", ),
+ "start": ("INT", { "default": 0, "min": 0, "step": 1, }),
+ "length": ("INT", { "default": 1, "min": 1, "step": 1, }),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask batch"
+
+ def execute(self, mask, start, length):
+ if length > mask.shape[0]:
+ length = mask.shape[0]
+
+ start = min(start, mask.shape[0]-1)
+ length = min(mask.shape[0]-start, length)
+ return (mask[start:start + length], )
+
+class MaskFromList:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "width": ("INT", { "default": 32, "min": 0, "max": MAX_RESOLUTION, "step": 8, }),
+ "height": ("INT", { "default": 32, "min": 0, "max": MAX_RESOLUTION, "step": 8, }),
+ }, "optional": {
+ "values": (any, { "default": 0.0, "min": 0.0, "max": 1.0, }),
+ "str_values": ("STRING", { "default": "", "multiline": True, "placeholder": "0.0, 0.5, 1.0",}),
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, width, height, values=None, str_values=""):
+ out = []
+
+ if values is not None:
+ if not isinstance(values, list):
+ out = [values]
+ else:
+ out.extend([float(v) for v in values])
+
+ if str_values != "":
+ str_values = [float(v) for v in str_values.split(",")]
+ out.extend(str_values)
+
+ if out == []:
+ raise ValueError("No values provided")
+
+ out = torch.tensor(out).float().clamp(0.0, 1.0)
+ out = out.view(-1, 1, 1).expand(-1, height, width)
+
+
+ return (out, )
+
+class MaskFromRGBCMYBW:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "image": ("IMAGE", ),
+ "threshold_r": ("FLOAT", { "default": 0.15, "min": 0.0, "max": 1, "step": 0.01, }),
+ "threshold_g": ("FLOAT", { "default": 0.15, "min": 0.0, "max": 1, "step": 0.01, }),
+ "threshold_b": ("FLOAT", { "default": 0.15, "min": 0.0, "max": 1, "step": 0.01, }),
+ }
+ }
+
+ RETURN_TYPES = ("MASK","MASK","MASK","MASK","MASK","MASK","MASK","MASK",)
+ RETURN_NAMES = ("red","green","blue","cyan","magenta","yellow","black","white",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
+ def execute(self, image, threshold_r, threshold_g, threshold_b):
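+ # classify each pixel by thresholding the RGB channels: primaries require one high channel, secondaries two, black none and white all three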
+ red = ((image[..., 0] >= 1-threshold_r) & (image[..., 1] < threshold_g) & (image[..., 2] < threshold_b)).float()
+ green = ((image[..., 0] < threshold_r) & (image[..., 1] >= 1-threshold_g) & (image[..., 2] < threshold_b)).float()
+ blue = ((image[..., 0] < threshold_r) & (image[..., 1] < threshold_g) & (image[..., 2] >= 1-threshold_b)).float()
+
+ cyan = ((image[..., 0] < threshold_r) & (image[..., 1] >= 1-threshold_g) & (image[..., 2] >= 1-threshold_b)).float()
+ magenta = ((image[..., 0] >= 1-threshold_r) & (image[..., 1] < threshold_g) & (image[..., 2] >= 1-threshold_b)).float()
+ yellow = ((image[..., 0] >= 1-threshold_r) & (image[..., 1] >= 1-threshold_g) & (image[..., 2] < threshold_b)).float()
+
+ black = ((image[..., 0] <= threshold_r) & (image[..., 1] <= threshold_g) & (image[..., 2] <= threshold_b)).float()
+ white = ((image[..., 0] >= 1-threshold_r) & (image[..., 1] >= 1-threshold_g) & (image[..., 2] >= 1-threshold_b)).float()
+
+ return (red, green, blue, cyan, magenta, yellow, black, white,)
+
+class TransitionMask:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "width": ("INT", { "default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1, }),
+ "height": ("INT", { "default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1, }),
+ "frames": ("INT", { "default": 16, "min": 1, "max": 9999, "step": 1, }),
+ "start_frame": ("INT", { "default": 0, "min": 0, "step": 1, }),
+ "end_frame": ("INT", { "default": 9999, "min": 0, "step": 1, }),
+ "transition_type": (["horizontal slide", "vertical slide", "horizontal bar", "vertical bar", "center box", "horizontal door", "vertical door", "circle", "fade"],),
+ "timing_function": (["linear", "in", "out", "in-out"],)
+ }
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/mask"
+
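+ # easing functions mapping frame index i in [0, t] to a progress value in [0, 1]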
+ def linear(self, i, t):
+ return i/t
+ def ease_in(self, i, t):
+ return pow(i/t, 2)
+ def ease_out(self, i, t):
+ return 1 - pow(1 - i/t, 2)
+ def ease_in_out(self, i, t):
+ if i < t/2:
+ return pow(i/(t/2), 2) / 2
+ else:
+ return 1 - pow(1 - (i - t/2)/(t/2), 2) / 2
+
+ def execute(self, width, height, frames, start_frame, end_frame, transition_type, timing_function):
+ if timing_function == 'in':
+ timing_function = self.ease_in
+ elif timing_function == 'out':
+ timing_function = self.ease_out
+ elif timing_function == 'in-out':
+ timing_function = self.ease_in_out
+ else:
+ timing_function = self.linear
+
+ out = []
+
+ end_frame = min(frames, end_frame)
+ transition = end_frame - start_frame
+
+ if start_frame > 0:
+ out = out + [torch.full((height, width), 0.0, dtype=torch.float32, device="cpu")] * start_frame
+
+ for i in range(transition):
+ frame = torch.full((height, width), 0.0, dtype=torch.float32, device="cpu")
+ progress = timing_function(i, transition-1) if transition > 1 else 1.0 # guard against division by zero on single-frame transitions
+
+ if "horizontal slide" in transition_type:
+ pos = round(width*progress)
+ frame[:, :pos] = 1.0
+ elif "vertical slide" in transition_type:
+ pos = round(height*progress)
+ frame[:pos, :] = 1.0
+ elif "box" in transition_type:
+ box_w = round(width*progress)
+ box_h = round(height*progress)
+ x1 = (width - box_w) // 2
+ y1 = (height - box_h) // 2
+ x2 = x1 + box_w
+ y2 = y1 + box_h
+ frame[y1:y2, x1:x2] = 1.0
+ elif "circle" in transition_type:
+ radius = math.ceil(math.sqrt(pow(width,2)+pow(height,2))*progress/2)
+ c_x = width // 2
+ c_y = height // 2
+ # build a coordinate grid and set every pixel within the expanding radius to 1.0
+ x = torch.arange(0, width, dtype=torch.float32, device="cpu")
+ y = torch.arange(0, height, dtype=torch.float32, device="cpu")
+ y, x = torch.meshgrid((y, x), indexing="ij")
+ circle = ((x - c_x) ** 2 + (y - c_y) ** 2) <= (radius ** 2)
+ frame[circle] = 1.0
+ elif "horizontal bar" in transition_type:
+ bar = round(height*progress)
+ y1 = (height - bar) // 2
+ y2 = y1 + bar
+ frame[y1:y2, :] = 1.0
+ elif "vertical bar" in transition_type:
+ bar = round(width*progress)
+ x1 = (width - bar) // 2
+ x2 = x1 + bar
+ frame[:, x1:x2] = 1.0
+ elif "horizontal door" in transition_type:
+ bar = math.ceil(height*progress/2)
+ if bar > 0:
+ frame[:bar, :] = 1.0
+ frame[-bar:, :] = 1.0
+ elif "vertical door" in transition_type:
+ bar = math.ceil(width*progress/2)
+ if bar > 0:
+ frame[:, :bar] = 1.0
+ frame[:, -bar:] = 1.0
+ elif "fade" in transition_type:
+ frame[:,:] = progress
+
+ out.append(frame)
+
+ if end_frame < frames:
+ out = out + [torch.full((height, width), 1.0, dtype=torch.float32, device="cpu")] * (frames - end_frame)
+
+ out = torch.stack(out, dim=0)
+
+ return (out, )
+
+MASK_CLASS_MAPPINGS = {
+ "MaskBlur+": MaskBlur,
+ "MaskBoundingBox+": MaskBoundingBox,
+ "MaskFix+": MaskFix,
+ "MaskFlip+": MaskFlip,
+ "MaskFromColor+": MaskFromColor,
+ "MaskFromList+": MaskFromList,
+ "MaskFromRGBCMYBW+": MaskFromRGBCMYBW,
+ "MaskFromSegmentation+": MaskFromSegmentation,
+ "MaskPreview+": MaskPreview,
+ "MaskSmooth+": MaskSmooth,
+ "TransitionMask+": TransitionMask,
+
+ # Batch
+ "MaskBatch+": MaskBatch,
+ "MaskExpandBatch+": MaskExpandBatch,
+ "MaskFromBatch+": MaskFromBatch,
+}
+
+MASK_NAME_MAPPINGS = {
+ "MaskBlur+": "🔧 Mask Blur",
+ "MaskFix+": "🔧 Mask Fix",
+ "MaskFlip+": "🔧 Mask Flip",
+ "MaskFromColor+": "🔧 Mask From Color",
+ "MaskFromList+": "🔧 Mask From List",
+ "MaskFromRGBCMYBW+": "🔧 Mask From RGB/CMY/BW",
+ "MaskFromSegmentation+": "🔧 Mask From Segmentation",
+ "MaskPreview+": "🔧 Mask Preview",
+ "MaskBoundingBox+": "🔧 Mask Bounding Box",
+ "MaskSmooth+": "🔧 Mask Smooth",
+ "TransitionMask+": "🔧 Transition Mask",
+
+ "MaskBatch+": "🔧 Mask Batch",
+ "MaskExpandBatch+": "🔧 Mask Expand Batch",
+ "MaskFromBatch+": "🔧 Mask From Batch",
+}
diff --git a/ComfyUI_essentials/misc.py b/ComfyUI_essentials/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..317bd6a5b5c7348d4d718ca5765c130af89ace99
--- /dev/null
+++ b/ComfyUI_essentials/misc.py
@@ -0,0 +1,574 @@
+import math
+import torch
+from .utils import AnyType
+import comfy.model_management
+from nodes import MAX_RESOLUTION
+import time
+
+any = AnyType("*")
+
+class SimpleMathFloat:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "value": ("FLOAT", { "default": 0.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.05 }),
+ },
+ }
+
+ RETURN_TYPES = ("FLOAT", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value):
+ return (float(value), )
+
+class SimpleMathPercent:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "value": ("FLOAT", { "default": 0.0, "min": 0, "max": 1, "step": 0.05 }),
+ },
+ }
+
+ RETURN_TYPES = ("FLOAT", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value):
+ return (float(value), )
+
+class SimpleMathInt:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "value": ("INT", { "default": 0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 1 }),
+ },
+ }
+
+ RETURN_TYPES = ("INT",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value):
+ return (int(value), )
+
+class SimpleMathSlider:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "value": ("FLOAT", { "display": "slider", "default": 0.5, "min": 0.0, "max": 1.0, "step": 0.001 }),
+ "min": ("FLOAT", { "default": 0.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.001 }),
+ "max": ("FLOAT", { "default": 1.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.001 }),
+ "rounding": ("INT", { "default": 0, "min": 0, "max": 10, "step": 1 }),
+ },
+ }
+
+ RETURN_TYPES = ("FLOAT", "INT",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value, min, max, rounding):
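+ # remap the normalized slider value to the [min, max] range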
+ value = min + value * (max - min)
+
+ if rounding > 0:
+ value = round(value, rounding)
+
+ return (value, int(value), )
+
+class SimpleMathSliderLowRes:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "value": ("INT", { "display": "slider", "default": 5, "min": 0, "max": 10, "step": 1 }),
+ "min": ("FLOAT", { "default": 0.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.001 }),
+ "max": ("FLOAT", { "default": 1.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.001 }),
+ "rounding": ("INT", { "default": 0, "min": 0, "max": 10, "step": 1 }),
+ },
+ }
+
+ RETURN_TYPES = ("FLOAT", "INT",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value, min, max, rounding):
+ value = 0.1 * value
+ value = min + value * (max - min)
+ if rounding > 0:
+ value = round(value, rounding)
+
+ return (value, int(value), )
+
+class SimpleMathBoolean:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "value": ("BOOLEAN", { "default": False }),
+ },
+ }
+
+ RETURN_TYPES = ("BOOLEAN",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value):
+ return (value, int(value), )
+
+class SimpleMath:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "optional": {
+ "a": (any, { "default": 0.0 }),
+ "b": (any, { "default": 0.0 }),
+ "c": (any, { "default": 0.0 }),
+ },
+ "required": {
+ "value": ("STRING", { "multiline": False, "default": "" }),
+ },
+ }
+
+ RETURN_TYPES = ("INT", "FLOAT", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value, a = 0.0, b = 0.0, c = 0.0, d = 0.0):
+ import ast
+ import operator as op
+
+ if hasattr(a, 'shape'):
+ a = list(a.shape)
+ if hasattr(b, 'shape'):
+ b = list(b.shape)
+ if hasattr(c, 'shape'):
+ c = list(c.shape)
+ if hasattr(d, 'shape'):
+ d = list(d.shape)
+
+ if isinstance(a, str):
+ a = float(a)
+ if isinstance(b, str):
+ b = float(b)
+ if isinstance(c, str):
+ c = float(c)
+ if isinstance(d, str):
+ d = float(d)
+
+ operators = {
+ ast.Add: op.add,
+ ast.Sub: op.sub,
+ ast.Mult: op.mul,
+ ast.Div: op.truediv,
+ ast.FloorDiv: op.floordiv,
+ ast.Pow: op.pow,
+ #ast.BitXor: op.xor,
+ #ast.BitOr: op.or_,
+ #ast.BitAnd: op.and_,
+ ast.USub: op.neg,
+ ast.Mod: op.mod,
+ ast.Eq: op.eq,
+ ast.NotEq: op.ne,
+ ast.Lt: op.lt,
+ ast.LtE: op.le,
+ ast.Gt: op.gt,
+ ast.GtE: op.ge,
+ ast.And: lambda x, y: x and y,
+ ast.Or: lambda x, y: x or y,
+ ast.Not: op.not_
+ }
+
+ op_functions = {
+ 'min': min,
+ 'max': max,
+ 'round': round,
+ 'sum': sum,
+ 'len': len,
+ }
+
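+ # recursively evaluate the parsed expression; only the whitelisted operators and functions above are allowed, anything else evaluates to 0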
+ def eval_(node):
+ if isinstance(node, ast.Num): # number
+ return node.n
+ elif isinstance(node, ast.Name): # variable
+ if node.id == "a":
+ return a
+ if node.id == "b":
+ return b
+ if node.id == "c":
+ return c
+ if node.id == "d":
+ return d
+ elif isinstance(node, ast.BinOp): #
+ return operators[type(node.op)](eval_(node.left), eval_(node.right))
+ elif isinstance(node, ast.UnaryOp): # e.g., -1
+ return operators[type(node.op)](eval_(node.operand))
+ elif isinstance(node, ast.Compare): # comparison operators
+ left = eval_(node.left)
+ for op, comparator in zip(node.ops, node.comparators):
+ if not operators[type(op)](left, eval_(comparator)):
+ return 0
+ return 1
+ elif isinstance(node, ast.BoolOp): # boolean operators (And, Or)
+ values = [eval_(value) for value in node.values]
+ return operators[type(node.op)](*values)
+ elif isinstance(node, ast.Call): # custom function
+ if node.func.id in op_functions:
+ args =[eval_(arg) for arg in node.args]
+ return op_functions[node.func.id](*args)
+ elif isinstance(node, ast.Subscript): # indexing or slicing
+ value = eval_(node.value)
+ if isinstance(node.slice, ast.Constant):
+ return value[node.slice.value]
+ else:
+ return 0
+ else:
+ return 0
+
+ result = eval_(ast.parse(value, mode='eval').body)
+
+ if math.isnan(result):
+ result = 0.0
+
+ return (round(result), result, )
+
+class SimpleMathDual:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "optional": {
+ "a": (any, { "default": 0.0 }),
+ "b": (any, { "default": 0.0 }),
+ "c": (any, { "default": 0.0 }),
+ "d": (any, { "default": 0.0 }),
+ },
+ "required": {
+ "value_1": ("STRING", { "multiline": False, "default": "" }),
+ "value_2": ("STRING", { "multiline": False, "default": "" }),
+ },
+ }
+
+ RETURN_TYPES = ("INT", "FLOAT", "INT", "FLOAT", )
+ RETURN_NAMES = ("int_1", "float_1", "int_2", "float_2" )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, value_1, value_2, a = 0.0, b = 0.0, c = 0.0, d = 0.0):
+ return SimpleMath().execute(value_1, a, b, c, d) + SimpleMath().execute(value_2, a, b, c, d)
+
+class SimpleMathCondition:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "optional": {
+ "a": (any, { "default": 0.0 }),
+ "b": (any, { "default": 0.0 }),
+ "c": (any, { "default": 0.0 }),
+ },
+ "required": {
+ "evaluate": (any, {"default": 0}),
+ "on_true": ("STRING", { "multiline": False, "default": "" }),
+ "on_false": ("STRING", { "multiline": False, "default": "" }),
+ },
+ }
+
+ RETURN_TYPES = ("INT", "FLOAT", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, evaluate, on_true, on_false, a = 0.0, b = 0.0, c = 0.0):
+ return SimpleMath().execute(on_true if evaluate else on_false, a, b, c)
+
+class SimpleCondition:
+ def __init__(self):
+ pass
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {
+ "required": {
+ "evaluate": (any, {"default": 0}),
+ "on_true": (any, {"default": 0}),
+ },
+ "optional": {
+ "on_false": (any, {"default": None}),
+ },
+ }
+
+ RETURN_TYPES = (any,)
+ RETURN_NAMES = ("result",)
+ FUNCTION = "execute"
+
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, evaluate, on_true, on_false=None):
+ from comfy_execution.graph import ExecutionBlocker
+ if not evaluate:
+ return (on_false if on_false is not None else ExecutionBlocker(None),)
+
+ return (on_true,)
+
+class SimpleComparison:
+ def __init__(self):
+ pass
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {
+ "required": {
+ "a": (any, {"default": 0}),
+ "b": (any, {"default": 0}),
+ "comparison": (["==", "!=", "<", "<=", ">", ">="],),
+ },
+ }
+
+ RETURN_TYPES = ("BOOLEAN",)
+ FUNCTION = "execute"
+
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, a, b, comparison):
+ if comparison == "==":
+ return (a == b,)
+ elif comparison == "!=":
+ return (a != b,)
+ elif comparison == "<":
+ return (a < b,)
+ elif comparison == "<=":
+ return (a <= b,)
+ elif comparison == ">":
+ return (a > b,)
+ elif comparison == ">=":
+ return (a >= b,)
+
+class ConsoleDebug:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "value": (any, {}),
+ },
+ "optional": {
+ "prefix": ("STRING", { "multiline": False, "default": "Value:" })
+ }
+ }
+
+ RETURN_TYPES = ()
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+ OUTPUT_NODE = True
+
+ def execute(self, value, prefix="Value:"):
+ print(f"\033[96m{prefix} {value}\033[0m")
+
+ return (None,)
+
+class DebugTensorShape:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "tensor": (any, {}),
+ },
+ }
+
+ RETURN_TYPES = ()
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+ OUTPUT_NODE = True
+
+ def execute(self, tensor):
+ shapes = []
+ def tensorShape(tensor):
+ if isinstance(tensor, dict):
+ for k in tensor:
+ tensorShape(tensor[k])
+ elif isinstance(tensor, list):
+ for i in range(len(tensor)):
+ tensorShape(tensor[i])
+ elif hasattr(tensor, 'shape'):
+ shapes.append(list(tensor.shape))
+
+ tensorShape(tensor)
+
+ print(f"\033[96mShapes found: {shapes}\033[0m")
+
+ return (None,)
+
+class BatchCount:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "batch": (any, {}),
+ },
+ }
+
+ RETURN_TYPES = ("INT",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, batch):
+ count = 0
+ if hasattr(batch, 'shape'):
+ count = batch.shape[0]
+ elif isinstance(batch, dict) and 'samples' in batch:
+ count = batch['samples'].shape[0]
+ elif isinstance(batch, list) or isinstance(batch, dict):
+ count = len(batch)
+
+ return (count, )
+
+class ModelCompile():
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "model": ("MODEL",),
+ "fullgraph": ("BOOLEAN", { "default": False }),
+ "dynamic": ("BOOLEAN", { "default": False }),
+ "mode": (["default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],),
+ },
+ }
+
+ RETURN_TYPES = ("MODEL", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, model, fullgraph, dynamic, mode):
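+ # wrap the diffusion model with torch.compile through an object patch so the original model object is left untouched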
+ work_model = model.clone()
+ torch._dynamo.config.suppress_errors = True
+ work_model.add_object_patch("diffusion_model", torch.compile(model=work_model.get_model_object("diffusion_model"), dynamic=dynamic, fullgraph=fullgraph, mode=mode))
+ return (work_model, )
+
+class RemoveLatentMask:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": { "samples": ("LATENT",),}}
+ RETURN_TYPES = ("LATENT",)
+ FUNCTION = "execute"
+
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, samples):
+ s = samples.copy()
+ if "noise_mask" in s:
+ del s["noise_mask"]
+
+ return (s,)
+
+class SDXLEmptyLatentSizePicker:
+ def __init__(self):
+ self.device = comfy.model_management.intermediate_device()
+
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "resolution": (["704x1408 (0.5)","704x1344 (0.52)","768x1344 (0.57)","768x1280 (0.6)","832x1216 (0.68)","832x1152 (0.72)","896x1152 (0.78)","896x1088 (0.82)","960x1088 (0.88)","960x1024 (0.94)","1024x1024 (1.0)","1024x960 (1.07)","1088x960 (1.13)","1088x896 (1.21)","1152x896 (1.29)","1152x832 (1.38)","1216x832 (1.46)","1280x768 (1.67)","1344x768 (1.75)","1344x704 (1.91)","1408x704 (2.0)","1472x704 (2.09)","1536x640 (2.4)","1600x640 (2.5)","1664x576 (2.89)","1728x576 (3.0)",], {"default": "1024x1024 (1.0)"}),
+ "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
+ "width_override": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 8}),
+ "height_override": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 8}),
+ }}
+
+ RETURN_TYPES = ("LATENT","INT","INT",)
+ RETURN_NAMES = ("LATENT","width","height",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, resolution, batch_size, width_override=0, height_override=0):
+ width, height = resolution.split(" ")[0].split("x")
+ width = width_override if width_override > 0 else int(width)
+ height = height_override if height_override > 0 else int(height)
+
+ latent = torch.zeros([batch_size, 4, height // 8, width // 8], device=self.device)
+
+ return ({"samples":latent}, width, height,)
+
+class DisplayAny:
+ def __init__(self):
+ pass
+
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "input": (("*",{})),
+ "mode": (["raw value", "tensor shape"],),
+ },
+ }
+
+ @classmethod
+ def VALIDATE_INPUTS(s, input_types):
+ return True
+
+ RETURN_TYPES = ("STRING",)
+ FUNCTION = "execute"
+ OUTPUT_NODE = True
+
+ CATEGORY = "essentials/utilities"
+
+ def execute(self, input, mode):
+ if mode == "tensor shape":
+ text = []
+ def tensorShape(tensor):
+ if isinstance(tensor, dict):
+ for k in tensor:
+ tensorShape(tensor[k])
+ elif isinstance(tensor, list):
+ for i in range(len(tensor)):
+ tensorShape(tensor[i])
+ elif hasattr(tensor, 'shape'):
+ text.append(list(tensor.shape))
+
+ tensorShape(input)
+ input = text
+
+ text = str(input)
+
+ return {"ui": {"text": text}, "result": (text,)}
+
+MISC_CLASS_MAPPINGS = {
+ "BatchCount+": BatchCount,
+ "ConsoleDebug+": ConsoleDebug,
+ "DebugTensorShape+": DebugTensorShape,
+ "DisplayAny": DisplayAny,
+ "ModelCompile+": ModelCompile,
+ "RemoveLatentMask+": RemoveLatentMask,
+ "SDXLEmptyLatentSizePicker+": SDXLEmptyLatentSizePicker,
+ "SimpleComparison+": SimpleComparison,
+ "SimpleCondition+": SimpleCondition,
+ "SimpleMath+": SimpleMath,
+ "SimpleMathDual+": SimpleMathDual,
+ "SimpleMathCondition+": SimpleMathCondition,
+ "SimpleMathBoolean+": SimpleMathBoolean,
+ "SimpleMathFloat+": SimpleMathFloat,
+ "SimpleMathInt+": SimpleMathInt,
+ "SimpleMathPercent+": SimpleMathPercent,
+ "SimpleMathSlider+": SimpleMathSlider,
+ "SimpleMathSliderLowRes+": SimpleMathSliderLowRes,
+}
+
+MISC_NAME_MAPPINGS = {
+ "BatchCount+": "🔧 Batch Count",
+ "ConsoleDebug+": "🔧 Console Debug",
+ "DebugTensorShape+": "🔧 Debug Tensor Shape",
+ "DisplayAny": "🔧 Display Any",
+ "ModelCompile+": "🔧 Model Compile",
+ "RemoveLatentMask+": "🔧 Remove Latent Mask",
+ "SDXLEmptyLatentSizePicker+": "🔧 Empty Latent Size Picker",
+ "SimpleComparison+": "🔧 Simple Comparison",
+ "SimpleCondition+": "🔧 Simple Condition",
+ "SimpleMath+": "🔧 Simple Math",
+ "SimpleMathDual+": "🔧 Simple Math Dual",
+ "SimpleMathCondition+": "🔧 Simple Math Condition",
+ "SimpleMathBoolean+": "🔧 Simple Math Boolean",
+ "SimpleMathFloat+": "🔧 Simple Math Float",
+ "SimpleMathInt+": "🔧 Simple Math Int",
+ "SimpleMathPercent+": "🔧 Simple Math Percent",
+ "SimpleMathSlider+": "🔧 Simple Math Slider",
+ "SimpleMathSliderLowRes+": "🔧 Simple Math Slider low-res",
+}
\ No newline at end of file
diff --git a/ComfyUI_essentials/pyproject.toml b/ComfyUI_essentials/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..2b61bb8561bfe1231b08ad74247aec3986dfdefa
--- /dev/null
+++ b/ComfyUI_essentials/pyproject.toml
@@ -0,0 +1,15 @@
+[project]
+name = "comfyui_essentials"
+description = "Essential nodes that are weirdly missing from ComfyUI core. With few exceptions they are new features and not commodities."
+version = "1.1.0"
+license = { file = "LICENSE" }
+dependencies = ["numba", "colour-science", "rembg", "pixeloe", "transparent-background"]
+
+[project.urls]
+Repository = "https://github.com/cubiq/ComfyUI_essentials"
+# Used by Comfy Registry https://comfyregistry.org
+
+[tool.comfy]
+PublisherId = "matteo"
+DisplayName = "ComfyUI_essentials"
+Icon = ""
diff --git a/ComfyUI_essentials/requirements.txt b/ComfyUI_essentials/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..78b1f44611422a1179a0a5cfe2780faec09177ec
--- /dev/null
+++ b/ComfyUI_essentials/requirements.txt
@@ -0,0 +1,5 @@
+numba
+colour-science
+rembg
+pixeloe
+transparent-background
\ No newline at end of file
diff --git a/ComfyUI_essentials/sampling.py b/ComfyUI_essentials/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f7f27d8d87e46f85b51f9cb3a1335c904e1d2ba
--- /dev/null
+++ b/ComfyUI_essentials/sampling.py
@@ -0,0 +1,811 @@
+import os
+import comfy.samplers
+import comfy.sample
+import comfy.model_base
+import comfy.model_management
+import torch
+from nodes import common_ksampler, CLIPTextEncode
+from comfy.utils import ProgressBar
+from .utils import expand_mask, FONTS_DIR, parse_string_to_list
+import torchvision.transforms.v2 as T
+import torch.nn.functional as F
+import logging
+import folder_paths
+
+# From https://github.com/BlenderNeko/ComfyUI_Noise/
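+# Spherical linear interpolation between two noise tensors: val=0 returns low, val=1 returns high,
+# keeping the norm consistent so the blended noise remains usable for sampling.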
+def slerp(val, low, high):
+ dims = low.shape
+
+ low = low.reshape(dims[0], -1)
+ high = high.reshape(dims[0], -1)
+
+ low_norm = low/torch.norm(low, dim=1, keepdim=True)
+ high_norm = high/torch.norm(high, dim=1, keepdim=True)
+
+ low_norm[low_norm != low_norm] = 0.0
+ high_norm[high_norm != high_norm] = 0.0
+
+ omega = torch.acos((low_norm*high_norm).sum(1))
+ so = torch.sin(omega)
+ res = (torch.sin((1.0-val)*omega)/so).unsqueeze(1)*low + (torch.sin(val*omega)/so).unsqueeze(1) * high
+
+ return res.reshape(dims)
+
+class KSamplerVariationsWithNoise:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "model": ("MODEL", ),
+ "latent_image": ("LATENT", ),
+ "main_seed": ("INT:seed", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+ "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
+ "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
+ "sampler_name": (comfy.samplers.KSampler.SAMPLERS, ),
+ "scheduler": (comfy.samplers.KSampler.SCHEDULERS, ),
+ "positive": ("CONDITIONING", ),
+ "negative": ("CONDITIONING", ),
+ "variation_strength": ("FLOAT", {"default": 0.17, "min": 0.0, "max": 1.0, "step":0.01, "round": 0.01}),
+ #"start_at_step": ("INT", {"default": 0, "min": 0, "max": 10000}),
+ #"end_at_step": ("INT", {"default": 10000, "min": 0, "max": 10000}),
+ #"return_with_leftover_noise": (["disable", "enable"], ),
+ "variation_seed": ("INT:seed", {"default": 12345, "min": 0, "max": 0xffffffffffffffff}),
+ "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step":0.01, "round": 0.01}),
+ }}
+
+ RETURN_TYPES = ("LATENT",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def prepare_mask(self, mask, shape):
+ mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(shape[2], shape[3]), mode="bilinear")
+ mask = mask.expand((-1,shape[1],-1,-1))
+ if mask.shape[0] < shape[0]:
+ mask = mask.repeat((shape[0] -1) // mask.shape[0] + 1, 1, 1, 1)[:shape[0]]
+ return mask
+
+ def execute(self, model, latent_image, main_seed, steps, cfg, sampler_name, scheduler, positive, negative, variation_strength, variation_seed, denoise):
+ if main_seed == variation_seed:
+ variation_seed += 1
+
+ end_at_step = steps #min(steps, end_at_step)
+ start_at_step = round(end_at_step - end_at_step * denoise)
+
+ force_full_denoise = True
+ disable_noise = True
+
+ device = comfy.model_management.get_torch_device()
+
+ # Generate base noise
+ batch_size, _, height, width = latent_image["samples"].shape
+ generator = torch.manual_seed(main_seed)
+ base_noise = torch.randn((1, 4, height, width), dtype=torch.float32, device="cpu", generator=generator).repeat(batch_size, 1, 1, 1).cpu()
+
+ # Generate variation noise
+ generator = torch.manual_seed(variation_seed)
+ variation_noise = torch.randn((batch_size, 4, height, width), dtype=torch.float32, device="cpu", generator=generator).cpu()
+
+ slerp_noise = slerp(variation_strength, base_noise, variation_noise)
+
+ # Calculate sigma
+ comfy.model_management.load_model_gpu(model)
+ sampler = comfy.samplers.KSampler(model, steps=steps, device=device, sampler=sampler_name, scheduler=scheduler, denoise=1.0, model_options=model.model_options)
+ sigmas = sampler.sigmas
+ sigma = sigmas[start_at_step] - sigmas[end_at_step]
+ sigma /= model.model.latent_format.scale_factor
+ sigma = sigma.detach().cpu().item()
+
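+ # add the blended noise scaled by the sigma delta, emulating the noise the sampler would otherwise inject at start_at_step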
+ work_latent = latent_image.copy()
+ work_latent["samples"] = latent_image["samples"].clone() + slerp_noise * sigma
+
+ # if there's a mask we need to expand it to avoid artifacts, 5 pixels should be enough
+ if "noise_mask" in latent_image:
+ noise_mask = self.prepare_mask(latent_image["noise_mask"], latent_image['samples'].shape)
+ work_latent["samples"] = noise_mask * work_latent["samples"] + (1-noise_mask) * latent_image["samples"]
+ work_latent['noise_mask'] = expand_mask(latent_image["noise_mask"].clone(), 5, True)
+
+ return common_ksampler(model, main_seed, steps, cfg, sampler_name, scheduler, positive, negative, work_latent, denoise=1.0, disable_noise=disable_noise, start_step=start_at_step, last_step=end_at_step, force_full_denoise=force_full_denoise)
+
+
+class KSamplerVariationsStochastic:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required":{
+ "model": ("MODEL",),
+ "latent_image": ("LATENT", ),
+ "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+ "steps": ("INT", {"default": 25, "min": 1, "max": 10000}),
+ "cfg": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
+ "sampler": (comfy.samplers.KSampler.SAMPLERS, ),
+ "scheduler": (comfy.samplers.KSampler.SCHEDULERS, ),
+ "positive": ("CONDITIONING", ),
+ "negative": ("CONDITIONING", ),
+ "variation_seed": ("INT:seed", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+ "variation_strength": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 1.0, "step":0.05, "round": 0.01}),
+ #"variation_sampler": (comfy.samplers.KSampler.SAMPLERS, ),
+ "cfg_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step":0.05, "round": 0.01}),
+ }}
+
+ RETURN_TYPES = ("LATENT", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, model, latent_image, noise_seed, steps, cfg, sampler, scheduler, positive, negative, variation_seed, variation_strength, cfg_scale, variation_sampler="dpmpp_2m_sde"):
+ # Stage 1: composition sampler
+ force_full_denoise = False # return with leftover noise = "enable"
+ disable_noise = False # add noise = "enable"
+
+ end_at_step = max(int(steps * (1-variation_strength)), 1)
+ start_at_step = 0
+
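+ # stage 1 samples only the first latent of the batch to settle the composition; the result is then repeated across the batch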
+ work_latent = latent_image.copy()
+ batch_size = work_latent["samples"].shape[0]
+ work_latent["samples"] = work_latent["samples"][0].unsqueeze(0)
+
+ stage1 = common_ksampler(model, noise_seed, steps, cfg, sampler, scheduler, positive, negative, work_latent, denoise=1.0, disable_noise=disable_noise, start_step=start_at_step, last_step=end_at_step, force_full_denoise=force_full_denoise)[0]
+
+ if batch_size > 1:
+ stage1["samples"] = stage1["samples"].clone().repeat(batch_size, 1, 1, 1)
+
+ # Stage 2: variation sampler
+ force_full_denoise = True
+ disable_noise = True
+ cfg = max(cfg * cfg_scale, 1.0)
+ start_at_step = end_at_step
+ end_at_step = steps
+
+ return common_ksampler(model, variation_seed, steps, cfg, variation_sampler, scheduler, positive, negative, stage1, denoise=1.0, disable_noise=disable_noise, start_step=start_at_step, last_step=end_at_step, force_full_denoise=force_full_denoise)
+
+class InjectLatentNoise:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "latent": ("LATENT", ),
+ "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+ "noise_strength": ("FLOAT", {"default": 1.0, "min": -20.0, "max": 20.0, "step":0.01, "round": 0.01}),
+ "normalize": (["false", "true"], {"default": "false"}),
+ },
+ "optional": {
+ "mask": ("MASK", ),
+ }}
+
+ RETURN_TYPES = ("LATENT",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, latent, noise_seed, noise_strength, normalize="false", mask=None):
+ torch.manual_seed(noise_seed)
+ noise_latent = latent.copy()
+ original_samples = noise_latent["samples"].clone()
+ random_noise = torch.randn_like(original_samples)
+
+ if normalize == "true":
+ mean = original_samples.mean()
+ std = original_samples.std()
+ random_noise = random_noise * std + mean
+
+ random_noise = original_samples + random_noise * noise_strength
+
+ if mask is not None:
+ mask = F.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(random_noise.shape[2], random_noise.shape[3]), mode="bilinear")
+ mask = mask.expand((-1,random_noise.shape[1],-1,-1)).clamp(0.0, 1.0)
+ if mask.shape[0] < random_noise.shape[0]:
+ mask = mask.repeat((random_noise.shape[0] -1) // mask.shape[0] + 1, 1, 1, 1)[:random_noise.shape[0]]
+ elif mask.shape[0] > random_noise.shape[0]:
+ mask = mask[:random_noise.shape[0]]
+ random_noise = mask * random_noise + (1-mask) * original_samples
+
+ noise_latent["samples"] = random_noise
+
+ return (noise_latent, )
+
+class TextEncodeForSamplerParams:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "text": ("STRING", {"multiline": True, "dynamicPrompts": True, "default": "Separate prompts with at least three dashes\n---\nLike so"}),
+ "clip": ("CLIP", )
+ }}
+
+ RETURN_TYPES = ("CONDITIONING", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, text, clip):
+ import re
+ output_text = []
+ output_encoded = []
+ text = re.sub(r'[-*=~]{4,}\n', '---\n', text)
+ text = text.split("---\n")
+
+ for t in text:
+ t = t.strip()
+ if t:
+ output_text.append(t)
+ output_encoded.append(CLIPTextEncode().encode(clip, t)[0])
+
+ #if len(output_encoded) == 1:
+ # output = output_encoded[0]
+ #else:
+ output = {"text": output_text, "encoded": output_encoded}
+
+ return (output, )
+
+class SamplerSelectHelper:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ **{s: ("BOOLEAN", { "default": False }) for s in comfy.samplers.KSampler.SAMPLERS},
+ }}
+
+ RETURN_TYPES = ("STRING", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, **values):
+ values = [v for v in values if values[v]]
+ values = ", ".join(values)
+
+ return (values, )
+
+class SchedulerSelectHelper:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ **{s: ("BOOLEAN", { "default": False }) for s in comfy.samplers.KSampler.SCHEDULERS},
+ }}
+
+ RETURN_TYPES = ("STRING", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, **values):
+ values = [v for v in values if values[v]]
+ values = ", ".join(values)
+
+ return (values, )
+
+class LorasForFluxParams:
+ @classmethod
+ def INPUT_TYPES(s):
+ optional_loras = ['none'] + folder_paths.get_filename_list("loras")
+ return {
+ "required": {
+ "lora_1": (folder_paths.get_filename_list("loras"), {"tooltip": "The name of the LoRA."}),
+ "strength_model_1": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "1.0" }),
+ },
+ #"optional": {
+ # "lora_2": (optional_loras, ),
+ # "strength_lora_2": ("STRING", { "multiline": False, "dynamicPrompts": False }),
+ # "lora_3": (optional_loras, ),
+ # "strength_lora_3": ("STRING", { "multiline": False, "dynamicPrompts": False }),
+ # "lora_4": (optional_loras, ),
+ # "strength_lora_4": ("STRING", { "multiline": False, "dynamicPrompts": False }),
+ #}
+ }
+
+ RETURN_TYPES = ("LORA_PARAMS", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, lora_1, strength_model_1, lora_2="none", strength_lora_2="", lora_3="none", strength_lora_3="", lora_4="none", strength_lora_4=""):
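+ # strengths go through parse_string_to_list so a comma separated string can sweep several values per LoRA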
+ output = { "loras": [], "strengths": [] }
+ output["loras"].append(lora_1)
+ output["strengths"].append(parse_string_to_list(strength_model_1))
+
+ if lora_2 != "none":
+ output["loras"].append(lora_2)
+ if strength_lora_2 == "":
+ strength_lora_2 = "1.0"
+ output["strengths"].append(parse_string_to_list(strength_lora_2))
+ if lora_3 != "none":
+ output["loras"].append(lora_3)
+ if strength_lora_3 == "":
+ strength_lora_3 = "1.0"
+ output["strengths"].append(parse_string_to_list(strength_lora_3))
+ if lora_4 != "none":
+ output["loras"].append(lora_4)
+ if strength_lora_4 == "":
+ strength_lora_4 = "1.0"
+ output["strengths"].append(parse_string_to_list(strength_lora_4))
+
+ return (output,)
+
+
+class FluxSamplerParams:
+ def __init__(self):
+ self.loraloader = None
+ self.lora = (None, None)
+
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "model": ("MODEL", ),
+ "conditioning": ("CONDITIONING", ),
+ "latent_image": ("LATENT", ),
+
+ "seed": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "?" }),
+ "sampler": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "euler" }),
+ "scheduler": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "simple" }),
+ "steps": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "20" }),
+ "guidance": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "3.5" }),
+ "max_shift": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "" }),
+ "base_shift": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "" }),
+ "denoise": ("STRING", { "multiline": False, "dynamicPrompts": False, "default": "1.0" }),
+ },
+ "optional": {
+ "loras": ("LORA_PARAMS",),
+ }}
+
+ RETURN_TYPES = ("LATENT","SAMPLER_PARAMS")
+ RETURN_NAMES = ("latent", "params")
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, model, conditioning, latent_image, seed, sampler, scheduler, steps, guidance, max_shift, base_shift, denoise, loras=None):
+ import random
+ import time
+ from comfy_extras.nodes_custom_sampler import Noise_RandomNoise, BasicScheduler, BasicGuider, SamplerCustomAdvanced
+ from comfy_extras.nodes_latent import LatentBatch
+ from comfy_extras.nodes_model_advanced import ModelSamplingFlux, ModelSamplingAuraFlow
+ from node_helpers import conditioning_set_values
+ from nodes import LoraLoader
+
+ is_schnell = model.model.model_type == comfy.model_base.ModelType.FLOW
+
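+ # seeds are a comma/newline separated list; any entry containing "?" is replaced with a random seed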
+ noise = seed.replace("\n", ",").split(",")
+ noise = [random.randint(0, 999999) if "?" in n else int(n) for n in noise]
+ if not noise:
+ noise = [random.randint(0, 999999)]
+
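+ # sampler/scheduler strings support "*" (all), a leading "!" to exclude the listed names, or an explicit comma separated list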
+ if sampler == '*':
+ sampler = comfy.samplers.KSampler.SAMPLERS
+ elif sampler.startswith("!"):
+ sampler = sampler.replace("\n", ",").split(",")
+ sampler = [s.strip("! ") for s in sampler]
+ sampler = [s for s in comfy.samplers.KSampler.SAMPLERS if s not in sampler]
+ else:
+ sampler = sampler.replace("\n", ",").split(",")
+ sampler = [s.strip() for s in sampler if s.strip() in comfy.samplers.KSampler.SAMPLERS]
+ if not sampler:
+ sampler = ['ipndm']
+
+ if scheduler == '*':
+ scheduler = comfy.samplers.KSampler.SCHEDULERS
+ elif scheduler.startswith("!"):
+ scheduler = scheduler.replace("\n", ",").split(",")
+ scheduler = [s.strip("! ") for s in scheduler]
+ scheduler = [s for s in comfy.samplers.KSampler.SCHEDULERS if s not in scheduler]
+ else:
+ scheduler = scheduler.replace("\n", ",").split(",")
+ scheduler = [s.strip() for s in scheduler]
+ scheduler = [s for s in scheduler if s in comfy.samplers.KSampler.SCHEDULERS]
+ if not scheduler:
+ scheduler = ['simple']
+
+ if steps == "":
+ if is_schnell:
+ steps = "4"
+ else:
+ steps = "20"
+ steps = parse_string_to_list(steps)
+
+ denoise = "1.0" if denoise == "" else denoise
+ denoise = parse_string_to_list(denoise)
+
+ guidance = "3.5" if guidance == "" else guidance
+ guidance = parse_string_to_list(guidance)
+
+ if not is_schnell:
+ max_shift = "1.15" if max_shift == "" else max_shift
+ base_shift = "0.5" if base_shift == "" else base_shift
+ else:
+ max_shift = "0"
+ base_shift = "1.0" if base_shift == "" else base_shift
+
+ max_shift = parse_string_to_list(max_shift)
+ base_shift = parse_string_to_list(base_shift)
+
+ cond_text = None
+ if isinstance(conditioning, dict) and "encoded" in conditioning:
+ cond_text = conditioning["text"]
+ cond_encoded = conditioning["encoded"]
+ else:
+ cond_encoded = [conditioning]
+
+ out_latent = None
+ out_params = []
+
+ basicscheduler = BasicScheduler()
+ basicguider = BasicGuider()
+ samplercustomadvanced = SamplerCustomAdvanced()
+ latentbatch = LatentBatch()
+ modelsamplingflux = ModelSamplingFlux() if not is_schnell else ModelSamplingAuraFlow()
+ width = latent_image["samples"].shape[3]*8
+ height = latent_image["samples"].shape[2]*8
+
+ lora_strength_len = 1
+ if loras:
+ lora_model = loras["loras"]
+ lora_strength = loras["strengths"]
+ lora_strength_len = sum(len(i) for i in lora_strength)
+
+ if self.loraloader is None:
+ self.loraloader = LoraLoader()
+
+ # count total number of samples
+ total_samples = len(cond_encoded) * len(noise) * len(max_shift) * len(base_shift) * len(guidance) * len(sampler) * len(scheduler) * len(steps) * len(denoise) * lora_strength_len
+ current_sample = 0
+ if total_samples > 1:
+ pbar = ProgressBar(total_samples)
+
+ lora_strength_len = 1
+ if loras:
+ lora_strength_len = len(lora_strength[0])
+
+ for los in range(lora_strength_len):
+ if loras:
+ patched_model = self.loraloader.load_lora(model, None, lora_model[0], lora_strength[0][los], 0)[0]
+ else:
+ patched_model = model
+
+ for i in range(len(cond_encoded)):
+ conditioning = cond_encoded[i]
+ ct = cond_text[i] if cond_text else None
+ for n in noise:
+ randnoise = Noise_RandomNoise(n)
+ for ms in max_shift:
+ for bs in base_shift:
+ if is_schnell:
+ work_model = modelsamplingflux.patch_aura(patched_model, bs)[0]
+ else:
+ work_model = modelsamplingflux.patch(patched_model, ms, bs, width, height)[0]
+ for g in guidance:
+ cond = conditioning_set_values(conditioning, {"guidance": g})
+ guider = basicguider.get_guider(work_model, cond)[0]
+ for s in sampler:
+ samplerobj = comfy.samplers.sampler_object(s)
+ for sc in scheduler:
+ for st in steps:
+ for d in denoise:
+ sigmas = basicscheduler.get_sigmas(work_model, sc, st, d)[0]
+ current_sample += 1
+ log = f"Sampling {current_sample}/{total_samples} with seed {n}, sampler {s}, scheduler {sc}, steps {st}, guidance {g}, max_shift {ms}, base_shift {bs}, denoise {d}"
+ lora_name = None
+ lora_str = 0
+ if loras:
+ lora_name = lora_model[0]
+ lora_str = lora_strength[0][los]
+ log += f", lora {lora_name}, lora_strength {lora_str}"
+ logging.info(log)
+ start_time = time.time()
+ latent = samplercustomadvanced.sample(randnoise, guider, samplerobj, sigmas, latent_image)[1]
+ elapsed_time = time.time() - start_time
+ out_params.append({"time": elapsed_time,
+ "seed": n,
+ "width": width,
+ "height": height,
+ "sampler": s,
+ "scheduler": sc,
+ "steps": st,
+ "guidance": g,
+ "max_shift": ms,
+ "base_shift": bs,
+ "denoise": d,
+ "prompt": ct,
+ "lora": lora_name,
+ "lora_strength": lora_str})
+
+ if out_latent is None:
+ out_latent = latent
+ else:
+ out_latent = latentbatch.batch(out_latent, latent)[0]
+ if total_samples > 1:
+ pbar.update(1)
+
+ return (out_latent, out_params)
+
+class PlotParameters:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": {
+ "images": ("IMAGE", ),
+ "params": ("SAMPLER_PARAMS", ),
+ "order_by": (["none", "time", "seed", "steps", "denoise", "sampler", "scheduler", "guidance", "max_shift", "base_shift", "lora_strength"], ),
+ "cols_value": (["none", "time", "seed", "steps", "denoise", "sampler", "scheduler", "guidance", "max_shift", "base_shift", "lora_strength"], ),
+ "cols_num": ("INT", {"default": -1, "min": -1, "max": 1024 }),
+ "add_prompt": (["false", "true", "excerpt"], ),
+ "add_params": (["false", "true", "changes only"], {"default": "true"}),
+ }}
+
+ RETURN_TYPES = ("IMAGE", )
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, images, params, order_by, cols_value, cols_num, add_prompt, add_params):
+ from PIL import Image, ImageDraw, ImageFont
+ import math
+ import textwrap
+
+ if images.shape[0] != len(params):
+ raise ValueError("Number of images and number of parameters do not match.")
+
+ _params = params.copy()
+
+ if order_by != "none":
+ sorted_params = sorted(_params, key=lambda x: x[order_by])
+ indices = [_params.index(item) for item in sorted_params]
+ images = images[torch.tensor(indices)]
+ _params = sorted_params
+
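+ # group the images into one column per distinct cols_value and interleave the groups so each row spans all columns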
+ if cols_value != "none" and cols_num > -1:
+ groups = {}
+ for p in _params:
+ value = p[cols_value]
+ if value not in groups:
+ groups[value] = []
+ groups[value].append(p)
+ cols_num = len(groups)
+
+ sorted_params = []
+ groups = list(groups.values())
+ for g in zip(*groups):
+ sorted_params.extend(g)
+
+ indices = [_params.index(item) for item in sorted_params]
+ images = images[torch.tensor(indices)]
+ _params = sorted_params
+ elif cols_num == 0:
+ cols_num = int(math.sqrt(images.shape[0]))
+ cols_num = max(1, min(cols_num, 1024))
+
+ width = images.shape[2]
+ out_image = []
+
+ font = ImageFont.truetype(os.path.join(FONTS_DIR, 'ShareTechMono-Regular.ttf'), min(48, int(32*(width/1024))))
+ text_padding = 3
+ line_height = font.getmask('Q').getbbox()[3] + font.getmetrics()[1] + text_padding*2
+ char_width = font.getbbox('M')[2]+1 # using monospace font
+
+ if add_params == "changes only":
+ value_tracker = {}
+ for p in _params:
+ for key, value in p.items():
+ if key != "time":
+ if key not in value_tracker:
+ value_tracker[key] = set()
+ value_tracker[key].add(value)
+ changing_keys = {key for key, values in value_tracker.items() if len(values) > 1 or key == "prompt"}
+
+ result = []
+ for p in _params:
+ changing_params = {key: value for key, value in p.items() if key in changing_keys}
+ result.append(changing_params)
+
+ _params = result
+
+ for (image, param) in zip(images, _params):
+ image = image.permute(2, 0, 1)
+
+ if add_params != "false":
+ if add_params == "changes only":
+ text = "\n".join([f"{key}: {value}" for key, value in param.items() if key != "prompt"])
+ else:
+ text = f"time: {param['time']:.2f}s, seed: {param['seed']}, steps: {param['steps']}, size: {param['width']}×{param['height']}\ndenoise: {param['denoise']}, sampler: {param['sampler']}, sched: {param['scheduler']}\nguidance: {param['guidance']}, max/base shift: {param['max_shift']}/{param['base_shift']}"
+ if 'lora' in param and param['lora']:
+ text += f"\nLoRA: {param['lora'][:32]}, str: {param['lora_strength']}"
+
+ lines = text.split("\n")
+ text_height = line_height * len(lines)
+ text_image = Image.new('RGB', (width, text_height), color=(0, 0, 0))
+
+ for i, line in enumerate(lines):
+ draw = ImageDraw.Draw(text_image)
+ draw.text((text_padding, i * line_height + text_padding), line, font=font, fill=(255, 255, 255))
+
+ text_image = T.ToTensor()(text_image).to(image.device)
+ image = torch.cat([image, text_image], 1)
+
+ if 'prompt' in param and param['prompt'] and add_prompt != "false":
+ prompt = param['prompt']
+ if add_prompt == "excerpt":
+ prompt = " ".join(param['prompt'].split()[:64])
+ prompt += "..."
+
+ cols = math.ceil(width / char_width)
+ prompt_lines = textwrap.wrap(prompt, width=cols)
+ prompt_height = line_height * len(prompt_lines)
+ prompt_image = Image.new('RGB', (width, prompt_height), color=(0, 0, 0))
+
+ for i, line in enumerate(prompt_lines):
+ draw = ImageDraw.Draw(prompt_image)
+ draw.text((text_padding, i * line_height + text_padding), line, font=font, fill=(255, 255, 255))
+
+ prompt_image = T.ToTensor()(prompt_image).to(image.device)
+ image = torch.cat([image, prompt_image], 1)
+
+ # a little cleanup
+ image = torch.nan_to_num(image, nan=0.0).clamp(0.0, 1.0)
+ out_image.append(image)
+
+ # ensure all images have the same height
+ if add_prompt != "false" or add_params == "changes only":
+ max_height = max([image.shape[1] for image in out_image])
+ out_image = [F.pad(image, (0, 0, 0, max_height - image.shape[1])) for image in out_image]
+
+ out_image = torch.stack(out_image, 0).permute(0, 2, 3, 1)
+
+ # merge images
+ if cols_num > -1:
+ cols = min(cols_num, out_image.shape[0])
+ b, h, w, c = out_image.shape
+ rows = math.ceil(b / cols)
+
+ # Pad the tensor if necessary
+ if b % cols != 0:
+ padding = cols - (b % cols)
+ out_image = F.pad(out_image, (0, 0, 0, 0, 0, 0, 0, padding))
+ b = out_image.shape[0]
+
+ # Reshape and transpose
+ out_image = out_image.reshape(rows, cols, h, w, c)
+ out_image = out_image.permute(0, 2, 1, 3, 4)
+ out_image = out_image.reshape(rows * h, cols * w, c).unsqueeze(0)
+
+ """
+ width = out_image.shape[2]
+ # add the title and notes on top
+ if title and export_labels:
+ title_font = ImageFont.truetype(os.path.join(FONTS_DIR, 'ShareTechMono-Regular.ttf'), 48)
+ title_width = title_font.getbbox(title)[2]
+ title_padding = 6
+ title_line_height = title_font.getmask(title).getbbox()[3] + title_font.getmetrics()[1] + title_padding*2
+ title_text_height = title_line_height
+ title_text_image = Image.new('RGB', (width, title_text_height), color=(0, 0, 0, 0))
+
+ draw = ImageDraw.Draw(title_text_image)
+ draw.text((width//2 - title_width//2, title_padding), title, font=title_font, fill=(255, 255, 255))
+
+ title_text_image = T.ToTensor()(title_text_image).unsqueeze(0).permute([0,2,3,1]).to(out_image.device)
+ out_image = torch.cat([title_text_image, out_image], 1)
+ """
+
+ return (out_image, )
+
+class GuidanceTimestepping:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "model": ("MODEL",),
+ "value": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 100.0, "step": 0.05}),
+ "start_at": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.01}),
+ "end_at": ("FLOAT", {"default": 0.8, "min": 0.0, "max": 1.0, "step": 0.01}),
+ }
+ }
+
+ RETURN_TYPES = ("MODEL",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, model, value, start_at, end_at):
+ sigma_start = model.get_model_object("model_sampling").percent_to_sigma(start_at)
+ sigma_end = model.get_model_object("model_sampling").percent_to_sigma(end_at)
+
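+ # within the start_at..end_at sigma window the CFG scale is replaced with the given value; outside it the original cond_scale is kept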
+ def apply_apg(args):
+ cond = args["cond"]
+ uncond = args["uncond"]
+ cond_scale = args["cond_scale"]
+ sigma = args["sigma"]
+
+ sigma = sigma.detach().cpu()[0].item()
+
+ if sigma <= sigma_start and sigma > sigma_end:
+ cond_scale = value
+
+ return uncond + (cond - uncond) * cond_scale
+
+ m = model.clone()
+ m.set_model_sampler_cfg_function(apply_apg)
+ return (m,)
+
+class ModelSamplingDiscreteFlowCustom(torch.nn.Module):
+ def __init__(self, model_config=None):
+ super().__init__()
+ if model_config is not None:
+ sampling_settings = model_config.sampling_settings
+ else:
+ sampling_settings = {}
+
+ self.set_parameters(shift=sampling_settings.get("shift", 1.0), multiplier=sampling_settings.get("multiplier", 1000))
+
+ def set_parameters(self, shift=1.0, timesteps=1000, multiplier=1000, cut_off=1.0, shift_multiplier=0):
+ self.shift = shift
+ self.multiplier = multiplier
+ self.cut_off = cut_off
+ self.shift_multiplier = shift_multiplier
+ ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps) * multiplier)
+ self.register_buffer('sigmas', ts)
+
+ @property
+ def sigma_min(self):
+ return self.sigmas[0]
+
+ @property
+ def sigma_max(self):
+ return self.sigmas[-1]
+
+ def timestep(self, sigma):
+ return sigma * self.multiplier
+
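+ # for normalized timesteps at or below cut_off the shift is multiplied by shift_multiplier, altering the low-noise part of the schedule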
+ def sigma(self, timestep):
+ shift = self.shift
+ if timestep.dim() == 0:
+ t = timestep.cpu().item() / self.multiplier
+ if t <= self.cut_off:
+ shift = shift * self.shift_multiplier
+
+ return comfy.model_sampling.time_snr_shift(shift, timestep / self.multiplier)
+
+ def percent_to_sigma(self, percent):
+ if percent <= 0.0:
+ return 1.0
+ if percent >= 1.0:
+ return 0.0
+ return 1.0 - percent
+
+class ModelSamplingSD3Advanced:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required": { "model": ("MODEL",),
+ "shift": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 100.0, "step":0.01}),
+ "cut_off": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step":0.05}),
+ "shift_multiplier": ("FLOAT", {"default": 2, "min": 0, "max": 10, "step":0.05}),
+ }}
+
+ RETURN_TYPES = ("MODEL",)
+ FUNCTION = "execute"
+
+ CATEGORY = "essentials/sampling"
+
+ def execute(self, model, shift, multiplier=1000, cut_off=1.0, shift_multiplier=0):
+ m = model.clone()
+
+ sampling_base = ModelSamplingDiscreteFlowCustom
+ sampling_type = comfy.model_sampling.CONST
+
+ class ModelSamplingAdvanced(sampling_base, sampling_type):
+ pass
+
+ model_sampling = ModelSamplingAdvanced(model.model.model_config)
+ model_sampling.set_parameters(shift=shift, multiplier=multiplier, cut_off=cut_off, shift_multiplier=shift_multiplier)
+ m.add_object_patch("model_sampling", model_sampling)
+
+ return (m, )
+
+SAMPLING_CLASS_MAPPINGS = {
+ "KSamplerVariationsStochastic+": KSamplerVariationsStochastic,
+ "KSamplerVariationsWithNoise+": KSamplerVariationsWithNoise,
+ "InjectLatentNoise+": InjectLatentNoise,
+ "FluxSamplerParams+": FluxSamplerParams,
+ "GuidanceTimestepping+": GuidanceTimestepping,
+ "PlotParameters+": PlotParameters,
+ "TextEncodeForSamplerParams+": TextEncodeForSamplerParams,
+ "SamplerSelectHelper+": SamplerSelectHelper,
+ "SchedulerSelectHelper+": SchedulerSelectHelper,
+ "LorasForFluxParams+": LorasForFluxParams,
+ "ModelSamplingSD3Advanced+": ModelSamplingSD3Advanced,
+}
+
+SAMPLING_NAME_MAPPINGS = {
+ "KSamplerVariationsStochastic+": "🔧 KSampler Stochastic Variations",
+ "KSamplerVariationsWithNoise+": "🔧 KSampler Variations with Noise Injection",
+ "InjectLatentNoise+": "🔧 Inject Latent Noise",
+ "FluxSamplerParams+": "🔧 Flux Sampler Parameters",
+ "GuidanceTimestepping+": "🔧 Guidance Timestep (experimental)",
+ "PlotParameters+": "🔧 Plot Sampler Parameters",
+ "TextEncodeForSamplerParams+": "🔧Text Encode for Sampler Params",
+ "SamplerSelectHelper+": "🔧 Sampler Select Helper",
+ "SchedulerSelectHelper+": "🔧 Scheduler Select Helper",
+ "LorasForFluxParams+": "🔧 LoRA for Flux Parameters",
+ "ModelSamplingSD3Advanced+": "🔧 Model Sampling SD3 Advanced",
+}
\ No newline at end of file
diff --git a/ComfyUI_essentials/segmentation.py b/ComfyUI_essentials/segmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe6caacb11f31ccffcda3bdbb0d282dbe7989b06
--- /dev/null
+++ b/ComfyUI_essentials/segmentation.py
@@ -0,0 +1,89 @@
+import torch
+import torchvision.transforms.v2 as T
+import torch.nn.functional as F
+from .utils import expand_mask
+
+class LoadCLIPSegModels:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {},
+ }
+
+ RETURN_TYPES = ("CLIP_SEG",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/segmentation"
+
+ def execute(self):
+ from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
+ processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
+ model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
+
+ return ((processor, model),)
+
+class ApplyCLIPSeg:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "clip_seg": ("CLIP_SEG",),
+ "image": ("IMAGE",),
+ "prompt": ("STRING", { "multiline": False, "default": "" }),
+ "threshold": ("FLOAT", { "default": 0.4, "min": 0.0, "max": 1.0, "step": 0.05 }),
+ "smooth": ("INT", { "default": 9, "min": 0, "max": 32, "step": 1 }),
+ "dilate": ("INT", { "default": 0, "min": -32, "max": 32, "step": 1 }),
+ "blur": ("INT", { "default": 0, "min": 0, "max": 64, "step": 1 }),
+ },
+ }
+
+ RETURN_TYPES = ("MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/segmentation"
+
+ def execute(self, image, clip_seg, prompt, threshold, smooth, dilate, blur):
+ processor, model = clip_seg
+
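+ # run CLIPSeg once per image: the low-resolution logits are sigmoid-activated and thresholded into a boolean mask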
+ imagenp = image.mul(255).clamp(0, 255).byte().cpu().numpy()
+
+ outputs = []
+ for i in imagenp:
+ inputs = processor(text=prompt, images=[i], return_tensors="pt")
+ out = model(**inputs)
+ out = out.logits.unsqueeze(1)
+ out = torch.sigmoid(out[0][0])
+ out = (out > threshold)
+ outputs.append(out)
+
+ del imagenp
+
+ outputs = torch.stack(outputs, dim=0)
+
+ if smooth > 0:
+ if smooth % 2 == 0:
+ smooth += 1
+ outputs = T.functional.gaussian_blur(outputs, smooth)
+
+ outputs = outputs.float()
+
+ if dilate != 0:
+ outputs = expand_mask(outputs, dilate, True)
+
+ if blur > 0:
+ if blur % 2 == 0:
+ blur += 1
+ outputs = T.functional.gaussian_blur(outputs, blur)
+
+ # resize to original size
+ outputs = F.interpolate(outputs.unsqueeze(1), size=(image.shape[1], image.shape[2]), mode='bicubic').squeeze(1)
+
+ return (outputs,)
+
+SEG_CLASS_MAPPINGS = {
+ "ApplyCLIPSeg+": ApplyCLIPSeg,
+ "LoadCLIPSegModels+": LoadCLIPSegModels,
+}
+
+SEG_NAME_MAPPINGS = {
+ "ApplyCLIPSeg+": "🔧 Apply CLIPSeg",
+ "LoadCLIPSegModels+": "🔧 Load CLIPSeg Models",
+}
\ No newline at end of file
diff --git a/ComfyUI_essentials/text.py b/ComfyUI_essentials/text.py
new file mode 100644
index 0000000000000000000000000000000000000000..06c2549859dde3263a2762cbbdd7b82f430aedb7
--- /dev/null
+++ b/ComfyUI_essentials/text.py
@@ -0,0 +1,113 @@
+import os
+import torch
+from nodes import MAX_RESOLUTION
+import torchvision.transforms.v2 as T
+from .utils import FONTS_DIR
+
+class DrawText:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "text": ("STRING", { "multiline": True, "dynamicPrompts": True, "default": "Hello, World!" }),
+ "font": (sorted([f for f in os.listdir(FONTS_DIR) if f.endswith('.ttf') or f.endswith('.otf')]), ),
+ "size": ("INT", { "default": 56, "min": 1, "max": 9999, "step": 1 }),
+ "color": ("STRING", { "multiline": False, "default": "#FFFFFF" }),
+ "background_color": ("STRING", { "multiline": False, "default": "#00000000" }),
+ "shadow_distance": ("INT", { "default": 0, "min": 0, "max": 100, "step": 1 }),
+ "shadow_blur": ("INT", { "default": 0, "min": 0, "max": 100, "step": 1 }),
+ "shadow_color": ("STRING", { "multiline": False, "default": "#000000" }),
+ "horizontal_align": (["left", "center", "right"],),
+ "vertical_align": (["top", "center", "bottom"],),
+ "offset_x": ("INT", { "default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1 }),
+ "offset_y": ("INT", { "default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1 }),
+ "direction": (["ltr", "rtl"],),
+ },
+ "optional": {
+ "img_composite": ("IMAGE",),
+ },
+ }
+
+ RETURN_TYPES = ("IMAGE", "MASK",)
+ FUNCTION = "execute"
+ CATEGORY = "essentials/text"
+
+ def execute(self, text, font, size, color, background_color, shadow_distance, shadow_blur, shadow_color, horizontal_align, vertical_align, offset_x, offset_y, direction, img_composite=None):
+ from PIL import Image, ImageDraw, ImageFont, ImageColor, ImageFilter
+
+ font = ImageFont.truetype(os.path.join(FONTS_DIR, font), size)
+
+ lines = text.split("\n")
+ if direction == "rtl":
+ lines = [line[::-1] for line in lines]
+
+ # Calculate the width and height of the text
+ text_width = max(font.getbbox(line)[2] for line in lines)
+ line_height = font.getmask(text).getbbox()[3] + font.getmetrics()[1] # add descent to height
+ text_height = line_height * len(lines)
+
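+        # size the canvas to the composite image if one is provided, otherwise fit it to the text (leaving room for the shadow)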
+ if img_composite is not None:
+ img_composite = T.ToPILImage()(img_composite.permute([0,3,1,2])[0]).convert('RGBA')
+ width = img_composite.width
+ height = img_composite.height
+ image = Image.new('RGBA', (width, height), color=background_color)
+ else:
+ width = text_width
+ height = text_height
+ background_color = ImageColor.getrgb(background_color)
+ image = Image.new('RGBA', (width + shadow_distance, height + shadow_distance), color=background_color)
+
+ image_shadow = None
+ if shadow_distance > 0:
+ image_shadow = image.copy()
+ #image_shadow = Image.new('RGBA', (width + shadow_distance, height + shadow_distance), color=background_color)
+
+ for i, line in enumerate(lines):
+ line_width = font.getbbox(line)[2]
+ #text_height =font.getbbox(line)[3]
+ if horizontal_align == "left":
+ x = 0
+ elif horizontal_align == "center":
+ x = (width - line_width) / 2
+ elif horizontal_align == "right":
+ x = width - line_width
+
+ if vertical_align == "top":
+ y = 0
+ elif vertical_align == "center":
+ y = (height - text_height) / 2
+ elif vertical_align == "bottom":
+ y = height - text_height
+
+ x += offset_x
+ y += i * line_height + offset_y
+
+ draw = ImageDraw.Draw(image)
+ draw.text((x, y), line, font=font, fill=color)
+
+ if image_shadow is not None:
+ draw = ImageDraw.Draw(image_shadow)
+ draw.text((x + shadow_distance, y + shadow_distance), line, font=font, fill=shadow_color)
+
+ if image_shadow is not None:
+ image_shadow = image_shadow.filter(ImageFilter.GaussianBlur(shadow_blur))
+ image = Image.alpha_composite(image_shadow, image)
+
+ #image = T.ToTensor()(image).unsqueeze(0).permute([0,2,3,1])
+ mask = T.ToTensor()(image).unsqueeze(0).permute([0,2,3,1])
+ mask = mask[:, :, :, 3] if mask.shape[3] == 4 else torch.ones_like(mask[:, :, :, 0])
+
+ if img_composite is not None:
+ image = Image.alpha_composite(img_composite, image)
+
+ image = T.ToTensor()(image).unsqueeze(0).permute([0,2,3,1])
+
+ return (image[:, :, :, :3], mask,)
+
+TEXT_CLASS_MAPPINGS = {
+ "DrawText+": DrawText,
+}
+
+TEXT_NAME_MAPPINGS = {
+ "DrawText+": "🔧 Draw Text",
+}
\ No newline at end of file
diff --git a/ComfyUI_essentials/utils.py b/ComfyUI_essentials/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6c89d02b1ffc4ec7526edb9743b9c11e7a3d26
--- /dev/null
+++ b/ComfyUI_essentials/utils.py
@@ -0,0 +1,89 @@
+import torch
+import numpy as np
+import scipy
+import os
+#import re
+from pathlib import Path
+import folder_paths
+
+FONTS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fonts")
+
+SCRIPT_DIR = Path(__file__).parent
+folder_paths.add_model_folder_path("luts", (SCRIPT_DIR / "luts").as_posix())
+folder_paths.add_model_folder_path(
+ "luts", (Path(folder_paths.models_dir) / "luts").as_posix()
+)
+
+# from https://github.com/pythongosssss/ComfyUI-Custom-Scripts
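+# a str subclass whose `!=` always returns False, so it passes ComfyUI's type checks against any link type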
+class AnyType(str):
+ def __ne__(self, __value: object) -> bool:
+ return False
+
+def min_(tensor_list):
+ # return the element-wise min of the tensor list.
+ x = torch.stack(tensor_list)
+ mn = x.min(axis=0)[0]
+ return torch.clamp(mn, min=0)
+
+def max_(tensor_list):
+ # return the element-wise max of the tensor list.
+ x = torch.stack(tensor_list)
+ mx = x.max(axis=0)[0]
+ return torch.clamp(mx, max=1)
+
+def expand_mask(mask, expand, tapered_corners):
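+    # grow (expand > 0) or shrink (expand < 0) the mask by |expand| pixels using grey dilation/erosion; tapered_corners uses a cross-shaped kernel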
+ c = 0 if tapered_corners else 1
+ kernel = np.array([[c, 1, c],
+ [1, 1, 1],
+ [c, 1, c]])
+ mask = mask.reshape((-1, mask.shape[-2], mask.shape[-1]))
+ out = []
+ for m in mask:
+ output = m.numpy()
+ for _ in range(abs(expand)):
+ if expand < 0:
+ output = scipy.ndimage.grey_erosion(output, footprint=kernel)
+ else:
+ output = scipy.ndimage.grey_dilation(output, footprint=kernel)
+ output = torch.from_numpy(output)
+ out.append(output)
+
+ return torch.stack(out, dim=0)
+
+def parse_string_to_list(s):
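+    # parse a comma-separated string of numbers; 'start...end+step' expands into a range, keeping the step's number of decimal places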
+ elements = s.split(',')
+ result = []
+
+ def parse_number(s):
+ try:
+ if '.' in s:
+ return float(s)
+ else:
+ return int(s)
+ except ValueError:
+ return 0
+
+ def decimal_places(s):
+ if '.' in s:
+ return len(s.split('.')[1])
+ return 0
+
+ for element in elements:
+ element = element.strip()
+ if '...' in element:
+ start, rest = element.split('...')
+ end, step = rest.split('+')
+ decimals = decimal_places(step)
+ start = parse_number(start)
+ end = parse_number(end)
+ step = parse_number(step)
+ current = start
+ if (start > end and step > 0) or (start < end and step < 0):
+ step = -step
+ while current <= end:
+ result.append(round(current, decimals))
+ current += step
+ else:
+ result.append(round(parse_number(element), decimal_places(element)))
+
+ return result
\ No newline at end of file
diff --git a/ComfyUI_essentials/workflow_all_nodes.json b/ComfyUI_essentials/workflow_all_nodes.json
new file mode 100644
index 0000000000000000000000000000000000000000..b23059ebb61d5b716a079bea2e663a5e5e04f38e
--- /dev/null
+++ b/ComfyUI_essentials/workflow_all_nodes.json
@@ -0,0 +1,994 @@
+{
+ "last_node_id": 42,
+ "last_link_id": 61,
+ "nodes": [
+ {
+ "id": 9,
+ "type": "ConsoleDebug+",
+ "pos": [
+ 720,
+ 140
+ ],
+ "size": {
+ "0": 210,
+ "1": 60
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "value",
+ "type": "*",
+ "link": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ConsoleDebug+"
+ },
+ "widgets_values": [
+ "Height:"
+ ]
+ },
+ {
+ "id": 28,
+ "type": "PreviewImage",
+ "pos": [
+ 860,
+ 1180
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 23
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 12,
+ "type": "PreviewImage",
+ "pos": [
+ 860,
+ 580
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 11
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 14,
+ "type": "PreviewImage",
+ "pos": [
+ 860,
+ 880
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 13
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 18,
+ "type": "MaskPreview+",
+ "pos": [
+ 2100,
+ 90
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 19
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskPreview+"
+ }
+ },
+ {
+ "id": 1,
+ "type": "GetImageSize+",
+ "pos": [
+ 450,
+ 80
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 1
+ }
+ ],
+ "outputs": [
+ {
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 2
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 3
+ ],
+ "shape": 3,
+ "slot_index": 1
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize+"
+ }
+ },
+ {
+ "id": 8,
+ "type": "ConsoleDebug+",
+ "pos": [
+ 720,
+ 40
+ ],
+ "size": {
+ "0": 210,
+ "1": 60
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "value",
+ "type": "*",
+ "link": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ConsoleDebug+"
+ },
+ "widgets_values": [
+ "Width:"
+ ]
+ },
+ {
+ "id": 10,
+ "type": "PreviewImage",
+ "pos": [
+ 860,
+ 280
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 9
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 36,
+ "type": "SimpleMath+",
+ "pos": [
+ 1650,
+ 780
+ ],
+ "size": {
+ "0": 210,
+ "1": 80
+ },
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "a",
+ "type": "INT,FLOAT",
+ "link": 44
+ },
+ {
+ "name": "b",
+ "type": "INT,FLOAT",
+ "link": 45
+ }
+ ],
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 46
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SimpleMath+"
+ },
+ "widgets_values": [
+ "a*b"
+ ]
+ },
+ {
+ "id": 23,
+ "type": "ConsoleDebug+",
+ "pos": [
+ 1920,
+ 780
+ ],
+ "size": {
+ "0": 210,
+ "1": 60
+ },
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "value",
+ "type": "*",
+ "link": 46
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ConsoleDebug+"
+ },
+ "widgets_values": [
+ "Value:"
+ ]
+ },
+ {
+ "id": 2,
+ "type": "ImageResize+",
+ "pos": [
+ 430,
+ 340
+ ],
+ "size": {
+ "0": 310,
+ "1": 170
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 4
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 9
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 44
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 45
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageResize+"
+ },
+ "widgets_values": [
+ 256,
+ 64,
+ "lanczos",
+ true
+ ]
+ },
+ {
+ "id": 4,
+ "type": "ImageFlip+",
+ "pos": [
+ 430,
+ 800
+ ],
+ "size": {
+ "0": 310,
+ "1": 60
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 6
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 11
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageFlip+"
+ },
+ "widgets_values": [
+ "xy"
+ ]
+ },
+ {
+ "id": 6,
+ "type": "ImagePosterize+",
+ "pos": [
+ 430,
+ 1000
+ ],
+ "size": {
+ "0": 310,
+ "1": 60
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 13
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImagePosterize+"
+ },
+ "widgets_values": [
+ 0.5
+ ]
+ },
+ {
+ "id": 27,
+ "type": "ImageCASharpening+",
+ "pos": [
+ 430,
+ 1110
+ ],
+ "size": {
+ "0": 310.79998779296875,
+ "1": 60
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 22
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 23
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageCASharpening+"
+ },
+ "widgets_values": [
+ 0.8
+ ]
+ },
+ {
+ "id": 15,
+ "type": "MaskBlur+",
+ "pos": [
+ 1690,
+ 130
+ ],
+ "size": {
+ "0": 310,
+ "1": 82
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 14
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 19
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskBlur+"
+ },
+ "widgets_values": [
+ 45,
+ 28.5
+ ]
+ },
+ {
+ "id": 16,
+ "type": "MaskFlip+",
+ "pos": [
+ 1690,
+ 270
+ ],
+ "size": {
+ "0": 310,
+ "1": 60
+ },
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 15
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 18
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskFlip+"
+ },
+ "widgets_values": [
+ "xy"
+ ]
+ },
+ {
+ "id": 13,
+ "type": "PreviewImage",
+ "pos": [
+ 1100,
+ 760
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 49
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 37,
+ "type": "ImageDesaturate+",
+ "pos": [
+ 500,
+ 920
+ ],
+ "size": {
+ "0": 190,
+ "1": 30
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 48
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 49
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageDesaturate+"
+ }
+ },
+ {
+ "id": 7,
+ "type": "LoadImage",
+ "pos": [
+ -90,
+ 650
+ ],
+ "size": {
+ "0": 315,
+ "1": 314
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 1,
+ 4,
+ 6,
+ 8,
+ 22,
+ 48,
+ 57
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "venere.jpg",
+ "image"
+ ]
+ },
+ {
+ "id": 11,
+ "type": "PreviewImage",
+ "pos": [
+ 1100,
+ 450
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 58
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 40,
+ "type": "ImageCrop+",
+ "pos": [
+ 430,
+ 560
+ ],
+ "size": {
+ "0": 310,
+ "1": 194
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 57
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 58
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "x",
+ "type": "INT",
+ "links": null,
+ "shape": 3
+ },
+ {
+ "name": "y",
+ "type": "INT",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageCrop+"
+ },
+ "widgets_values": [
+ 256,
+ 256,
+ "center",
+ 0,
+ 0
+ ]
+ },
+ {
+ "id": 20,
+ "type": "LoadImageMask",
+ "pos": [
+ 1400,
+ 260
+ ],
+ "size": {
+ "0": 220.70516967773438,
+ "1": 318
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 14,
+ 15
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImageMask"
+ },
+ "widgets_values": [
+ "cwf_inpaint_example_mask.png",
+ "alpha",
+ "image"
+ ]
+ },
+ {
+ "id": 21,
+ "type": "MaskPreview+",
+ "pos": [
+ 2100,
+ 380
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 18
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskPreview+"
+ }
+ }
+ ],
+ "links": [
+ [
+ 1,
+ 7,
+ 0,
+ 1,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 2,
+ 1,
+ 0,
+ 8,
+ 0,
+ "*"
+ ],
+ [
+ 3,
+ 1,
+ 1,
+ 9,
+ 0,
+ "*"
+ ],
+ [
+ 4,
+ 7,
+ 0,
+ 2,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 6,
+ 7,
+ 0,
+ 4,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 8,
+ 7,
+ 0,
+ 6,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 9,
+ 2,
+ 0,
+ 10,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 11,
+ 4,
+ 0,
+ 12,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 13,
+ 6,
+ 0,
+ 14,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 14,
+ 20,
+ 0,
+ 15,
+ 0,
+ "MASK"
+ ],
+ [
+ 15,
+ 20,
+ 0,
+ 16,
+ 0,
+ "MASK"
+ ],
+ [
+ 18,
+ 16,
+ 0,
+ 21,
+ 0,
+ "MASK"
+ ],
+ [
+ 19,
+ 15,
+ 0,
+ 18,
+ 0,
+ "MASK"
+ ],
+ [
+ 22,
+ 7,
+ 0,
+ 27,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 23,
+ 27,
+ 0,
+ 28,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 44,
+ 2,
+ 1,
+ 36,
+ 0,
+ "INT,FLOAT"
+ ],
+ [
+ 45,
+ 2,
+ 2,
+ 36,
+ 1,
+ "INT,FLOAT"
+ ],
+ [
+ 46,
+ 36,
+ 0,
+ 23,
+ 0,
+ "*"
+ ],
+ [
+ 48,
+ 7,
+ 0,
+ 37,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 49,
+ 37,
+ 0,
+ 13,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 57,
+ 7,
+ 0,
+ 40,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 58,
+ 40,
+ 0,
+ 11,
+ 0,
+ "IMAGE"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/__pycache__/websocket_image_save.cpython-312.pyc b/__pycache__/websocket_image_save.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b411aa51a8413537778a0b5b31591620ba17151
Binary files /dev/null and b/__pycache__/websocket_image_save.cpython-312.pyc differ
diff --git a/cg-use-everywhere/.gitattributes b/cg-use-everywhere/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..f13e053bf0ebf99d69b8e28c0f02eb346dcfe15e
--- /dev/null
+++ b/cg-use-everywhere/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/cg-use-everywhere/.github/workflows/publish_action.yml b/cg-use-everywhere/.github/workflows/publish_action.yml
new file mode 100644
index 0000000000000000000000000000000000000000..abda881760b91dcbe7c8cfd5b347f452938537ad
--- /dev/null
+++ b/cg-use-everywhere/.github/workflows/publish_action.yml
@@ -0,0 +1,20 @@
+name: Publish to Comfy registry
+on:
+ workflow_dispatch:
+ push:
+ branches:
+ - main
+ paths:
+ - "pyproject.toml"
+
+jobs:
+ publish-node:
+ name: Publish Custom Node to registry
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out code
+ uses: actions/checkout@v4
+ - name: Publish Custom Node
+ uses: Comfy-Org/publish-node-action@main
+ with:
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} ## Add your own personal access token to your Github Repository secrets and reference it here.
\ No newline at end of file
diff --git a/cg-use-everywhere/.gitignore b/cg-use-everywhere/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..7d3fc697da9414cdb86778980808463c73175870
--- /dev/null
+++ b/cg-use-everywhere/.gitignore
@@ -0,0 +1,155 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+.DS_Store
+workflow.pastel.json
+workflow.pfixed.json
diff --git a/cg-use-everywhere/LICENSE b/cg-use-everywhere/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..deeea2d8ccdb1354f351a6ea02ed456849d51422
--- /dev/null
+++ b/cg-use-everywhere/LICENSE
@@ -0,0 +1,201 @@
+Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/cg-use-everywhere/README.md b/cg-use-everywhere/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d58eb419006c147e1a80060a489552c3dc327c69
--- /dev/null
+++ b/cg-use-everywhere/README.md
@@ -0,0 +1,259 @@
+# UE Nodes
+
+Love this node? [Buy me a coffee!](https://www.buymeacoffee.com/chrisgoringe)
+
+Getting started? Download the test workflow below and see how it works.
+
+Problems? Jump down to [logging and debugging](https://github.com/chrisgoringe/cg-use-everywhere/blob/main/README.md#loggingdebugging)
+
+Ideas for how to improve the nodes (or bug reports) - [raise an issue](https://github.com/chrisgoringe/cg-use-everywhere/issues)
+
+Shameless plug for my other nodes -> Check out [Image Picker](https://github.com/chrisgoringe/cg-image-picker) for another way to make some workflows smoother. And leave a star if you like something!
+
+---
+
+## Test workflow
+
+|This workflow uses all five nodes, and can be used to test (and understand!) the nodes. You wouldn't build it like this, it's just an example...|Here's an image with the workflow in|
+|-|-|
+|![screen](docs/test-workflow-screenshot.png)|![image](docs/test-workflow.png)|
+
+Or [the workflow as json](docs/test-workflow.json)
+
+## Current known limitations
+
+There are some situations that UE nodes can't cope with at present. Here are some I know about, and possible workarounds.
+
+### Pythonssss Preset Text
+
+[pythonsssss](https://github.com/pythongosssss/ComfyUI-Custom-Scripts) custom nodes are great, but there are some limitations in using them with UE nodes. In particular, you can't feed the output of a Preset Text node directly into a UE node (see https://github.com/chrisgoringe/cg-use-everywhere/issues/154).
+
+### Group nodes
+
+UE nodes mostly work with group nodes. But there are a couple of important things to note:
+
+- when you create a group node, the input names and node names can change. This might break `Anything Everywhere?` regex connections.
+
+## Latest updates
+
+5.0 (6th August 2024)
+- Significant change to core logic. Should greatly reduce incompatibility issues, but may have unexpected consequences!
+- Added to right-click menu on nodes the option to make a node reject all UE connections.
+
+4.9 (2nd May 2024)
+- Fix incompatibility with Efficiency Nodes (#182)
+
+4.8 (18th March 2024)
+- Group and color sending have a `send to unmatched` mode
+- UE link animations can be the classic dots, or a pulsing glow (or both, or neither)
+- Show UE links can now be on, off, mouseover, selected nodes, or mouseover and selected nodes
+
+4.7 (1st March 2024)
+- UE now works in group nodes
+- Autocomplete on `Anything Everywhere?` nodes
+
+4.6
+- add Group Regex to `Anything Everywhere?` node
+- if you have workflow json files saved that now don't work, try 'workflow_fixer.py'
+
+4.5
+- add support for Comfy UI Group Nodes (UE nodes can be used to connect to group node inputs and outputs, but not within a group node)
+- add `convert to real links`
+
+4.4
+- add (limited) support for converting regex in the `Anything Everywhere?` node with inputs (only works if the link is from a node that is a simple string source)
+
+4.3
+- added support for targeting [Highway nodes](https://github.com/chrisgoringe/cg-use-everywhere#highway-nodes)
+
+4.2
+- improved performance of loop detection, especially with [highway nodes](https://github.com/Trung0246/ComfyUI-0246)
+- updated docs to not use other custom nodes in examples
+
+4.1.2
+- tweaks to improve handling of bypass
+- fixed connecting to Seed Everywhere
+
+4.1.1
+- added option to turn animation off
+
+4.1
+
+- added [loop detection](https://github.com/chrisgoringe/cg-use-everywhere#loop-checking)
+- added [group restriction](https://github.com/chrisgoringe/cg-use-everywhere#group-restriction).
+
+The v1 nodes have been fully removed. If you were using one, you can just replace it with an `Anything Everywhere` node.
+
+## Installing
+
+Use Comfy Manager. If you really want to do it manually, just clone this repository in your custom_nodes directory.
+
+## Anything Everywhere (start here!)
+
+The `Anything Everywhere` node has a single input, initially labelled 'anything'. Connect anything to it (directly - not via a reroute), and the input name changes to match the input type. Disconnect and it goes back to 'anything'.
+
+When you run the prompt, any unconnected input, anywhere in the workflow, which matches that type, will act as if it were connected to the same input.
+
+To visualise what it's being connected to, right-click on the background canvas and select `Toggle UE Link Visibility`.
+
+## Anything Everywhere? - control matching with regex rules
+
+This node adds two widgets - title_regex and input_regex. It will only send to inputs which match. So in the example, title_regex is 'Preview' so the image is sent to the Preview Image node but not the Save Image node. Note that you can rename node and input titles, which can help!
+
+(From 4.6 you can also specify a group regex to only match inputs on nodes which are in groups that match the regex.)
+
+![regex](docs/regex.png)
+
+*The matches are regular expressions, not string matches.* Most simple strings will work (matching any part of the title or input name), but some characters have special meanings (including various sorts of brackets, ^, $, /, and . in particular) so just avoid them if you aren't regex-inclined.
+
+Using regex means you can use `^prompt` to match `prompt` at the beginning of the title only, to avoid matching `negative_prompt`.
+
+Regex 101 - `^` means 'the start', `$` means 'the end', `.` matches any single character, `.*` matches anything of any length (including zero). For more than that, visit [regex101](https://regex101.com/) (the flavour you want is ECMAScript, though that probably won't matter).
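+
+If you want to sanity-check a pattern before typing it into the node, you can try it outside ComfyUI first. The matching itself runs in the browser with ECMAScript regular expressions, but for simple patterns like these Python's `re` module behaves the same way - the snippet below is just an illustrative sketch, not code from this extension:
+
+```python
+import re
+
+# '^prompt' only matches names that start with 'prompt',
+# so 'negative_prompt' is left alone
+pattern = re.compile(r"^prompt")
+
+for name in ["prompt", "negative_prompt", "prompt_2"]:
+    print(name, bool(pattern.search(name)))
+# prompt True
+# negative_prompt False
+# prompt_2 True
+```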
+
+### Can I make the regex an input instead of a widget?
+
+Sort of.
+
+Because the regex needs to be known before the workflow is submitted (in order to calculate the links), you can't pass a string into the `Anything Everywhere?` node and expect it to work. The *only* thing that is supported is if the input comes *directly* from a node which sets it with a string widget. The `Simple String` node that is included in this pack will work.
+
+|This works|This doesn't. And never will.|
+|-|-|
+|![Alt text](docs/image.png)|![no](docs/imagex.png)|
+
+
+## Seed Everywhere
+
+Seed Everywhere connects to any unconnected INT input with `seed` in the input name (seed, noise_seed, etc), and it has the control_after_generate feature. So if you convert the seed widgets to inputs you can use the same seed everywhere.
+
+## Anything Everywhere3 - One node, three inputs.
+
+Really just three `Anything Everywhere` nodes packaged together. Designed for the outputs of Checkpoint Loader.
+
+![UE3](docs/UE3.png)
+
+## Prompts Everywhere - two strings or conditionings
+
+Prompts Everywhere has two inputs. Whatever is connected to them is sent using the regex matching rules `(^prompt|^positive)` and `neg` respectively, which should match the various names used for prompts and negative prompts (or conditionings). The sketch after the table below illustrates what those patterns match.
+
+|strings|conditionings|
+|-|-|
+|![pe](docs/PE.png)|![pe](docs/conditioning.png)
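+
+As a rough illustration of which input names those two default patterns pick up (again a Python sketch of the browser-side matching, not code from this extension):
+
+```python
+import re
+
+positive = re.compile(r"(^prompt|^positive)")
+negative = re.compile(r"neg")
+
+for name in ["prompt", "positive", "negative", "neg_conditioning", "text_g"]:
+    print(name, bool(positive.search(name)), bool(negative.search(name)))
+# prompt True False
+# positive True False
+# negative False True
+# neg_conditioning False True
+# text_g False False
+```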
+
+# Primitives and COMBOs and the like
+
+UE nodes don't work with primitives and COMBOs (the data type used for dropdown lists, which are also a type of primitive within Comfy). It's unlikely they ever will.
+
+If you want to use UE to control sampler or sigma, you can do this with the built in `SamplerCustom` nodes:
+
+![sample and sigma](docs/sampler%20and%20sigma.png)
+
+For more on this, see [this discussion](https://github.com/chrisgoringe/cg-use-everywhere/issues/69)
+
+# Other features
+
+## Reject links
+
+Right click on a node and you can set it to reject UE links
+
+## Show links - visualisation and animation.
+
+If you want to see the UE links, you can turn them on and off by right-clicking on the canvas. For finer control, the main settings menu has options to show links when the mouse moves over the node at either end, or when one of those nodes is selected.
+
+The links can be animated to distinguish them from normal links - this animation can take the form of moving dots, a pulsing glow, or both. This may impact performance in some cases - note that the pulse animation requires less processing than the moving dots. Control this in the main settings menu.
+
+By default the animations turn off when the workflow is running to minimise impact on CPU/GPU - you can change this in the settings too.
+
+## Convert to real links
+
+If you want to share a workflow without UE nodes being required, or to save an API version of a workflow, you can replace the virtual links created by UE nodes with real links (and remove the UE nodes).
+
+This can be done for a single node by right-clicking on it and selecting `Convert to real links`, or for all UE nodes in a workflow by right-clicking the background and selecting `Convert all UEs to real links`.
+
+## Shift drag
+
+Shift-click on a node output and drag, then release, to get an autocreate menu. This replaces the default behaviour (which gives you a search box), so you can disable it with the `Anything Everywhere replace search` setting.
+
+![auto](docs/auto.gif)
+
+## Group and color restriction
+
+UE nodes can be restricted to send only to nodes of the same color, or only to nodes that *aren't* the same color.
+
+They can also be restricted to send only to nodes in the same group (any group in common), or only to nodes that aren't in the same group.
+
+Right-click on the node and select `Group restrictions` or `Color restrictions`. UE nodes which are restricted (in either or both ways) have a green circle in the top-left corner.
+
+## Highway nodes
+
+Trung 0246's [Highway nodes](https://github.com/Trung0246/ComfyUI-0246) are a pretty cool way of piping data around. You can target them with an `Anything Everywhere?` node by using an `input_regex` which matches the unconnected input name with the '+', like this:
+![highway](docs/highway.png)
+
+This is new, so please report any issues!
+
+## Loop checking
+
+By default workflows are checked for loops before they are submitted (because UE can introduce them, and a loop results in a bad python outcome). If a loop is detected you'll get a JavaScript warning showing you the node ids involved. However, especially if there are other custom nodes involved, it's possible that the check will miss a loop, or flag one that isn't real.
+
+If you get a warning and don't believe there is a loop (having checked the node ids listed!) you can turn loop checking off in the main settings menu. If something flagged as a loop runs fine, please [raise an issue](https://github.com/chrisgoringe/cg-use-everywhere/issues) and include the workflow in the report (save the json and zip it, because GitHub doesn't accept .json files). Likewise if a loop doesn't get caught.
+
+I've written code for the core Comfy backend to catch loops, maybe it'll be included - [PR for ComfyUI](https://github.com/comfyanonymous/ComfyUI/pull/1652) - or maybe they have another plan.
+
+## Priorities
+
+If there is more than one sending node that matches an input, the basic rule is that the more specific node wins. The order of priorities is:
+
+- `Anything Everywhere?`
+- `Seed Everywhere` and `Prompts Everywhere`
+- `Anything Everywhere`
+- `Anything Everywhere3`
+
+For nodes of the same type, those with colour or group restrictions are prioritised (colour+group > colour > group > none).
+
+If two nodes with the same priority both match, *neither will connect* - better to fail fast than have an ambiguous outcome. If there are ambiguous matches, you can display them using `Show UE broadcast clashes` (right-click on the background - the option only appears if there are clashes).
+
+## See what is sent
+
+The nodes which only have one output can also gain a text box showing exactly what passed through the node. You need to turn this on if you want it - it's in the main settings, 'Anything Everywhere node details'.
+
+## Logging/Debugging
+
+The JavaScript console (press f12 in some browsers) has logging information about what is being connected. You can change the level of detail by finding the file `[comfy_install]/custom_nodes/cg-use-everywhere/js/use_everywhre_utilities.js` and near the top finding this bit:
+```javascript
+ static ERROR = 0; // actual errors
+ static PROBLEM = 1; // things that stop the workflow working
+ static INFORMATION = 2; // record of good things
+ static DETAIL = 3; // details
+
+ static LEVEL = Logger.PROBLEM;
+ static TRACE = false; // most of the method calls
+```
+Change the `LEVEL` to `Logger.INFORMATION` for more, or `Logger.DETAIL` for even more; set `TRACE` to `true` for some other debugging information.
+
+If you have a problem, pressing f12 to see the JavaScript console can often help. The following steps are really helpful in making a good bug report:
+
+- update to the latest version
+- restart ComfyUI
+- clear the canvas
+- close the browser
+- open a new Comfy window (with no workflow), look in console (f12) to see if there were any errors as ComfyUI started up
+- load your workflow, and look again
+- run, and look again
+
+The other thing worth trying is clearing out all the custom node javascript from where it gets copied when ComfyUI starts:
+
+- stop Comfy
+- go to [comfy root]/web/extensions (*not* under custom_nodes)
+- remove everything there EXCEPT for `core`. Leave `core` (it's ComfyUI stuff)
+- restart Comfy (all custom nodes will reinstall their javascript at startup)
+
+If you find a bug, please [raise an issue](https://github.com/chrisgoringe/cg-use-everywhere/issues) - if you can include the workflow, that's a huge help (you'll need to save it as .txt, or zip the .json file, because GitHub doesn't accept .json).
+
+## Cautions
+
+Bypassing and disabling nodes works, but with one catch. If you have a UE node that does matching (`Anything Everywhere?` or `Prompts Everywhere`) and you bypass the node it matches to, the link won't be made. So:
+
+|If you use a ? node to send to a node...|...and bypass the recipient, it doesn't get connected |
+|-|-|
+|![1](docs/bypass_catch1.png)|![2](docs/bypass_catch2.png)|
+
+This is unlikely to be fixed, but should be fairly easy to avoid!
diff --git a/cg-use-everywhere/__init__.py b/cg-use-everywhere/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c5afa1d7cae7bb4ef5359bf566e6180f2b6e2bf
--- /dev/null
+++ b/cg-use-everywhere/__init__.py
@@ -0,0 +1,30 @@
+from .use_everywhere import SeedEverywhere, AnythingEverywherePrompts
+
+UE_VERSION = "5.0.6"
+
+NODE_CLASS_MAPPINGS = { "Seed Everywhere": SeedEverywhere }
+
+from .use_everywhere import AnythingEverywhere, AnythingSomewhere, AnythingEverywhereTriplet, SimpleString
+NODE_CLASS_MAPPINGS["Anything Everywhere"] = AnythingEverywhere
+NODE_CLASS_MAPPINGS["Anything Everywhere3"] = AnythingEverywhereTriplet
+NODE_CLASS_MAPPINGS["Anything Everywhere?"] = AnythingSomewhere
+NODE_CLASS_MAPPINGS["Prompts Everywhere"] = AnythingEverywherePrompts
+NODE_CLASS_MAPPINGS["Simple String"] = SimpleString
+
+import os, shutil
+import folder_paths
+
+# temporary code to remove old javascript installs
+module_js_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "js")
+application_root_directory = os.path.dirname(folder_paths.__file__)
+old_code_location = os.path.join(application_root_directory, "web", "extensions", "use_everywhere")
+if os.path.exists(old_code_location):
+ shutil.rmtree(old_code_location)
+
+old_code_location = os.path.join(application_root_directory, "web", "extensions", "cg-nodes", "use_everywhere.js")
+if os.path.exists(old_code_location):
+ os.remove(old_code_location)
+# end of temporary code
+
+WEB_DIRECTORY = "./js"
+__all__ = ["NODE_CLASS_MAPPINGS", "WEB_DIRECTORY"]
diff --git a/cg-use-everywhere/__pycache__/__init__.cpython-312.pyc b/cg-use-everywhere/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..142257e6c1080bbf992e13089c1f5e378caeb0ea
Binary files /dev/null and b/cg-use-everywhere/__pycache__/__init__.cpython-312.pyc differ
diff --git a/cg-use-everywhere/__pycache__/use_everywhere.cpython-312.pyc b/cg-use-everywhere/__pycache__/use_everywhere.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6afd6b40deb60283b5ff9f8c7691dd4e88d943c2
Binary files /dev/null and b/cg-use-everywhere/__pycache__/use_everywhere.cpython-312.pyc differ
diff --git a/cg-use-everywhere/docs/ComfyUI_temp_zbfdv_00012_.png b/cg-use-everywhere/docs/ComfyUI_temp_zbfdv_00012_.png
new file mode 100644
index 0000000000000000000000000000000000000000..cecee2f81404336daea74a67dd58a9716bbe7507
Binary files /dev/null and b/cg-use-everywhere/docs/ComfyUI_temp_zbfdv_00012_.png differ
diff --git a/cg-use-everywhere/docs/PE.png b/cg-use-everywhere/docs/PE.png
new file mode 100644
index 0000000000000000000000000000000000000000..73b200e98ebb44a815e6a9df1a793065831639fe
Binary files /dev/null and b/cg-use-everywhere/docs/PE.png differ
diff --git a/cg-use-everywhere/docs/UE3.png b/cg-use-everywhere/docs/UE3.png
new file mode 100644
index 0000000000000000000000000000000000000000..2a02f0299638794c063611cc3cfef2c7e941a92d
Binary files /dev/null and b/cg-use-everywhere/docs/UE3.png differ
diff --git a/cg-use-everywhere/docs/UEQ.png b/cg-use-everywhere/docs/UEQ.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f418043cfc1402cbfebaa2081c34e301d44d887
Binary files /dev/null and b/cg-use-everywhere/docs/UEQ.png differ
diff --git a/cg-use-everywhere/docs/UEQportrait.png b/cg-use-everywhere/docs/UEQportrait.png
new file mode 100644
index 0000000000000000000000000000000000000000..29ee87ba6e7f2c8a73e9ea2eea5ec03a74e2e0fc
Binary files /dev/null and b/cg-use-everywhere/docs/UEQportrait.png differ
diff --git a/cg-use-everywhere/docs/auto.gif b/cg-use-everywhere/docs/auto.gif
new file mode 100644
index 0000000000000000000000000000000000000000..83fe03d493e3f2b45ee5cac2ab84d595ab3234e3
Binary files /dev/null and b/cg-use-everywhere/docs/auto.gif differ
diff --git a/cg-use-everywhere/docs/bypass_catch1.png b/cg-use-everywhere/docs/bypass_catch1.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0a1c5b9b1bc7f1c844267c3ecdf18168eed17a1
Binary files /dev/null and b/cg-use-everywhere/docs/bypass_catch1.png differ
diff --git a/cg-use-everywhere/docs/bypass_catch2.png b/cg-use-everywhere/docs/bypass_catch2.png
new file mode 100644
index 0000000000000000000000000000000000000000..526f500fb52ceffe3aa2f523281c601a4f090a0f
Binary files /dev/null and b/cg-use-everywhere/docs/bypass_catch2.png differ
diff --git a/cg-use-everywhere/docs/clashes.png b/cg-use-everywhere/docs/clashes.png
new file mode 100644
index 0000000000000000000000000000000000000000..150d749dbded4fb2c179f792993c99d755d69f3d
Binary files /dev/null and b/cg-use-everywhere/docs/clashes.png differ
diff --git a/cg-use-everywhere/docs/conditioning.png b/cg-use-everywhere/docs/conditioning.png
new file mode 100644
index 0000000000000000000000000000000000000000..9fe8e19f094fdf7d5f2527b045f5a2c9cdb63498
Binary files /dev/null and b/cg-use-everywhere/docs/conditioning.png differ
diff --git a/cg-use-everywhere/docs/connected.png b/cg-use-everywhere/docs/connected.png
new file mode 100644
index 0000000000000000000000000000000000000000..c2188a8bd53feb2b2e3c2e635c00af8e8988ffe1
Binary files /dev/null and b/cg-use-everywhere/docs/connected.png differ
diff --git a/cg-use-everywhere/docs/connection-ui.png b/cg-use-everywhere/docs/connection-ui.png
new file mode 100644
index 0000000000000000000000000000000000000000..7231ef9d70146e80359373c74a0b713c3fd26859
Binary files /dev/null and b/cg-use-everywhere/docs/connection-ui.png differ
diff --git a/cg-use-everywhere/docs/deprecated.md b/cg-use-everywhere/docs/deprecated.md
new file mode 100644
index 0000000000000000000000000000000000000000..92a06462be4ec02101704388cd40f859106180ed
--- /dev/null
+++ b/cg-use-everywhere/docs/deprecated.md
@@ -0,0 +1,33 @@
+
+# Deprecated Nodes
+
+This is the old documentation, in case you have a workflow still using the deprecated nodes.
+
+
+UE nodes are "Use Everywhere". Put a UE node into your workflow, connect its input, and every node with an unconnected input of the same type will act as if connected to it.
+
+CLIP, IMAGE, MODEL, VAE, CONDITIONING, or LATENT (want something else? Edit `__init__.py` line 3.)
+
+Update: added INT, MASK, and CHECKPOINT (which combines MODEL, CLIP, and VAE), and a special node for SEEDs.
+
+| Model, clip, vae, latent and image are all being automagically connected. | Drop this image into ComfyUI to get a working workflow. |
+|-|-|
+|![workflow](./workflow.png)|![portrait](./portrait.png)|
+
+## UE? Nodes
+
+UE? nodes are like UE Nodes, but add two widgets, 'title' and 'input'. These are Regular Expressions, and the node will only send to nodes where the node title or the unconnected input name matches.
+
+It doesn't need to be a complete match - the logic is `regex.match(name) || regex.match(title)`, so if you want to match the exact name `seed`, you'll need something like `^seed$` as your regex.
+
+Regex 101 - ^ means 'the start', $ means 'the end', '.' matches anything, '.*' matches any number of anything. For more than that, visit [regex101](https://regex101.com/) (the flavour you want is ECMAScript, though that probably won't matter).
+
+| So you can do things like: | Drop this image into ComfyUI to get a working workflow. |
+|-|-|
+|![this](./UEQ.png)|![drop](./UEQportrait.png)|
+
+## Widget?
+
+A UE or UE? node with just one input can have that input converted to a widget. But the combination ones can't. Also note that if you convert it to a widget, you can't then change the title.
+
+Why not? Because the code gets the data type from the input (weirdly, the prompt doesn't contain the data type on outputs), and it's not available if it's a widget, so the hack is to get the data type from what comes after `UE ` in the title...
diff --git a/cg-use-everywhere/docs/group.png b/cg-use-everywhere/docs/group.png
new file mode 100644
index 0000000000000000000000000000000000000000..268a23e4b672fea148fc9346325fa20016eb7bd9
Binary files /dev/null and b/cg-use-everywhere/docs/group.png differ
diff --git a/cg-use-everywhere/docs/highway.png b/cg-use-everywhere/docs/highway.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba907c5bf311c997b413699423955cf42b6ed3f1
Binary files /dev/null and b/cg-use-everywhere/docs/highway.png differ
diff --git a/cg-use-everywhere/docs/image.png b/cg-use-everywhere/docs/image.png
new file mode 100644
index 0000000000000000000000000000000000000000..c517527837cf95f5ecca76a587e8e4719b5417be
Binary files /dev/null and b/cg-use-everywhere/docs/image.png differ
diff --git a/cg-use-everywhere/docs/imagex.png b/cg-use-everywhere/docs/imagex.png
new file mode 100644
index 0000000000000000000000000000000000000000..5e6d230d5a4b1a18ca4e395f0f3f45f6953258c2
Binary files /dev/null and b/cg-use-everywhere/docs/imagex.png differ
diff --git a/cg-use-everywhere/docs/mouseOver.gif b/cg-use-everywhere/docs/mouseOver.gif
new file mode 100644
index 0000000000000000000000000000000000000000..e722be47fe72be1af321614987f573caf6c28127
Binary files /dev/null and b/cg-use-everywhere/docs/mouseOver.gif differ
diff --git a/cg-use-everywhere/docs/off.png b/cg-use-everywhere/docs/off.png
new file mode 100644
index 0000000000000000000000000000000000000000..aec0f52155972341fe1f3f6bfadc101a9eebd872
Binary files /dev/null and b/cg-use-everywhere/docs/off.png differ
diff --git a/cg-use-everywhere/docs/on.png b/cg-use-everywhere/docs/on.png
new file mode 100644
index 0000000000000000000000000000000000000000..40eecc25ceb41110f1be8ed76f31b81c039d89fa
Binary files /dev/null and b/cg-use-everywhere/docs/on.png differ
diff --git a/cg-use-everywhere/docs/portrait.png b/cg-use-everywhere/docs/portrait.png
new file mode 100644
index 0000000000000000000000000000000000000000..74ab43c92402bbdee01a9600a36f7d878ef63d4e
Binary files /dev/null and b/cg-use-everywhere/docs/portrait.png differ
diff --git a/cg-use-everywhere/docs/priority.gif b/cg-use-everywhere/docs/priority.gif
new file mode 100644
index 0000000000000000000000000000000000000000..8ba2e192fab9df475beeb971d38ead37da5aa88f
Binary files /dev/null and b/cg-use-everywhere/docs/priority.gif differ
diff --git a/cg-use-everywhere/docs/regex.png b/cg-use-everywhere/docs/regex.png
new file mode 100644
index 0000000000000000000000000000000000000000..2001d10b06043709a3baea0d70f9f8598a2d9c60
Binary files /dev/null and b/cg-use-everywhere/docs/regex.png differ
diff --git a/cg-use-everywhere/docs/run.png b/cg-use-everywhere/docs/run.png
new file mode 100644
index 0000000000000000000000000000000000000000..c741d44b8fa774063779e92e50de100a319b61ff
Binary files /dev/null and b/cg-use-everywhere/docs/run.png differ
diff --git a/cg-use-everywhere/docs/sampler and sigma.png b/cg-use-everywhere/docs/sampler and sigma.png
new file mode 100644
index 0000000000000000000000000000000000000000..a7a8e4c5a80dbe0da3f34c446f8ff1d04bd3dc43
Binary files /dev/null and b/cg-use-everywhere/docs/sampler and sigma.png differ
diff --git a/cg-use-everywhere/docs/separate.png b/cg-use-everywhere/docs/separate.png
new file mode 100644
index 0000000000000000000000000000000000000000..717062f66fde9e5bef8b84302e2761d3637b0074
Binary files /dev/null and b/cg-use-everywhere/docs/separate.png differ
diff --git a/cg-use-everywhere/docs/test-workflow-screenshot.png b/cg-use-everywhere/docs/test-workflow-screenshot.png
new file mode 100644
index 0000000000000000000000000000000000000000..f0d4547c0699e69035e5dab1fe2d2b31410bb1aa
Binary files /dev/null and b/cg-use-everywhere/docs/test-workflow-screenshot.png differ
diff --git a/cg-use-everywhere/docs/test-workflow.json b/cg-use-everywhere/docs/test-workflow.json
new file mode 100644
index 0000000000000000000000000000000000000000..eac87d7c2f27f1d49afe05ae588e02176840d497
--- /dev/null
+++ b/cg-use-everywhere/docs/test-workflow.json
@@ -0,0 +1,775 @@
+{
+ "last_node_id": 185,
+ "last_link_id": 555,
+ "nodes": [
+ {
+ "id": 144,
+ "type": "PreviewImage",
+ "pos": [
+ 928,
+ -39
+ ],
+ "size": {
+ "0": 430.8935546875,
+ "1": 533.0433349609375
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 180,
+ "type": "Prompts Everywhere",
+ "pos": [
+ 1189,
+ -256
+ ],
+ "size": {
+ "0": 177.46200561523438,
+ "1": 46
+ },
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "*",
+ "link": 535,
+ "color_on": "#FFA931"
+ },
+ {
+ "name": "CONDITIONING",
+ "type": "*",
+ "link": 536,
+ "color_on": "#FFA931"
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Prompts Everywhere",
+ "group_restricted": false
+ }
+ },
+ {
+ "id": 148,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -356,
+ -204
+ ],
+ "size": {
+ "0": 308.89697265625,
+ "1": 98
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 540
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 542
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 539
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "copaxVividXL_v2.safetensors"
+ ]
+ },
+ {
+ "id": 181,
+ "type": "Anything Everywhere3",
+ "pos": [
+ 332,
+ -204
+ ],
+ "size": {
+ "0": 210,
+ "1": 66
+ },
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "MODEL",
+ "type": "*",
+ "link": 541,
+ "color_on": "#B39DDB"
+ },
+ {
+ "name": "CLIP",
+ "type": "*",
+ "link": 543,
+ "color_on": "#FFD500"
+ },
+ {
+ "name": "VAE",
+ "type": "*",
+ "link": 539,
+ "color_on": "#FF6E6E"
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Anything Everywhere3",
+ "group_restricted": false
+ }
+ },
+ {
+ "id": 178,
+ "type": "EmptyLatentImage",
+ "pos": [
+ -350,
+ 1
+ ],
+ "size": {
+ "0": 269.2752990722656,
+ "1": 106
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 544
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 768,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 183,
+ "type": "Anything Everywhere",
+ "pos": [
+ -316,
+ 179
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "LATENT",
+ "type": "*",
+ "link": 544,
+ "color_on": "#FF9CF9"
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Anything Everywhere",
+ "group_restricted": false
+ }
+ },
+ {
+ "id": 5,
+ "type": "KSampler",
+ "pos": [
+ 51,
+ -1
+ ],
+ "size": {
+ "0": 260.72747802734375,
+ "1": 249.28138732910156
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": null
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": null
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": null
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "link": null,
+ "widget": {
+ "name": "seed"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 545
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1125899906842624,
+ "increment",
+ 35,
+ 8,
+ "dpmpp_3m_sde",
+ "karras",
+ 1
+ ],
+ "color": "#57571a",
+ "bgcolor": "#6b6b2e"
+ },
+ {
+ "id": 184,
+ "type": "Anything Everywhere?",
+ "pos": [
+ 339,
+ 0
+ ],
+ "size": {
+ "0": 210,
+ "1": 82
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "LATENT",
+ "type": "*",
+ "link": 545,
+ "color_on": "#FF9CF9"
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Anything Everywhere?",
+ "group_restricted": false
+ },
+ "widgets_values": [
+ ".*",
+ "samples"
+ ]
+ },
+ {
+ "id": 179,
+ "type": "Anything Everywhere",
+ "pos": [
+ 624,
+ 185
+ ],
+ "size": {
+ "0": 181.96005249023438,
+ "1": 26
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "IMAGE",
+ "type": "*",
+ "link": 534,
+ "color_on": "#64B5F6"
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Anything Everywhere",
+ "group_restricted": false
+ }
+ },
+ {
+ "id": 7,
+ "type": "VAEDecode",
+ "pos": [
+ 637,
+ 74
+ ],
+ "size": {
+ "0": 140,
+ "1": 46
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": null
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 534
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ },
+ "color": "#2e571a",
+ "bgcolor": "#426b2e"
+ },
+ {
+ "id": 182,
+ "type": "LoraLoader",
+ "pos": [
+ 15,
+ -290
+ ],
+ "size": {
+ "0": 273.7867126464844,
+ "1": 126
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 540
+ },
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 542
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 541
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 543
+ ],
+ "shape": 3,
+ "slot_index": 1
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoraLoader"
+ },
+ "widgets_values": [
+ "sd_xl_offset_example-lora_1.0.safetensors",
+ 1,
+ 1
+ ]
+ },
+ {
+ "id": 185,
+ "type": "Note",
+ "pos": [
+ 396,
+ 326
+ ],
+ "size": {
+ "0": 437.6109619140625,
+ "1": 131.43035888671875
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "properties": {
+ "text": ""
+ },
+ "widgets_values": [
+ "This workflow uses all the UE nodes, and can also test bypass (load LoRA)"
+ ],
+ "color": "#432",
+ "bgcolor": "#653"
+ },
+ {
+ "id": 169,
+ "type": "Seed Everywhere",
+ "pos": [
+ 81,
+ 345
+ ],
+ "size": {
+ "0": 210,
+ "1": 82
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": null,
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Seed Everywhere",
+ "group_restricted": false
+ },
+ "widgets_values": [
+ 356735678581,
+ "fixed"
+ ]
+ },
+ {
+ "id": 162,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 599,
+ -303
+ ],
+ "size": {
+ "0": 247.4329071044922,
+ "1": 96
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 535
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "atmospheric photo of woman at night"
+ ]
+ },
+ {
+ "id": 163,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 873,
+ -210
+ ],
+ "size": {
+ "0": 247.4329071044922,
+ "1": 96
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 536
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "blue"
+ ]
+ }
+ ],
+ "links": [
+ [
+ 534,
+ 7,
+ 0,
+ 179,
+ 0,
+ "*"
+ ],
+ [
+ 535,
+ 162,
+ 0,
+ 180,
+ 0,
+ "*"
+ ],
+ [
+ 536,
+ 163,
+ 0,
+ 180,
+ 1,
+ "*"
+ ],
+ [
+ 539,
+ 148,
+ 2,
+ 181,
+ 2,
+ "*"
+ ],
+ [
+ 540,
+ 148,
+ 0,
+ 182,
+ 0,
+ "MODEL"
+ ],
+ [
+ 541,
+ 182,
+ 0,
+ 181,
+ 0,
+ "*"
+ ],
+ [
+ 542,
+ 148,
+ 1,
+ 182,
+ 1,
+ "CLIP"
+ ],
+ [
+ 543,
+ 182,
+ 1,
+ 181,
+ 1,
+ "*"
+ ],
+ [
+ 544,
+ 178,
+ 0,
+ 183,
+ 0,
+ "*"
+ ],
+ [
+ 545,
+ 5,
+ 0,
+ 184,
+ 0,
+ "*"
+ ],
+ [
+ 546,
+ 7,
+ 0,
+ 144,
+ 0,
+ "IMAGE"
+ ],
+ [
+ 547,
+ 182,
+ 0,
+ 5,
+ 0,
+ "MODEL"
+ ],
+ [
+ 548,
+ 162,
+ 0,
+ 5,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 549,
+ 163,
+ 0,
+ 5,
+ 2,
+ "CONDITIONING"
+ ],
+ [
+ 550,
+ 178,
+ 0,
+ 5,
+ 3,
+ "LATENT"
+ ],
+ [
+ 551,
+ 169,
+ 0,
+ 5,
+ 4,
+ "INT"
+ ],
+ [
+ 552,
+ 5,
+ 0,
+ 7,
+ 0,
+ "LATENT"
+ ],
+ [
+ 553,
+ 148,
+ 2,
+ 7,
+ 1,
+ "VAE"
+ ],
+ [
+ 554,
+ 182,
+ 1,
+ 162,
+ 0,
+ "CLIP"
+ ],
+ [
+ 555,
+ 182,
+ 1,
+ 163,
+ 0,
+ "CLIP"
+ ]
+ ],
+ "groups": [
+ {
+ "title": "Model",
+ "bounding": [
+ -371,
+ -387,
+ 926,
+ 294
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Conditioning",
+ "bounding": [
+ 571,
+ -391,
+ 836,
+ 294
+ ],
+ "color": "#a1309b",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Sampling",
+ "bounding": [
+ -372,
+ -74,
+ 1198,
+ 343
+ ],
+ "color": "#b06634",
+ "font_size": 24,
+ "locked": false
+ }
+ ],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/cg-use-everywhere/docs/test-workflow.png b/cg-use-everywhere/docs/test-workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..81f067dc5d58d1118806b48dd7f8ef7bd3872cba
Binary files /dev/null and b/cg-use-everywhere/docs/test-workflow.png differ
diff --git a/cg-use-everywhere/docs/unconnected.png b/cg-use-everywhere/docs/unconnected.png
new file mode 100644
index 0000000000000000000000000000000000000000..525ef8d4b8f050a31134ced4f73b407f56295e4d
Binary files /dev/null and b/cg-use-everywhere/docs/unconnected.png differ
diff --git a/cg-use-everywhere/docs/workflow.png b/cg-use-everywhere/docs/workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..e7005fd88cf38c466a9af327df8dd0db53204c1d
Binary files /dev/null and b/cg-use-everywhere/docs/workflow.png differ
diff --git a/cg-use-everywhere/js/ue.css b/cg-use-everywhere/js/ue.css
new file mode 100644
index 0000000000000000000000000000000000000000..c69c9548769f1d87be28cb27b8a6031cdb69edcf
--- /dev/null
+++ b/cg-use-everywhere/js/ue.css
@@ -0,0 +1,43 @@
+
+.litegraph label.ueprompt {
+ padding:0px;
+ font-size: 12px;
+ border-radius: 0;
+ border: 0;
+ box-shadow: none !important;
+ margin:0px;
+ height:30px;
+ background-color: rgba(0,0,0,0);
+}
+
+.litegraph span.ueprompttext {
+ margin: 0px;
+ min-width: 30px;
+ font-family: Arial, sans-serif;
+ color: var(--descrip-text);
+ text-align: right;
+ padding: 2px 2px 4px 0px;
+ background-color: inherit;
+}
+
+.litegraph span.uepromptspan {
+ margin: 0px;
+ width: 100%;
+ padding-left:12px;
+ background-color: inherit;
+}
+
+.litegraph input.uepromptinput {
+ padding: 0 0 0 6px;
+ font-size: 12px;
+ border-radius: 12px;
+ border: 2px solid var(--border-color);
+ color: var(--input-text);
+ margin: 0px;
+ width: 100%;
+ float: right;
+}
+
+.ue_hide {
+ display: none;
+}
\ No newline at end of file
diff --git a/cg-use-everywhere/js/ue_debug.js b/cg-use-everywhere/js/ue_debug.js
new file mode 100644
index 0000000000000000000000000000000000000000..9be2e08117603b32c6dce35a1fd22940a7ed062e
--- /dev/null
+++ b/cg-use-everywhere/js/ue_debug.js
@@ -0,0 +1,21 @@
+import { app } from "../../scripts/app.js";
+import { defineProperty } from "./use_everywhere_utilities.js";
+
+/*
+Things that can be useful (generally as breakpoints) when debugging
+*/
+export function add_debug() {
+ var dirty_canvas = true;
+ defineProperty(app.canvas, 'dirty_canvas', {
+ get : () => { return dirty_canvas },
+ set : (v) => { dirty_canvas = v;} // a breakpoint here catches the calls that mark the canvas as dirty
+ })
+
+ var dirty_bg_canvas = true;
+ defineProperty(app.canvas, 'dirty_bg_canvas', {
+ get : () => { return dirty_bg_canvas },
+ set : (v) => { dirty_bg_canvas = v;} // a breakpoint here catches the calls that mark the background canvas as dirty
+ })
+}
+
+export const version = 500006
\ No newline at end of file
diff --git a/cg-use-everywhere/js/use_everywhere.js b/cg-use-everywhere/js/use_everywhere.js
new file mode 100644
index 0000000000000000000000000000000000000000..011f2cdf0d73ba36f42b9e6dac88916a41d486a2
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere.js
@@ -0,0 +1,291 @@
+import { app } from "../../scripts/app.js";
+import { api } from "../../scripts/api.js";
+
+import { is_UEnode, is_helper, inject, Logger, get_real_node, defineProperty } from "./use_everywhere_utilities.js";
+import { displayMessage, update_input_label, indicate_restriction, UpdateBlocker } from "./use_everywhere_ui.js";
+import { LinkRenderController } from "./use_everywhere_ui.js";
+import { autoCreateMenu } from "./use_everywhere_autocreate.js";
+import { add_autoprompts } from "./use_everywhere_autoprompt.js";
+import { GraphAnalyser } from "./use_everywhere_graph_analysis.js";
+import { main_menu_settings, node_menu_settings, canvas_menu_settings, non_ue_menu_settings } from "./use_everywhere_settings.js";
+import { add_debug } from "./ue_debug.js";
+
+/*
+The ui component that looks after the link rendering
+*/
+var linkRenderController;
+var graphAnalyser;
+
+/*
+Inject a call to linkRenderController.mark_link_list_outdated into a method named methodname on all objects in the array.
+If object is undefined, do nothing.
+The injection is added at the end of the existing method (if the method didn't exist, it is created).
+A Logger.trace call is added at the start with 'tracetext'
+*/
+function inject_outdating_into_objects(array, methodname, tracetext) {
+ if (array) {
+ array.forEach((object) => { inject_outdating_into_object_method(object, methodname, tracetext); })
+ }
+}
+function inject_outdating_into_object_method(object, methodname, tracetext) {
+ if (object) inject(object, methodname, tracetext, linkRenderController.mark_link_list_outdated, linkRenderController);
+}
+
+app.registerExtension({
+ name: "cg.customnodes.use_everywhere",
+
+ async beforeRegisterNodeDef(nodeType, nodeData, app) {
+ /*
+        When a node is connected or disconnected, the link list becomes dirty.
+ If it is a UE node, we need to update it as well
+ */
+ const onConnectionsChange = nodeType.prototype.onConnectionsChange;
+ nodeType.prototype.onConnectionsChange = function (side,slot,connect,link_info,output) {
+ Logger.trace("onConnectionsChange", arguments, this);
+ if (this.IS_UE && side==1) { // side 1 is input
+ if (this.type=="Anything Everywhere?" && slot!=0) {
+ // don't do anything for the regexs
+ } else {
+ const type = (connect && link_info) ? get_real_node(link_info?.origin_id)?.outputs[link_info?.origin_slot]?.type : undefined;
+ this.input_type[slot] = type;
+ if (link_info) link_info.type = type ? type : "*";
+ update_input_label(this, slot, app);
+ }
+ }
+ linkRenderController.mark_link_list_outdated();
+ onConnectionsChange?.apply(this, arguments);
+ };
+
+ /*
+        Extra menu options populate the node's right-click menu.
+        We add our entries to the list, and also inject a link-list-outdated call into every option.
+ */
+ const getExtraMenuOptions = nodeType.prototype.getExtraMenuOptions;
+ nodeType.prototype.getExtraMenuOptions = function(_, options) {
+ Logger.trace("getExtraMenuOptions", arguments, this);
+ getExtraMenuOptions?.apply(this, arguments);
+ if (is_UEnode(this)) {
+ node_menu_settings(options, this);
+ } else {
+ non_ue_menu_settings(options, this);
+ }
+ inject_outdating_into_objects(options,'callback',`menu option on ${this.id}`);
+ }
+
+ /*
+ When a UE node is created, we set the group and color restriction properties.
+ We also create pseudo-widgets for all the inputs so that they can be searched
+ and to avoid other code throwing errors.
+ */
+ if (is_UEnode(nodeType)) {
+ const onNodeCreated = nodeType.prototype.onNodeCreated;
+ nodeType.prototype.onNodeCreated = function () {
+ const r = onNodeCreated ? onNodeCreated.apply(this, arguments) : undefined;
+ if (!this.properties) this.properties = {}
+ this.properties.group_restricted = 0;
+ this.properties.color_restricted = 0;
+ if (this.inputs) {
+ if (!this.widgets) this.widgets = [];
+ for (const input of this.inputs) {
+ if (input.widget && !this.widgets.find((w) => w.name === input.widget.name)) this.widgets.push(input.widget)
+ }
+ }
+ return r;
+ }
+ }
+ },
+
+ async nodeCreated(node) {
+ if (!node.__mode) {
+ node.__mode = node.mode
+ defineProperty(node, "mode", {
+ get: ( )=>{return node.__mode},
+ set: (v)=>{node.__mode = v; node.afterChangeMade?.('mode', v);}
+ })
+ }
+ if (!node.__bgcolor) {
+ node.__bgcolor = node.bgcolor
+ defineProperty(node,"bgcolor", {
+ get: ( )=>{return node.__bgcolor},
+ set: (v)=>{node.__bgcolor = v; node.afterChangeMade?.('bgcolor', v);}
+ })
+ }
+ const acm = node.afterChangeMade
+ node.afterChangeMade = (p, v) => {
+ acm?.(p,v)
+ if (p==='bgcolor') {
+ if (node.mode!=4) linkRenderController.mark_link_list_outdated();
+ }
+ if (p==='mode') {
+ linkRenderController.mark_link_list_outdated();
+ node.widgets?.forEach((widget) => {widget.onModeChange?.(v)});
+ }
+ }
+
+ node.IS_UE = is_UEnode(node);
+ if (node.IS_UE) {
+ node.input_type = [undefined, undefined, undefined]; // for dynamic input types
+ node.displayMessage = displayMessage; // receive messages from the python code
+
+ // If a widget on a UE node is edited, link list is dirty
+ inject_outdating_into_objects(node.widgets,'callback',`widget callback on ${node.id}`);
+
+ // draw the indication of group restrictions
+ const original_onDrawTitleBar = node.onDrawTitleBar;
+ node.onDrawTitleBar = function(ctx, title_height) {
+ original_onDrawTitleBar?.apply(this, arguments);
+ if (node.properties.group_restricted || node.properties.color_restricted) indicate_restriction(ctx, title_height);
+ }
+ }
+
+ if (is_helper(node)) { // editing a helper node makes the list dirty
+            inject_outdating_into_objects(node.widgets,'callback',`widget callback on ${node.id}`);
+ }
+
+ // removing a node makes the list dirty
+ inject_outdating_into_object_method(node, 'onRemoved', `node ${node.id} removed`)
+
+ // creating a node makes the link list dirty - but give the system a moment to finish
+ setTimeout( ()=>{linkRenderController.mark_link_list_outdated()}, 100 );
+ },
+
+    // When a graph node is loaded collapsed, the UI needs to know.
+    // Probably not needed now that autocomplete is gone?
+ loadedGraphNode(node) { if (node.flags.collapsed && node.loaded_when_collapsed) node.loaded_when_collapsed(); },
+
+ async setup() {
+ /*
+ Add css for the autocomplete. Probably not needed now
+ */
+ const head = document.getElementsByTagName('HEAD')[0];
+ const link = document.createElement('link');
+ link.rel = 'stylesheet';
+ link.type = 'text/css';
+ link.href = 'extensions/cg-use-everywhere/ue.css';
+ head.appendChild(link);
+
+ /*
+ Listen for message-handler event from python code
+ */
+ function messageHandler(event) {
+ const id = event.detail.id;
+ const message = event.detail.message;
+ const node = get_real_node(id);
+ if (node && node.displayMessage) node.displayMessage(id, message);
+ else (console.log(`node ${id} couldn't handle a message`));
+ }
+ api.addEventListener("ue-message-handler", messageHandler);
+
+ api.addEventListener("status", ({detail}) => {
+ if (linkRenderController) linkRenderController.note_queue_size(detail ? detail.exec_info.queue_remaining : 0)
+ });
+
+ /*
+ Don't modify the graph when saving the workflow or api
+ */
+ const _original_save_onclick = document.getElementById('comfy-save-button').onclick;
+ document.getElementById('comfy-save-button').onclick = function() {
+ graphAnalyser.pause();
+ _original_save_onclick();
+ graphAnalyser.unpause()
+ }
+ const _original_save_api_onclick = document.getElementById('comfy-dev-save-api-button').onclick;
+ document.getElementById('comfy-dev-save-api-button').onclick = function() {
+ graphAnalyser.pause();
+ // should check for UE links here and give a warning: #217
+ _original_save_api_onclick();
+ graphAnalyser.unpause();
+ }
+
+ /*
+ When we draw a node, render the virtual connection points
+ */
+ const original_drawNode = LGraphCanvas.prototype.drawNode;
+ LGraphCanvas.prototype.drawNode = function(node, ctx) {
+ UpdateBlocker.push()
+ try {
+ const v = original_drawNode.apply(this, arguments);
+ linkRenderController.highlight_ue_connections(node, ctx);
+ return v
+ } finally { UpdateBlocker.pop() }
+ }
+
+ /*
+ When we draw connections, do the ue ones as well (logic for on/off is in lrc)
+ */
+ const drawConnections = LGraphCanvas.prototype.drawConnections;
+ LGraphCanvas.prototype.drawConnections = function(ctx) {
+ drawConnections?.apply(this, arguments);
+ linkRenderController.render_all_ue_links(ctx);
+ }
+
+ /*
+ Add to the main settings
+ */
+ main_menu_settings();
+
+ /*
+        Canvas menu is the right-click menu on the backdrop.
+        We need to add our options, and hijack the others to mark the link list dirty.
+ */
+ const original_getCanvasMenuOptions = LGraphCanvas.prototype.getCanvasMenuOptions;
+ LGraphCanvas.prototype.getCanvasMenuOptions = function () {
+ // Add our items to the canvas menu
+ const options = original_getCanvasMenuOptions.apply(this, arguments);
+ canvas_menu_settings(options);
+
+ // every menu item makes our list dirty
+ inject_outdating_into_objects(options,'callback',`menu option on canvas`);
+
+ return options;
+ }
+
+ /*
+        When you drag from a node, showConnectionMenu is called. If the shift key is pressed, call ours instead.
+ Broken #219
+ */
+ const showSearchBox = LGraphCanvas.prototype.showSearchBox;
+ LGraphCanvas.prototype.showSearchBox = function (optPass) {
+ if (optPass.shiftKey) {
+ autoCreateMenu.apply(this, arguments);
+ } else {
+ this.use_original_menu = true;
+ showSearchBox.apply(this, arguments);
+ this.use_original_menu = false;
+ }
+ }
+
+ /*
+ To allow us to use the shift drag above, we need to intercept 'allow_searchbox' sometimes
+ (because searchbox is the default behaviour when shift dragging)
+ Broken #219
+ */
+ var original_allow_searchbox = app.canvas.allow_searchbox;
+ defineProperty(app.canvas, 'allow_searchbox', {
+ get : function() {
+ if (this.use_original_menu) { return original_allow_searchbox; }
+ if(app.ui.settings.getSettingValue('AE.replacesearch', true) && this.connecting_output) {
+ return false;
+ } else { return original_allow_searchbox; }
+ },
+ set : function(v) { original_allow_searchbox = v; }
+ });
+
+
+ },
+
+ init() {
+ graphAnalyser = GraphAnalyser.instance();
+ app.graphToPrompt = async function () {
+ return graphAnalyser.analyse_graph(true, true, false);
+ }
+
+ linkRenderController = LinkRenderController.instance(graphAnalyser);
+
+ add_autoprompts();
+
+ if (false) add_debug();
+
+ }
+
+});
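The file above leans on an `inject` helper from use_everywhere_utilities.js (not shown in this diff) to append the outdating call to arbitrary methods. A hedged sketch of that wrapping pattern, following only what the comment near the top of the file describes (trace at the start, original body, callback appended at the end) - the real helper may differ:

```javascript
// Sketch inferred from the inject(object, methodname, tracetext, callback, context)
// call sites above; not the actual utility implementation.
function inject_sketch(object, methodname, tracetext, callback, context) {
    const original = object[methodname];
    object[methodname] = function () {
        console.debug(tracetext);                       // stand-in for Logger.trace
        const result = original?.apply(this, arguments); // run the original method, if any
        callback.apply(context, arguments);              // e.g. mark_link_list_outdated
        return result;
    };
}
```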
diff --git a/cg-use-everywhere/js/use_everywhere_apply.js b/cg-use-everywhere/js/use_everywhere_apply.js
new file mode 100644
index 0000000000000000000000000000000000000000..1199167220a8694a85756d342389e97a9c205600
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_apply.js
@@ -0,0 +1,31 @@
+import { app } from "../../scripts/app.js";
+import { is_UEnode, get_real_node } from "./use_everywhere_utilities.js";
+
+
+function _convert_to_links(ue) {
+ const output_node_id = ue.output[0];
+ const output_index = ue.output[1];
+ const output_node = get_real_node(output_node_id);
+ ue.sending_to.forEach((st) => {
+ const input_node_id = st.node.id;
+ const input_node = get_real_node(input_node_id);
+ const input_index = st.input_index;
+ output_node.connect(output_index, input_node, input_index);
+ });
+}
+
+function convert_to_links(ues, control_node_id) {
+ ues.ues.forEach((ue)=> {
+ if (control_node_id==-1 || ue.controller.id == control_node_id) _convert_to_links(ue);
+ });
+}
+
+function remove_all_ues() {
+ var match = app.graph._nodes.find((node)=>is_UEnode(node));
+ while (match) {
+ app.graph.remove(match);
+ match = app.graph._nodes.find((node)=>is_UEnode(node));
+ }
+}
+
+export {convert_to_links, remove_all_ues}
\ No newline at end of file
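For context, these two helpers are driven from the canvas menu in use_everywhere_settings.js further down in this diff; the flow there is roughly the following (shown here only to make the convert/remove pairing explicit):

```javascript
// Usage shape mirroring the "Convert all UEs to real links" handler in
// use_everywhere_settings.js below.
import { app } from "../../scripts/app.js";
import { GraphAnalyser } from "./use_everywhere_graph_analysis.js";
import { convert_to_links, remove_all_ues } from "./use_everywhere_apply.js";

const ues = await GraphAnalyser.instance().analyse_graph(); // build the broadcast list
convert_to_links(ues, -1);   // -1 means "every controller node", per convert_to_links above
remove_all_ues();            // then delete the UE nodes themselves
app.graph.change();
```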
diff --git a/cg-use-everywhere/js/use_everywhere_autocreate.js b/cg-use-everywhere/js/use_everywhere_autocreate.js
new file mode 100644
index 0000000000000000000000000000000000000000..e885f0e9b3ad5e13595735932f4e4423c81038f3
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_autocreate.js
@@ -0,0 +1,46 @@
+import { app } from "../../scripts/app.js";
+
+function autoCreateMenu(opts) {
+ //opts.e.stopPropagation();
+ var options = ["Search",];
+ var search_opts;
+ if (opts.nodeFrom && opts.slotFrom) {
+ options.push(null);
+ options.push("Anything Everywhere");
+ options.push("Anything Everywhere?");
+ if (opts.nodeFrom?.outputs?.length==3 &&
+ opts.nodeFrom.outputs[0].name=='MODEL' &&
+ opts.nodeFrom.outputs[1].name=='CLIP' &&
+ opts.nodeFrom.outputs[2].name=='VAE') options.push("Anything Everywhere3");
+ search_opts = {node_from: opts.nodeFrom, slot_from: opts.slotFrom, type_filter_in: opts.slotFrom.type};
+ } else {
+        search_opts = {node_to: opts.nodeTo, slot_from: opts.slotTo, type_filter_out: opts.slotTo.type};
+ }
+
+ var menu = new LiteGraph.ContextMenu(options, {
+ event: opts.e,
+ title: "UE Node",
+ callback: inner_clicked
+ });
+
+ const p = [ opts.e.canvasX, opts.e.canvasY ];
+
+ function inner_clicked(v,options,e) {
+ if (!v) return;
+ if (v=="Search") {
+ app.canvas.showSearchBox(opts.e,search_opts);
+ return;
+ }
+ var newNode = LiteGraph.createNode(v);
+ app.graph.add(newNode);
+ newNode.pos = p;
+ if (v=="Anything Everywhere3") {
+ for (var i=0; i<3; i++) {opts.nodeFrom.connect( i, newNode, i );}
+ } else {
+ opts.nodeFrom.connect( opts.nodeFrom.findOutputSlot(opts.slotFrom.name), newNode, 0 );
+ }
+ app.graph.change();
+ }
+}
+
+export {autoCreateMenu}
\ No newline at end of file
diff --git a/cg-use-everywhere/js/use_everywhere_autoprompt.js b/cg-use-everywhere/js/use_everywhere_autoprompt.js
new file mode 100644
index 0000000000000000000000000000000000000000..55a312ffc4d75dfd53695cca68c5b048413bc5d5
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_autoprompt.js
@@ -0,0 +1,183 @@
+import { is_UEnode } from "./use_everywhere_utilities.js";
+import { ComfyWidgets} from "../../scripts/widgets.js";
+import { app } from "../../scripts/app.js";
+import { LinkRenderController } from "./use_everywhere_ui.js";
+
+function update_picklist(node, inputname) {
+ const d = document.getElementById("uedynamiclist");
+ while (d.firstChild) { d.removeChild(d.lastChild); };
+ let options = [];
+ if (inputname=="title_regex") { options = LinkRenderController.instance().ue_list?.all_nodes_with_unmatched_input(node.input_type[0]); }
+ else if (inputname=="input_regex") { options = LinkRenderController.instance().ue_list?.all_unmatched_input_names(node.input_type[0]); }
+ else if (inputname=="group_regex") { options = LinkRenderController.instance().ue_list?.all_group_names(node.input_type[0]); }
+ options.forEach((option) => {
+ const theOption = document.createElement("option");
+ theOption.setAttribute("value", option);
+ d.appendChild(theOption)
+ })
+}
+
+function intersect(a, b) {
+ const x = Math.max(a.x, b.x);
+ const num1 = Math.min(a.x + a.width, b.x + b.width);
+ const y = Math.max(a.y, b.y);
+ const num2 = Math.min(a.y + a.height, b.y + b.height);
+ if (num1 >= x && num2 >= y) return [x, y, num1 - x, num2 - y];
+ else return null;
+}
+
+function union(a,b) {
+ if (!b) return a;
+ if (!a) return b;
+ const x = Math.min(a.x, b.x);
+ const y = Math.min(a.y,b.y);
+ const width = Math.max(a.x+a.width, b.x+b.width) - x;
+    const height = Math.max(a.y+a.height, b.y+b.height) - y;
+ return { x:x, y:y, width:width, height:height };
+}
+
+function getClipPath(node, element) {
+ const scale = app.canvas.ds.scale;
+ const widgetRect = element.getBoundingClientRect();
+ var onTopOfMe = false;
+ var clip = null;
+ app.graph._nodes.forEach((other_node) => {
+ if (other_node.id == node.id) {
+ onTopOfMe = true;
+ }
+ else if (onTopOfMe) {
+ const MARGIN = other_node.is_selected ? 7 : 2;
+ const bounding = other_node.getBounding();
+ const intersection = intersect(
+ { x: widgetRect.x / scale, y: widgetRect.y / scale, width: widgetRect.width / scale, height: widgetRect.height / scale },
+ {
+ x: other_node.pos[0] + app.canvas.ds.offset[0] - MARGIN,
+ y: other_node.pos[1] + app.canvas.ds.offset[1] - LiteGraph.NODE_TITLE_HEIGHT - MARGIN,
+ width: bounding[2] + MARGIN + MARGIN,
+ height: bounding[3] + MARGIN + MARGIN,
+ }
+ );
+ if (intersection) {
+ clip = union(clip, {
+ x : intersection[0] - widgetRect.x / scale,
+ y : intersection[1] - widgetRect.y / scale,
+ width : intersection[2],
+ height : intersection[3]
+ })
+ //const newpath = `0% 0%, 0% 100%, ${clipX} 100%, ${clipX} ${clipY}, calc(${clipX} + ${clipWidth}) ${clipY}, calc(${clipX} + ${clipWidth}) calc(${clipY} + ${clipHeight}), ${clipX} calc(${clipY} + ${clipHeight}), ${clipX} 100%, 100% 100%, 100% 0%`;
+ //path = path != '' ? `${path}, ${newpath}` : newpath;
+ }
+ }
+ })
+ const path = clip ? `polygon(0% 0%, 0% 100%, ${clip.x}px 100%, ${clip.x}px ${clip.y}px, ${clip.x + clip.width}px ${clip.y}px, ${clip.x + clip.width}px ${clip.y + clip.height}px, ${clip.x}px ${clip.y + clip.height}px, ${clip.x}px 100%, 100% 100%, 100% 0%)` : '';
+ return path;
+}
+
+function active_text_widget(node, inputname) {
+ const label = document.createElement("label");
+ label.className = "graphdialog ueprompt";
+
+
+ const label_text = document.createElement("span");
+ label_text.innerText = `${inputname.substring(0,5)} `;
+ label_text.className = "ueprompttext";
+ label.appendChild(label_text);
+
+ const span = document.createElement("span");
+ span.className = "uepromptspan";
+ label.appendChild(span);
+
+ const inputEl = document.createElement("input");
+ inputEl.setAttribute("type", "text");
+ inputEl.className = "uepromptinput";
+ span.appendChild(inputEl);
+
+ const widget = node.addDOMWidget(inputname, "input", label, {
+ getValue() { return inputEl.value; },
+ setValue(v) { inputEl.value = v; },
+ onDraw(w) {
+ // are we the most recently selected node?
+ if (Object.values(app.canvas.selected_nodes)[0]?.id == node.id) {
+ // if so, turn off DOM clipping
+ w.element.style.clipPath = null; w.element.style.willChange = null;
+ } else {
+ w.element.style.zIndex = 0;
+ const p = getClipPath(node, w.element);
+ w.element.style.clipPath = p;
+ let a;
+ }
+ }
+ });
+ //widget.element.hidden = true;
+ //widget.element.classList.add("ue_hide")
+ widget.element.style.display="block"
+
+ inputEl.onmousedown = function(e) {
+ const x = app.canvas.prompt("Value",widget.value,function(v) { this.value = v; }.bind(widget), e, false );
+ const input = x.getElementsByClassName("value")[0];
+ input.setAttribute("list", "uedynamiclist");
+ input.parentNode.style.zIndex = `${parseInt(label.style.zIndex ? label.style.zIndex : '0')+1}`;
+ input.addEventListener("input", function (v) {
+ widget.value = this.value;
+ LinkRenderController.instance().mark_link_list_outdated();
+ app.graph.setDirtyCanvas(true,true);
+ }.bind(input));
+ update_picklist(node, inputname);
+ e.stopImmediatePropagation();
+ }
+
+ widget.computeSize = function (parent_width) {
+ return parent_width ? [parent_width, 27] : [400, 20];
+ }
+
+ inputEl.addEventListener("focus", () => {
+ if (inputEl.value==".*") inputEl.value = "";
+ });
+
+ widget.onModeChange = function (mode) {
+ label.style.opacity = mode==4 ? 0.2 : 1.0;
+ }
+
+ node.loaded_when_collapsed = function() {
+ node.widgets?.forEach((widget) => {
+ if (widget.element) {
+ widget.element.hidden = true;
+ widget.element.classList.add("ue_hide")
+ }
+ })
+ }
+
+ return { widget };
+}
+
+function activate(node, widget) {
+ if (node.flags?.collapsed) return;
+ widget.element.hidden = false;
+ widget.element.classList.remove("ue_hide")
+}
+
+function add_autoprompts() {
+ const STRING = ComfyWidgets.STRING;
+ ComfyWidgets.STRING = function (node, inputName, inputData, app) {
+ if (true || !is_UEnode(node) || !inputName?.includes("regex") || !app.ui.settings.getSettingValue('AE.autoprompt', true)) {
+ return STRING.apply(this, arguments);
+ }
+ const atw = active_text_widget(node, inputName);
+ const orig_onAdded = node.onAdded;
+ node.onAdded = function () {
+ orig_onAdded?.apply(this, arguments);
+ activate(node, atw.widget);
+ }
+ return atw;
+ }
+ const datalist = document.createElement("datalist");
+ datalist.id = "uedynamiclist";
+ document.body.append(datalist);
+}
+
+function node_added(node) {
+ const a = 1;
+}
+
+
+export { add_autoprompts }
\ No newline at end of file
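The `intersect` and `union` helpers above are plain rectangle arithmetic (note that `intersect` returns an `[x, y, width, height]` array while `union` returns an object). They are module-private, so the calls below are a worked example of the expected values rather than importable usage:

```javascript
// Worked example of the rectangle helpers in use_everywhere_autoprompt.js.
const a = { x: 0, y: 0, width: 10, height: 10 };
const b = { x: 5, y: 5, width: 10, height: 10 };
// intersect(a, b) -> [5, 5, 5, 5]                         overlap as an [x, y, w, h] array
// union(a, b)     -> { x: 0, y: 0, width: 15, height: 15 } bounding box of both rectangles
```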
diff --git a/cg-use-everywhere/js/use_everywhere_classes.js b/cg-use-everywhere/js/use_everywhere_classes.js
new file mode 100644
index 0000000000000000000000000000000000000000..0dcf915b925b5bef029e97071cb9c832a8eacdd2
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_classes.js
@@ -0,0 +1,262 @@
+import { nodes_in_my_group, nodes_not_in_my_group, nodes_my_color, nodes_not_my_color, nodes_in_groups_matching } from "./use_everywhere_ui.js";
+import { Logger, node_is_live, get_real_node } from "./use_everywhere_utilities.js";
+
+function display_name(node) {
+ if (node?.title) return node.title;
+ if (node?.type) return node.type;
+ if (node?.properties['Node name for S&R']) return node.properties['Node name for S&R'];
+ return "un-nameable node";
+}
+
+/*
+The UseEverywhere object represents a single 'broadcast'. It generally contains
+    controller - the UE node that controls the broadcast
+ control_node_input_index - the input on that node
+ type - the data type
+ output - the output that is being rebroadcast as a list (node_id, output_index)
+ title_regex, input_regex - the UE? matching rules
+    priority - the priority (used to pick between multiple matching broadcasts)
+*/
+class UseEverywhere {
+ constructor() {
+ this.sending_to = [];
+ Object.assign(this, arguments[0]);
+ if (this.priority === undefined) this.priority = 0;
+ this.description = `source ${this?.output[0]}.${this?.output[1]} -> control ${this?.controller.id}.${this?.control_node_input_index} "${this.type}" <- (priority ${this.priority})`;
+ if (this.title_regex) this.description += ` - node title regex '${this.title_regex.source}'`;
+ if (this.input_regex) this.description += ` - input name regex '${this.input_regex.source}'`;
+ }
+
+ sending_differs_from(another_ue) {
+ if (this.sending_to.length != another_ue.sending_to.length) return true;
+        for (var i=0; i<this.sending_to.length; i++) {
+            if (this.sending_to[i].node.id != another_ue.sending_to[i].node.id) return true;
+            if (this.sending_to[i].input_index != another_ue.sending_to[i].input_index) return true;
+        }
+        return false;
+    }
+
+    note_sending_to(node, input) {
+        const input_index = node.inputs.findIndex((n) => n.name==input.name);
+ this.sending_to.push({node:node, input:input, input_index:input_index})
+ }
+ describe_sending(){
+ var description = " Linked to:";
+ this.sending_to.forEach((st) => description += `\n -> ${display_name(st.node)}, ${st.input.name}`);
+ if (this.sending_to.length===0) description += ' nothing';
+ return description;
+ }
+ describe() {
+ return this.description + "\n" + this.describe_sending();
+ }
+}
+
+function validity_errors(params) {
+ if (!node_is_live(params.controller)) return `UE node ${params.output[0]} is not alive`;
+ if (!node_is_live(get_real_node(params.output[0]))) return `upstream node ${params.output[0]} is not alive`;
+ return "";
+}
+
+class UseEverywhereList {
+ constructor() { this.ues = []; this.unmatched_inputs = []; }
+
+ differs_from(another_uel) {
+ if (!another_uel || !another_uel.ues || !this.ues) return true;
+ if (this.ues.length != another_uel.ues.length) return true;
+        for (var i=0; i<this.ues.length; i++) {
+            if (this.ues[i].sending_differs_from(another_uel.ues[i])) return true;
+        }
+        return false;
+    }
+
+    find_best_match(node, input, _ambiguity_messages) {
+        this.unmatched_inputs.push({node:node, input:input});
+        const matches = this.ues.filter((candidate) => (
+ candidate.matches(node, input)
+ ));
+ if (matches.length==0) {
+ Logger.log(Logger.INFORMATION, `'${display_name(node)}' optional input '${input.name}' unmatched`)
+ return undefined;
+ }
+ if (matches.length>1) {
+ matches.sort((a,b) => b.priority-a.priority);
+ if(matches[0].priority == matches[1].priority) {
+                const msg = `'${display_name(node)}' (${node.id}) input '${input.name}' matches multiple Use Everywhere sources:`;
+ _ambiguity_messages.push(msg);
+                for (var i=0; i<matches.length; i++) { _ambiguity_messages.push(matches[i].description); }
+                return undefined;
+            }
+        }
+        matches[0].note_sending_to(node, input);
+        return matches[0];
+    }
+
+    print_all() {
+        this.ues.forEach((ue) => { console.log(ue.describe()); });
+ }
+
+ all_unmatched_inputs(type) {
+ return this.unmatched_inputs.filter((ui)=>ui.input.type==type);
+ }
+
+ all_nodes_with_unmatched_input(type) {
+ const result = new Set();
+ this.all_unmatched_inputs(type).forEach((ui) => {
+ result.add(display_name(ui.node));
+ })
+ return result;
+ }
+
+ all_unmatched_input_names(type) {
+ const result = new Set();
+ this.all_unmatched_inputs(type).forEach((ui) => {
+ result.add(ui.input.label ? ui.input.label : ui.input.name);
+ })
+ return result;
+ }
+
+ all_group_names() {
+ const result = new Set();
+ app.graph._groups.forEach((group) => {
+ result.add(group.title);
+ })
+ return result;
+ }
+
+ all_connected_inputs(for_node) {
+ const ue_connections = [];
+ this.ues.forEach((ue) => {
+ ue.sending_to.forEach((st) => {
+ if (st.node.id == for_node.id) {
+ ue_connections.push({
+ type : ue.type,
+ input_index : st.input_index,
+ control_node : get_real_node(ue.controller.id),
+ control_node_input_index : ue.control_node_input_index,
+ sending_to : st.node,
+ });
+ }
+ });
+ });
+ return ue_connections;
+ }
+
+ all_ue_connections() {
+ const ue_connections = [];
+ this.ues.forEach((ue) => {
+ ue.sending_to.forEach((st) => {
+ ue_connections.push({
+ type : ue.type,
+ input_index : st.input_index,
+ control_node : get_real_node(ue.controller.id),
+ control_node_input_index : ue.control_node_input_index,
+ sending_to : st.node,
+ });
+ });
+ });
+ return ue_connections;
+ }
+
+ all_ue_connections_for(node_id) {
+ const ue_connections = [];
+ this.ues.forEach((ue) => {
+ ue.sending_to.forEach((st) => {
+ if (get_real_node(st.node.id).id==node_id || get_real_node(ue.controller.id).id==node_id) {
+ ue_connections.push({
+ type : ue.type,
+ input_index : st.input_index,
+ control_node : get_real_node(ue.controller.id),
+ control_node_input_index : ue.control_node_input_index,
+ sending_to : st.node,
+ });
+ }
+ });
+ });
+ return ue_connections;
+ }
+}
+
+export {UseEverywhereList}
\ No newline at end of file
diff --git a/cg-use-everywhere/js/use_everywhere_graph_analysis.js b/cg-use-everywhere/js/use_everywhere_graph_analysis.js
new file mode 100644
index 0000000000000000000000000000000000000000..e5eb8640dc9f4065e2fc21288217f8a1ad25e265
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_graph_analysis.js
@@ -0,0 +1,152 @@
+import { GroupNodeHandler } from "../core/groupNode.js";
+import { UseEverywhereList } from "./use_everywhere_classes.js";
+import { add_ue_from_node, add_ue_from_node_in_group } from "./use_everywhere_nodes.js";
+import { node_in_loop, node_is_live, is_connected, is_UEnode, Logger, get_real_node } from "./use_everywhere_utilities.js";
+import { app } from "../../scripts/app.js";
+
+class GraphAnalyser {
+ static _instance;
+ static instance() {
+ if (!this._instance) this._instance = new GraphAnalyser();
+ return this._instance;
+ }
+
+ constructor() {
+ this.original_graphToPrompt = app.graphToPrompt;
+ this.ambiguity_messages = [];
+ this.pause_depth = 0;
+ }
+
+ pause() { this.pause_depth += 1; }
+ unpause() { this.pause_depth -= 1; }
+
+
+ async analyse_graph(modify_and_return_prompt=false, check_for_loops=false, supress_before_queued=true) {
+ //try {
+ /*if (supress_before_queued) {
+ app.graph._nodes.forEach((node) => {
+ node.widgets?.forEach((widget) => {
+ if (widget.beforeQueued) {
+ widget.__beforeQueued = widget.beforeQueued;
+ widget.beforeQueued = null;
+ }
+ })
+ if(node.seedControl && node.seedControl.lastSeedButton){ // for efficiency nodes seedControl
+ node.seedControl.lastSeedButton.__disabled = node.seedControl.lastSeedButton.disabled
+ node.seedControl.lastSeedButton.disabled = true
+ }
+ })
+ }*/
+ //return this._analyse_graph(modify_and_return_prompt, check_for_loops);
+ /*} finally {
+ if (supress_before_queued) {
+ app.graph._nodes.forEach((node) => {
+ node.widgets?.forEach((widget) => {
+ if (widget.__beforeQueued) {
+ widget.beforeQueued = widget.__beforeQueued;
+ widget.__beforeQueued = null;
+ }
+ })
+ if(node.seedControl && node.seedControl.lastSeedButton){ // for efficiency nodes seedControl
+ node.seedControl.lastSeedButton.disabled = node.seedControl.lastSeedButton.__disabled
+ }
+ })
+ }
+ }*/
+ //}
+ //async _analyse_graph(modify_and_return_prompt=false, check_for_loops=false) {
+ if (this.pause_depth > 0) { return this.original_graphToPrompt.apply(app) }
+ this.ambiguity_messages = [];
+ var p;
+ if (modify_and_return_prompt) {
+ p = await this.original_graphToPrompt.apply(app);
+ p = structuredClone(p);
+ } else {
+ p = { workflow:app.graph.serialize() }
+ }
+
+ // Create a UseEverywhereList and populate it from all live (not bypassed) nodes
+ const ues = new UseEverywhereList();
+ const live_nodes = p.workflow.nodes.filter((node) => node_is_live(node))
+ live_nodes.filter((node) => is_UEnode(node)).forEach(node => { add_ue_from_node(ues, node); })
+ live_nodes.filter((node) => (get_real_node(node.id, Logger.INFORMATION) && GroupNodeHandler.isGroupNode(get_real_node(node.id)))).forEach( groupNode => {
+ const group_data = GroupNodeHandler.getGroupData(get_real_node(groupNode.id));
+ group_data.nodeData.nodes.filter((node) => is_UEnode(node)).forEach(node => {
+ add_ue_from_node_in_group(ues, node, groupNode.id, group_data);
+ })
+ })
+
+ const links_added = new Set();
+ // Look for unconnected inputs and see if we can connect them
+ live_nodes.filter((node) => !is_UEnode(node)).forEach(node => {
+ const nd = get_real_node(node.id, Logger.INFORMATION);
+ if (nd && !nd.properties.rejects_ue_links) {
+ var gpData = GroupNodeHandler.getGroupData(nd);
+ const isGrp = !!gpData;
+ const o2n = isGrp ? Object.entries(gpData.oldToNewInputMap) : null;
+ node.inputs?.forEach(input => {
+ if (!is_connected(input) && !(node.reject_ue_connection && node.reject_ue_connection(input))) {
+ var ue = ues.find_best_match(node, input, this.ambiguity_messages);
+ if (ue && modify_and_return_prompt) {
+ var effective_node = node;
+ var effective_node_slot = -1;
+ if (isGrp) { // the node we are looking at is a group node
+ const in_index = node.inputs.findIndex((i)=>i==input);
+ const inner_node_index = o2n.findIndex((l)=>Object.values(l[1]).includes(in_index));
+ const inner_node_slot_index = Object.values(o2n[inner_node_index][1]).findIndex((l)=>l==in_index);
+ effective_node_slot = Object.keys(o2n[inner_node_index][1])[inner_node_slot_index];
+ effective_node = nd.getInnerNodes()[o2n[inner_node_index][0]];
+ }
+ const upNode = get_real_node(ue.output[0]);
+ var effective_output = [ue.output[0], ue.output[1]];
+ if (GroupNodeHandler.isGroupNode(upNode)) { // the upstream node is a group node
+ const upGpData = GroupNodeHandler.getGroupData(upNode);
+ const up_inner_node = upGpData.newToOldOutputMap[ue.output[1]].node;
+ const up_inner_node_index = up_inner_node.index;
+ const up_inner_node_id = upNode.getInnerNodes()[up_inner_node_index].id;
+ const up_inner_node_slot = upGpData.newToOldOutputMap[ue.output[1]].slot;
+ effective_output = [`${up_inner_node_id}`, up_inner_node_slot];
+ }
+ if (effective_node_slot==-1) effective_node_slot = effective_node.inputs.findIndex((i)=>(i.label ? i.label : i.name)===(input.label ? input.label : input.name));
+ p.output[effective_node.id].inputs[effective_node.inputs[effective_node_slot].name] = effective_output;
+ links_added.add({
+ "downstream":effective_node.id, "downstream_slot":effective_node_slot,
+ "upstream":effective_output[0], "upstream_slot":effective_output[1],
+ "controller":ue.controller.id,
+ "type":ue.type
+ });
+ }
+ }
+ });
+ }
+ });
+
+ if (this.ambiguity_messages.length) Logger.log(Logger.PROBLEM, "Ambiguous connections", this.ambiguity_messages, Logger.CAT_AMBIGUITY);
+
+ // if there are loops report them and raise an exception
+ if (check_for_loops && app.ui.settings.getSettingValue('AE.checkloops', true)) {
+ try {
+ node_in_loop(live_nodes, links_added);
+ } catch (e) {
+ if (!e.stack) throw e;
+ if (e.ues && e.ues.length > 0){
+ alert(`Loop (${e.stack}) with broadcast (${e.ues}) - not submitting workflow`);
+ } else {
+ alert(`Loop (${e.stack}) - not submitting workflow`);
+ }
+ throw new Error(`Loop Detected ${e.stack}, ${e.ues}`, {"cause":e});
+ }
+ }
+
+ if (modify_and_return_prompt) {
+ [...links_added].forEach((l)=>{
+ p.workflow.last_link_id += 1;
+ p.workflow.links.push([p.workflow.last_link_id, parseInt(l.upstream), l.upstream_slot, l.downstream, l.downstream_slot, l.type])
+ })
+ return p;
+ }
+ else return ues;
+ }
+}
+
+export { GraphAnalyser }
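The records appended to `p.workflow.links` at the end of `analyse_graph` use the serialized-workflow link format visible in docs/test-workflow.json earlier in this diff: `[link_id, upstream_node_id, upstream_slot, downstream_node_id, downstream_slot, type]`. A minimal restatement of that push, with values that simply mirror link 534 from the test workflow:

```javascript
// Same shape as the push at the end of analyse_graph(); the sample values below
// reproduce link 534 from docs/test-workflow.json (VAEDecode 7.0 -> node 179.0).
function push_ue_link(workflow, l) {
    workflow.last_link_id += 1;
    workflow.links.push([workflow.last_link_id, parseInt(l.upstream), l.upstream_slot,
                         l.downstream, l.downstream_slot, l.type]);
}

const wf = { last_link_id: 533, links: [] };
push_ue_link(wf, { upstream: "7", upstream_slot: 0, downstream: 179, downstream_slot: 0, type: "*" });
console.log(wf.links);  // [[534, 7, 0, 179, 0, "*"]]
```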
diff --git a/cg-use-everywhere/js/use_everywhere_nodes.js b/cg-use-everywhere/js/use_everywhere_nodes.js
new file mode 100644
index 0000000000000000000000000000000000000000..94711c367794b6cc1b3777190c06a64d1924d6e7
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_nodes.js
@@ -0,0 +1,113 @@
+import { handle_bypass, get_real_node, get_group_node } from "./use_everywhere_utilities.js";
+import { app } from "../../scripts/app.js";
+
+const CONVERTED_TYPE = "converted-widget";
+// import {CONVERTED_TYPE} from "../../extensions/core/widgetInputs.js"
+
+/*
+If a widget hasn't been converted, just get its value.
+If it has, *try* to go upstream
+*/
+function get_widget_or_input_values(node_obj, widget_id) {
+ if (node_obj.widgets[widget_id]?.type.startsWith(CONVERTED_TYPE)) {
+ try {
+ const name = node_obj.widgets[widget_id].name;
+ const input_id = node_obj.inputs.findIndex((input) => input.name==name);
+ const connection = get_connection(node_obj, input_id, "STRING");
+ const upstream_node_obj = get_real_node(connection.link.origin_id.toString());
+ const widget = upstream_node_obj.widgets.find((w) => w.name.toLowerCase() == upstream_node_obj.outputs[connection.link.origin_slot].name.toLowerCase());
+ return widget.value;
+ } catch (error) {
+ return "NOT CONNECTED DONT MATCH";
+ }
+ }
+ return node_obj.widgets[widget_id].value;
+}
+
+function add_ue_from_node_in_group(ues, node, group_node_id, group_data) {
+ const group_node = get_real_node(group_node_id);
+ const ue_node = group_node.getInnerNodes()[node.index];
+ ue_node.in_group_with_data = group_data;
+ ue_node.getInnerNodesOfGroup = group_node.getInnerNodes;
+ add_ue_from_node(ues, ue_node)
+}
+
+function get_available_input_name(inputs, the_input, type) {
+ const used_names = [];
+ inputs.forEach((input) => { if (input!=the_input) used_names.push(input.name); });
+ const base = `UE ${type.toLowerCase()}`;
+ if (!used_names.includes(base)) return base;
+ for (var i=2; ;i++) {
+ if (!used_names.includes(`${base}${i}`)) return `${base}${i}`;
+ }
+}
+
+function get_connection(node, i, override_type) {
+ const in_link = node?.inputs[i].link;
+ var type = override_type;
+ var link = undefined;
+ if (in_link) {
+ if (!override_type) type = get_real_node(node.id.toString())?.input_type[i];
+ link = handle_bypass(app.graph.links[in_link],type);
+ } else if (node.in_group_with_data) {
+ if (node.in_group_with_data.linksTo[node.index] && node.in_group_with_data.linksTo[node.index][i]) {
+ const group_style_link = node.in_group_with_data.linksTo[node.index][i];
+ link = { "origin_id":node.getInnerNodesOfGroup()[group_style_link[0]].id, "origin_slot" : group_style_link[1] };
+ if (!override_type) type = group_style_link[5];
+ } else { // group external input
+ const group_node = get_group_node(node.id);
+ const group_node_input = group_node.inputs[node.in_group_with_data.oldToNewInputMap[node.index][i]];
+ const link_n = group_node_input.link;
+ if (link_n) {
+ link = app.graph.links[link_n];
+ if (!override_type) type = app.graph._nodes_by_id[link.origin_id].outputs[link.origin_slot].type;
+ // update the group input node... and the link type
+ group_node_input.type = type;
+ group_node_input.name = get_available_input_name(group_node.inputs, group_node_input, type);
+ link.type = type;
+ }
+ }
+ }
+ return { link:link, type:type }
+}
+
+/*
+Add UseEverywhere broadcasts from this node to the list
+*/
+function add_ue_from_node(ues, node) {
+ if (node.type === "Seed Everywhere") ues.add_ue(node, -1, "INT", [node.id.toString(),0],
+ undefined, new RegExp("seed|随机种"), undefined, 5);
+
+ if (node.type === "Anything Everywhere?") {
+ const connection = get_connection(node, 0);
+ if (connection.link) {
+ const node_obj = get_real_node(node.id.toString());
+ const w0 = get_widget_or_input_values(node_obj,0);
+ const r0 = new RegExp(w0);
+ const w1 = get_widget_or_input_values(node_obj,1);
+ const r1 = (w1.startsWith('+')) ? w1 : new RegExp(w1);
+ const w2 = get_widget_or_input_values(node_obj,2);
+ const r2 = (w2 && w2!=".*") ? new RegExp(w2) : null;
+ ues.add_ue(node, 0, connection.type, [connection.link.origin_id.toString(), connection.link.origin_slot], r0, r1, r2, 10);
+ }
+ }
+ if (node.type === "Prompts Everywhere") {
+ for (var i=0; i<2; i++) {
+ const connection = get_connection(node, i);
+ if (connection.link) ues.add_ue(node, i, connection.type, [connection.link.origin_id.toString(), connection.link.origin_slot],
+ undefined, new RegExp(["(_|\\b)pos(itive|_|\\b)|^prompt|正面","(_|\\b)neg(ative|_|\\b)|负面"][i]), undefined, 5);
+ }
+ }
+ if (node.type === "Anything Everywhere") {
+ const connection = get_connection(node, 0);
+        if (connection.link) ues.add_ue(node, 0, connection.type, [connection.link.origin_id.toString(), connection.link.origin_slot], undefined, undefined, undefined, 2);
+ }
+ if (node.type === "Anything Everywhere3") {
+ for (var i=0; i<3; i++) {
+ const connection = get_connection(node, i);
+ if (connection.link) ues.add_ue(node, i, connection.type, [connection.link.origin_id.toString(), connection.link.origin_slot]);
+ }
+ }
+}
+
+export {add_ue_from_node, add_ue_from_node_in_group}
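The "Prompts Everywhere" branch above relies on two hard-coded regexes to decide which target input is the positive prompt and which is the negative one. A quick standalone check of how they behave (the sample input names are just examples):

```javascript
// The same two patterns used for "Prompts Everywhere" in add_ue_from_node above.
const POSITIVE = new RegExp("(_|\\b)pos(itive|_|\\b)|^prompt|正面");
const NEGATIVE = new RegExp("(_|\\b)neg(ative|_|\\b)|负面");

console.log(POSITIVE.test("positive"));  // true
console.log(POSITIVE.test("prompt"));    // true  (matches ^prompt)
console.log(POSITIVE.test("negative"));  // false
console.log(NEGATIVE.test("negative"));  // true
console.log(NEGATIVE.test("text_neg"));  // true  (matches _neg at a word boundary)
```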
diff --git a/cg-use-everywhere/js/use_everywhere_settings.js b/cg-use-everywhere/js/use_everywhere_settings.js
new file mode 100644
index 0000000000000000000000000000000000000000..f1441a00325059cfa246ff500f4ad43c8baf6d4c
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_settings.js
@@ -0,0 +1,182 @@
+import { app } from "../../scripts/app.js";
+import { GraphAnalyser } from "./use_everywhere_graph_analysis.js";
+import { LinkRenderController } from "./use_everywhere_ui.js";
+import { convert_to_links, remove_all_ues } from "./use_everywhere_apply.js";
+import { has_priority_boost } from "./use_everywhere_utilities.js";
+
+function main_menu_settings() {
+
+ app.ui.settings.addSetting({
+ id: "AE.details",
+ name: "Anything Everywhere show node details",
+ type: "boolean",
+ defaultValue: false,
+ });
+ app.ui.settings.addSetting({
+ id: "AE.autoprompt",
+ name: "Anything Everywhere? autocomplete (may require page reload)",
+ type: "boolean",
+ defaultValue: true,
+ });
+ app.ui.settings.addSetting({
+ id: "AE.checkloops",
+ name: "Anything Everywhere check loops",
+ type: "boolean",
+ defaultValue: true,
+ });
+ app.ui.settings.addSetting({
+ id: "AE.showlinks",
+ name: "Anything Everywhere show links",
+ type: "combo",
+ options: [ {value:0, text:"All off"}, {value:1, text:"Selected nodes"}, {value:2, text:"Mouseover node"}, {value:3, text:"Selected and mouseover nodes"}, {value:4, text:"All on"}],
+ defaultValue: 0,
+ onChange: app.graph.change.bind(app.graph),
+ });
+ app.ui.settings.addSetting({
+ id: "AE.animate",
+ name: "Anything Everywhere animate UE links",
+ type: "combo",
+ options: [ {value:0, text:"Off"}, {value:1, text:"Dots"}, {value:2, text:"Pulse"}, {value:3, text:"Both"}, ],
+ defaultValue: 3,
+ onChange: app.graph.change.bind(app.graph),
+ });
+ app.ui.settings.addSetting({
+ id: "AE.stop.animation.running",
+ name: "Anything Everywhere turn animation off when running",
+ type: "boolean",
+ defaultValue: true,
+ onChange: app.graph.change.bind(app.graph),
+ });
+ app.ui.settings.addSetting({
+ id: "AE.highlight",
+ name: "Anything Everywhere highlight connected nodes",
+ type: "boolean",
+ defaultValue: true,
+ onChange: app.graph.change.bind(app.graph),
+ });
+ app.ui.settings.addSetting({
+ id: "AE.replacesearch",
+ name: "Anything Everywhere replace search",
+ type: "boolean",
+ defaultValue: true,
+ });
+}
+
+function submenu(properties, property, options, e, menu, node) {
+ const current = properties[property] ? (properties[property]==2 ? 3 : 2 ) : 1;
+ const submenu = new LiteGraph.ContextMenu(
+ options,
+ { event: e, callback: inner_function, parentMenu: menu, node: node }
+ );
+ const current_element = submenu.root.querySelector(`:nth-child(${current})`);
+ if (current_element) current_element.style.borderLeft = "2px solid #484";
+ function inner_function(v) {
+ if (node) {
+ const choice = Object.values(options).indexOf(v);
+ properties[property] = choice;
+ LinkRenderController.instance().mark_link_list_outdated();
+ }
+ }
+}
+
+const GROUP_RESTRICTION_OPTIONS = ["No restrictions", "Send only within group", "Send only not within group"]
+function group_restriction_submenu(value, options, e, menu, node) {
+ submenu(node.properties, "group_restricted", GROUP_RESTRICTION_OPTIONS, e, menu, node);
+}
+
+const COLOR_RESTRICTION_OPTIONS = ["No restrictions", "Send only to same color", "Send only to different color"]
+function color_restriction_submenu(value, options, e, menu, node) {
+ submenu(node.properties, "color_restricted", COLOR_RESTRICTION_OPTIONS, e, menu, node);
+}
+
+function priority_boost_submenu(value, options, e, menu, node) {
+ const current = (node.properties["priority_boost"] ? node.properties["priority_boost"] : 0) + 1;
+ const submenu = new LiteGraph.ContextMenu(
+ [0,1,2,3,4,5,6,7,8,9],
+ { event: e, callback: function (v) {
+ node.properties["priority_boost"] = parseInt(v);
+ LinkRenderController.instance().mark_link_list_outdated();
+ },
+ parentMenu: menu, node:node}
+ )
+ const current_element = submenu.root.querySelector(`:nth-child(${current})`);
+ if (current_element) current_element.style.borderLeft = "2px solid #484";
+}
+
+function non_ue_menu_settings(options, node) {
+ options.push(null);
+ options.push(
+ {
+ content: node.properties.rejects_ue_links ? "Allow UE Links" : "Reject UE Links",
+ has_submenu: false,
+ callback: () => { node.properties.rejects_ue_links = !!!node.properties.rejects_ue_links },
+ }
+ )
+}
+
+function node_menu_settings(options, node) {
+ options.push(null);
+ if (has_priority_boost(node)) options.push(
+ {
+ content: "Priority Boost",
+ has_submenu: true,
+ callback: priority_boost_submenu,
+ }
+ )
+ options.push(
+ {
+ content: "Group Restrictions",
+ has_submenu: true,
+ callback: group_restriction_submenu,
+ },
+ {
+ content: "Color Restrictions",
+ has_submenu: true,
+ callback: color_restriction_submenu,
+ },
+ {
+ content: "Convert to real links",
+ callback: async () => {
+ const ues = await GraphAnalyser.instance().analyse_graph();
+ convert_to_links(ues, node.id);
+ app.graph.remove(node);
+ }
+ }
+ )
+ options.push(null);
+}
+
+function canvas_menu_settings(options) {
+ options.push(null); // divider
+ options.push({
+ content: (app.ui.settings.getSettingValue('AE.showlinks', 0)>0) ? "Hide UE links" : "Show UE links",
+ callback: () => {
+ const setTo = (app.ui.settings.getSettingValue('AE.showlinks', 0)>0) ? 0 : 4;
+ app.ui.settings.setSettingValue('AE.showlinks', setTo);
+ app.graph.change();
+ }
+ },
+ {
+ content: "Convert all UEs to real links",
+ callback: async () => {
+ if (window.confirm("This will convert all links created by Use Everywhere to real links, and delete all the Use Everywhere nodes. Is that what you want?")) {
+ const ues = await GraphAnalyser.instance().analyse_graph();
+ LinkRenderController.instance().pause();
+ convert_to_links(ues, -1);
+ remove_all_ues();
+ app.graph.change();
+ }
+ }
+ });
+ if (GraphAnalyser.instance().ambiguity_messages.length) {
+ options.push({
+ content: "Show UE broadcast clashes",
+ callback: async () => {
+ alert(GraphAnalyser.instance().ambiguity_messages.join("\n"))
+ }
+ })
+ }
+ options.push(null); // divider
+}
+
+export { main_menu_settings, node_menu_settings, canvas_menu_settings, non_ue_menu_settings }
\ No newline at end of file
diff --git a/cg-use-everywhere/js/use_everywhere_ui.js b/cg-use-everywhere/js/use_everywhere_ui.js
new file mode 100644
index 0000000000000000000000000000000000000000..ca9c6e4114ff3c52d4dd6765f82913554b49ef13
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_ui.js
@@ -0,0 +1,358 @@
+import { Logger, get_real_node, get_group_node, get_all_nodes_within } from "./use_everywhere_utilities.js";
+import { ComfyWidgets } from "../../scripts/widgets.js";
+import { app } from "../../scripts/app.js";
+
+export class UpdateBlocker {
+ static depth = 0
+ static push() { UpdateBlocker.depth += 1 }
+ static pop() { UpdateBlocker.depth -= 1 }
+ static blocking() { return UpdateBlocker.depth>0 }
+}
+
+function nodes_in_my_group(node_id) {
+ const nodes_in = new Set();
+ app.graph._groups.forEach((group) => {
+ if (!app.canvas.selected_group_moving) group.recomputeInsideNodes();
+ if (group._nodes?.find((node) => { return (node.id===node_id) } )) {
+ group._nodes.forEach((node) => { nodes_in.add(node.id) } )
+ }
+ });
+ return [...nodes_in];
+}
+
+function nodes_not_in_my_group(node_id) {
+ const nid = nodes_in_my_group(node_id);
+ const nodes_not_in = [];
+ app.graph._nodes.forEach((node) => {
+ if (!nid.includes(node.id)) nodes_not_in.push(node.id);
+ });
+ return nodes_not_in;
+}
+
+function nodes_in_groups_matching(regex, already_limited_to) {
+ const nodes_in = new Set();
+ app.graph._groups.forEach((group) => {
+ if (regex.test(group.title)) {
+ if (!app.canvas.selected_group_moving) group.recomputeInsideNodes();
+ /*
+            Note for optimisation - it would be more efficient to calculate which nodes are in which groups
+            once at the start of analyse_graph(), rather than recomputing for every group for every UE? node that has a group regex.
+ */
+ group._nodes.forEach((node) => {
+ if (!already_limited_to || already_limited_to.includes(node.id)) {
+ nodes_in.add(node.id)
+ }
+ } );
+ }
+ });
+ return [...nodes_in];
+}
+
+
+function nodes_my_color(node_id, already_limited_to) {
+ const nodes_in = new Set();
+ const color = get_real_node(node_id).color;
+ if (already_limited_to) {
+ already_limited_to.forEach((nid) => {
+ if (get_real_node(nid).color==color) nodes_in.add(nid)
+ })
+ } else {
+ app.graph._nodes.forEach((node) => {
+ if (node.color==color) nodes_in.add(node.id)
+ })
+ }
+ return [...nodes_in];
+}
+
+function nodes_not_my_color(node_id, already_limited_to) {
+ const nodes_in = new Set();
+ const color = get_real_node(node_id).color;
+ if (already_limited_to) {
+ already_limited_to.forEach((nid) => {
+ if (get_real_node(nid).color!=color) nodes_in.add(nid)
+ })
+ } else {
+ app.graph._nodes.forEach((node) => {
+ if (node.color!=color) nodes_in.add(node.id)
+ })
+ }
+ return [...nodes_in];
+}
+
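+// Draw a small rounded square in the node's title bar to show that group/color restrictions are active.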
+function indicate_restriction(ctx, title_height) {
+ ctx.save();
+ ctx.lineWidth = 2;
+ ctx.strokeStyle = "#6F6";
+ ctx.beginPath();
+ ctx.roundRect(5,5-title_height,20,20,8);
+ ctx.stroke();
+ ctx.restore();
+}
+
+function displayMessage(id, message) {
+ const node = get_real_node(id);
+ if (!node) return;
+ var w = node.widgets?.find((w) => w.name === "display_text_widget");
+ if (app.ui.settings.getSettingValue('AE.details', false) || w) {
+ if (!w) {
+ w = ComfyWidgets["STRING"](this, "display_text_widget", ["STRING", { multiline: true }], app).widget;
+ w.inputEl.readOnly = true;
+ w.inputEl.style.opacity = 0.6;
+ w.inputEl.style.fontSize = "9pt";
+ }
+ w.value = message;
+ this.onResize?.(this.size);
+ }
+}
+
+function update_input_label(node, slot, app) {
+ if (node.input_type[slot]) {
+ node.inputs[slot].name = node.input_type[slot];
+ node.inputs[slot].color_on = app.canvas.default_connection_color_byType[node.input_type[slot]];
+ } else {
+ node.inputs[slot].name = "anything";
+ node.inputs[slot].color_on = undefined;
+ }
+}
+
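+/*
+Singleton that caches the list of UE (virtual) links produced by the graph analyser,
+marks it outdated when the graph changes, and draws the virtual links and input highlights on the canvas.
+*/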
+class LinkRenderController {
+ static _instance;
+ static instance(tga) {
+ if (!this._instance) this._instance = new LinkRenderController();
+ if (tga && !this._instance.the_graph_analyser) this._instance.the_graph_analyser = tga;
+ return this._instance
+ }
+ constructor() {
+ this.the_graph_analyser = null;
+ this.periodically_mark_link_list_outdated();
+ }
+
+ ue_list = undefined; // the most current ue list - set to undefined if we know it is out of date
+ ue_list_reloading = false; // true when a reload has been requested but not completed
+ last_used_ue_list = undefined; // the last ue list we actually used to generate graphics
+ paused = false;
+ reading_list = false; // don't outdate the list while we read it (because reading it can trigger outdates!)
+
+ queue_size = null;
+ note_queue_size(x) { this.queue_size = x; }
+
+ pause(ms) {
+ this.paused = true;
+ if (!ms) ms = 100;
+ setTimeout( this.unpause.bind(this), ms );
+ }
+ unpause() {
+ this.paused = false;
+ app.graph.change();
+ }
+
+ // memory reuse
+ slot_pos1 = new Float32Array(2); //to reuse
+ slot_pos2 = new Float32Array(2); //to reuse
+
+ mark_link_list_outdated() {
+ if (UpdateBlocker.blocking()) return;
+ if (this.reading_list) return;
+ if (this.ue_list) {
+ this.ue_list = undefined;
+ this.request_link_list_update();
+ Logger.log(Logger.INFORMATION, "link_list marked outdated");
+ } else {
+ Logger.log(Logger.INFORMATION, "link_list was already outdated");
+ }
+ }
+
+ periodically_mark_link_list_outdated() {
+ this.mark_link_list_outdated();
+ setTimeout(this.periodically_mark_link_list_outdated.bind(this), 1000);
+ }
+
+ // callback when the_graph_analyser finishes - store the result and note reloading is false
+ reload_resolve = function (value) {
+ this.ue_list = value;
+ this.ue_list_reloading = false;
+ if (this.ue_list.differs_from(this.last_used_ue_list)) app.graph.change();
+ Logger.log(Logger.INFORMATION, "link list update completed");
+ Logger.log_call(Logger.DETAIL, this.ue_list.print_all);
+ }.bind(this)
+
+ // callback for when the_graph_analyser fails - note reloading is false and log
+ reload_reject = function(reason) {
+ this.ue_list_reloading=false;
+ Logger.log(Logger.ERROR, "link list update failed");
+ Logger.log_error(Logger.ERROR, reason);
+ }.bind(this)
+
+ // request an update to the ue_list.
+ request_link_list_update() {
+ if (this.ue_list_reloading) return; // already doing it
+ this.ue_list_reloading = true; // stop any more requests
+ this.the_graph_analyser.analyse_graph().then(this.reload_resolve, this.reload_reject); // an async call is a promise; pass it two callbacks
+ Logger.log(Logger.INFORMATION, "link list update started");
+ }
+
+ highlight_ue_connections(node, ctx) {
+ try {
+ this._highlight_ue_connections(node, ctx);
+ } catch (e) {
+ console.error(e);
+ }
+ }
+
+ _highlight_ue_connections(node, ctx) {
+        if (!app.ui.settings.getSettingValue('AE.highlight', true)) return;
+        //if (this._ue_links_visible) return;
+        if (!this.list_ready()) return;
+        this.reading_list = true; // set only after the early returns, so an early exit cannot leave the flag stuck on
+
+ if (this.ue_list.all_connected_inputs) {
+ this.ue_list.all_connected_inputs(node).forEach((ue_connection) => {
+ if (!ue_connection.control_node) { // control node deleted...
+ this.mark_link_list_outdated();
+ return;
+ }
+ var pos2 = node.getConnectionPos(true, ue_connection.input_index, this.slot_pos1);
+ pos2[0] -= node.pos[0];
+ pos2[1] -= node.pos[1];
+ ctx.save();
+ ctx.lineWidth = 1;
+ var radius=5
+ ctx.strokeStyle = LGraphCanvas.link_type_colors[ue_connection.type];
+ ctx.shadowColor = "white";
+ ctx.shadowBlur = 10;
+ ctx.shadowOffsetX = 0;
+ ctx.shadowOffsetY = 0;
+ ctx.beginPath();
+ ctx.roundRect(pos2[0]-radius,pos2[1]-radius,2*radius,2*radius,radius);
+ ctx.stroke();
+ ctx.beginPath();
+ ctx.strokeStyle = "black";
+ ctx.shadowBlur = 0;
+ radius = radius - 1;
+ ctx.roundRect(pos2[0]-radius,pos2[1]-radius,2*radius,2*radius,radius);
+ ctx.stroke();
+
+ ctx.restore();
+ });
+ }
+ this.reading_list = false;
+ }
+
+ list_ready(make_latest) {
+ if (this.paused) return false;
+ if (!this.the_graph_analyser) return false; // we don't have the analyser yet (still loading)
+ if (!this.ue_list) this.request_link_list_update();
+ if (!this.ue_list) return false;
+ if (make_latest) this.last_used_ue_list = this.ue_list;
+ return true;
+ }
+
+ node_in_ueconnection(ue_connection, id) {
+ if (ue_connection.control_node && get_group_node(ue_connection.control_node.id)?.id == id) return true
+ if (ue_connection.sending_to && get_group_node(ue_connection.sending_to.id)?.id == id) return true
+ }
+
+ any_node_in_ueconnection(ue_connection, list_of_nodes) {
+ return (Object.values(list_of_nodes).find((node) => (this.node_in_ueconnection(ue_connection, node.id))))?true:false;
+ }
+
+ render_all_ue_links(ctx) {
+ try {
+ this._render_all_ue_links(ctx);
+ } catch (e) {
+ console.error(e);
+ }
+ }
+
+ _render_all_ue_links(ctx) {
+ if (!this.list_ready(true)) return;
+
+ this.reading_list = true;
+ ctx.save();
+ const orig_hqr = app.canvas.highquality_render;
+ app.canvas.highquality_render = false;
+
+ const mode = app.ui.settings.getSettingValue('AE.showlinks', 0);
+ var animate = app.ui.settings.getSettingValue('AE.animate', 3);
+ if (app.ui.settings.getSettingValue('AE.stop.animation.running', true) && this.queue_size>0) animate = 0;
+ if (animate==2 || animate==3) this.animate_step(ctx);
+
+ var any_links_shown = false;
+ var any_links = false;
+
+ this.ue_list.all_ue_connections().forEach((ue_connection) => {
+ any_links = true;
+ var show = false;
+ if (mode==4) show = true;
+ if ( (mode==2 || mode==3) && app.canvas.node_over && this.node_in_ueconnection(ue_connection, app.canvas.node_over.id) ) show = true;
+ if ( (mode==1 || mode==3) && this.any_node_in_ueconnection(ue_connection, app.canvas.selected_nodes)) show = true;
+
+ if ( show ) {
+ this._render_ue_link(ue_connection, ctx, animate);
+ any_links_shown = true;
+ }
+ });
+
+
+ if (animate>0) {
+ /*
+ If animating, we want to mark the visuals as changed so the animation updates - but not often!
+ If links shown:
+ - If showing dots, wait 30ms
+ - Otherwise, wait 100ms
+ If no links are shown
+ - If there are links, and our mode is mouseover, wait 200ms
+ - Otherwise don't request an update (there are no links that could be shown without something else requesting a redraw)
+ */
+ const timeout = (any_links_shown) ? ((animate%2 == 1) ? 30 : 100) : ((mode==2 || mode==3) && any_links) ? 200 : -1;
+ if (timeout>0) setTimeout( app.graph.change.bind(app.graph), timeout );
+ }
+
+ app.canvas.highquality_render = orig_hqr;
+ ctx.restore();
+ this.reading_list = false;
+ }
+
+
+ _render_ue_link(ue_connection, ctx, animate) {
+ try {
+ const node = get_real_node(ue_connection.sending_to.id);
+
+ /* this is the end node; get the position of the input */
+ var pos2 = node.getConnectionPos(true, ue_connection.input_index, this.slot_pos1);
+
+ /* get the position of the *input* that is being echoed - except for the Seed Anywhere node,
+ which is displayed with an output: the class records control_node_input_index as -ve (-1 => 0, -2 => 1...) */
+ const input_source = (ue_connection.control_node_input_index >= 0);
+ const source_index = input_source ? ue_connection.control_node_input_index : -1-ue_connection.control_node_input_index;
+ const pos1 = get_group_node(ue_connection.control_node.id).getConnectionPos(input_source, source_index, this.slot_pos2);
+
+ /* get the direction that we start and end */
+ const delta_x = pos2[0] - pos1[0];
+ const delta_y = pos2[1] - pos1[1];
+ const end_direction = LiteGraph.LEFT; // always end going into an input
+ const sta_direction = ((Math.abs(delta_y) > Math.abs(delta_x))) ?
+ ((delta_y>0) ? LiteGraph.DOWN : LiteGraph.UP) :
+ ((input_source && delta_x<0) ? LiteGraph.LEFT : LiteGraph.RIGHT)
+
+ var color = LGraphCanvas.link_type_colors[ue_connection.type];
+ if (color=="") color = app.canvas.default_link_color;
+ ctx.shadowColor = color;
+
+ app.canvas.renderLink(ctx, pos1, pos2, undefined, true, animate%2, color, sta_direction, end_direction, undefined);
+ } catch (e) {
+ Logger.log(Logger.PROBLEM, `Couldn't render UE link ${ue_connection}. That's ok if something just got deleted.`);
+ }
+ }
+
+ animate_step(ctx) {
+ const max_blur = 8;
+ const speed = 0.75;
+ var f = (LiteGraph.getTime()*0.001*speed) % 1;
+ const step = Math.ceil(f*2*max_blur) % (2*max_blur);
+        ctx.shadowBlur = (step < max_blur) ? step : 2*max_blur - step;
+    }
+}
diff --git a/cg-use-everywhere/js/use_everywhere_utilities.js b/cg-use-everywhere/js/use_everywhere_utilities.js
new file mode 100644
--- /dev/null
+++ b/cg-use-everywhere/js/use_everywhere_utilities.js
+function find_all_upstream(node_id, links_added) {
+    const node = get_real_node(node_id);
+    const all_upstream = [];
+    node.inputs?.forEach((input) => { // normal links
+ const link_id = input.link;
+ if (link_id) {
+ const link = app.graph.links[link_id];
+ if (link) all_upstream.push({id:link.origin_id, slot:link.origin_slot});
+ }
+ });
+ links_added.forEach((la)=>{ // UE links
+ if (get_real_node(la.downstream).id==node.id) {
+ all_upstream.push({id:la.upstream, slot:la.upstream_slot, ue:la.controller.toString()})
+ }
+ });
+ if (node.id != get_group_node(node.id).id) { // node is in group
+ const grp_nd = get_group_node(node.id).id;
+ const group_data = GroupNodeHandler.getGroupData(get_group_node(node.id));
+ const indx = group_data.nodeData.nodes.findIndex((n)=>n.pos[0]==node.pos[0] && n.pos[1]==node.pos[1]);
+ if (indx>=0) {
+ if (GroupNodeHandler.getGroupData(app.graph._nodes_by_id[grp_nd])?.linksTo?.[indx] ) { // links within group
+ Object.values(GroupNodeHandler.getGroupData(app.graph._nodes_by_id[grp_nd]).linksTo[indx]).forEach((internal_link) => {
+ all_upstream.push({id:`${grp_nd}:${internal_link[0]}`, slot:internal_link[1]});
+ });
+ }
+ if (GroupNodeHandler.getGroupData(app.graph._nodes_by_id[grp_nd]).oldToNewInputMap?.[indx]) { // links out of group
+ Object.values(GroupNodeHandler.getGroupData(app.graph._nodes_by_id[grp_nd]).oldToNewInputMap?.[indx]).forEach((groupInput) => {
+ const link_id = get_group_node(node.id).inputs?.[groupInput]?.link;
+ if (link_id) {
+ const link = app.graph.links[link_id];
+ if (link) all_upstream.push({id:link.origin_id, slot:link.origin_slot});
+ }
+ })
+ }
+ }
+ }
+ return all_upstream;
+}
+
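+/*
+Depth-first walk upstream from node_id, following both real links and the UE virtual links in links_added.
+Throws a LoopError if the current path revisits a node; returns the number of nodes checked.
+*/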
+function recursive_follow(node_id, start_node_id, links_added, stack, nodes_cleared, ues, count, slot) {
+ const node = get_real_node(node_id);
+ if (slot>=0 && GroupNodeHandler.isGroupNode(node)) { // link into group
+ const mapped = GroupNodeHandler.getGroupData(node).newToOldOutputMap[slot];
+ return recursive_follow(`${node.id}:${mapped.node.index}`, start_node_id, links_added, stack, nodes_cleared, ues, count, mapped.slot);
+ }
+ count += 1;
+ if (stack.includes(node.id.toString())) throw new LoopError(node.id, new Set(stack), new Set(ues));
+ if (nodes_cleared.has(node.id.toString())) return;
+ stack.push(node.id.toString());
+
+ find_all_upstream(node.id, links_added).forEach((upstream) => {
+ if (upstream.ue) ues.push(upstream.ue);
+ count = recursive_follow(upstream.id, start_node_id, links_added, stack, nodes_cleared, ues, count, upstream.slot);
+ if (upstream.ue) ues.pop();
+ })
+
+ nodes_cleared.add(node.id.toString());
+ stack.pop();
+ return count;
+}
+
+/*
+Throw a LoopError if there is a loop.
+live_nodes is a list of all live (ie not bypassed) nodes in the graph
+links_added is a list of the UE virtuals links
+*/
+function node_in_loop(live_nodes, links_added) {
+ var nodes_to_check = [];
+ const nodes_cleared = new Set();
+ live_nodes.forEach((n)=>nodes_to_check.push(get_real_node(n.id).id));
+ var count = 0;
+ while (nodes_to_check.length>0) {
+ const node_id = nodes_to_check.pop();
+ count += recursive_follow(node_id, node_id, links_added, [], nodes_cleared, [], 0, -1);
+ nodes_to_check = nodes_to_check.filter((nid)=>!nodes_cleared.has(nid.toString()));
+ }
+ console.log(`node_in_loop made ${count} checks`)
+}
+
+/*
+Is a node alive (ie not bypassed or set to never)
+*/
+function node_is_live(node){
+ if (!node) return false;
+ if (node.mode===0) return true;
+ if (node.mode===2 || node.mode===4) return false;
+ Logger.log(Logger.ERROR, `node ${node.id} has mode ${node.mode} - I only understand modes 0, 2 and 4`);
+ return true;
+}
+
+function node_is_bypassed(node) {
+ return (node.mode===4);
+}
+
+/*
+Given a link object, and the type of the link,
+go upstream, following links with the same type, until you find a parent node which isn't bypassed.
+If either type or original link is null, or if the upstream thread ends, return null
+*/
+function handle_bypass(original_link, type) {
+ if (!type || !original_link) return null;
+ var link = original_link;
+ var parent = get_real_node(link.origin_id);
+ if (!parent) return null;
+ while (node_is_bypassed(parent)) {
+ if (!parent.inputs) return null;
+ var link_id;
+ if (parent?.inputs[link.origin_slot]?.type == type) link_id = parent.inputs[link.origin_slot].link; // try matching number first
+ else link_id = parent.inputs.find((input)=>input.type==type)?.link;
+ if (!link_id) { return null; }
+ link = app.graph.links[link_id];
+ parent = get_real_node(link.origin_id);
+ }
+ return link;
+}
+
+function all_group_nodes() {
+ return app.graph._nodes.filter((node) => GroupNodeHandler.isGroupNode(node));
+}
+
+function is_in_group(node_id, group_node) {
+ return group_node.getInnerNodes().find((inner_node) => (inner_node.id==node_id));
+}
+
+/*
+Return the group node if this node_id is part of a group, else return the node itself.
+Returns a full node object
+*/
+function get_group_node(node_id, level=Logger.ERROR) {
+ const nid = node_id.toString();
+ var gn = app.graph._nodes_by_id[nid];
+ if (!gn && nid.includes(':')) gn = app.graph._nodes_by_id[nid.split(':')[0]];
+ if (!gn) gn = all_group_nodes().find((group_node) => is_in_group(nid, group_node));
+ if (!gn) Logger.log(level, `get_group node couldn't find ${nid}`)
+ return gn;
+}
+
+/*
+Return the node object for this node_id.
+- if it's in _nodes_by_id return it
+- if it is of the form x:y find it in group node x
+- if it is the real node number of something in a group, get it from the group
+*/
+function get_real_node(node_id, level=Logger.INFORMATION) {
+ const nid = node_id.toString();
+ var rn = app.graph._nodes_by_id[nid];
+ if (!rn && nid.includes(':')) rn = app.graph._nodes_by_id[nid.split(':')[0]]?.getInnerNodes()[nid.split(':')[1]]
+ if (!rn) {
+ all_group_nodes().forEach((node) => {
+ if (!rn) rn = node.getInnerNodes().find((inner_node) => (inner_node.id==nid));
+ })
+ }
+ if (!rn) Logger.log(level, `get_real_node couldn't find ${node_id} - ok during loading, shortly after node deletion etc.`)
+ return rn;
+}
+
+function get_all_nodes_within(node_id) {
+ const node = get_group_node(node_id);
+ if (GroupNodeHandler.isGroupNode(node)) return node.getInnerNodes();
+ return [];
+}
+
+
+/*
+Does this input connect upstream to a live node?
+*/
+function is_connected(input) {
+ const link_id = input.link;
+ if (link_id === null) return false; // no connection
+ var the_link = app.graph.links[link_id];
+ if (!the_link) return false;
+ the_link = handle_bypass(the_link, the_link.type); // find the link upstream of bypasses
+ if (!the_link) return false; // no source for data.
+ return true;
+}
+
+/*
+Is this a UE node?
+*/
+function is_UEnode(node_or_nodeType) {
+ const title = node_or_nodeType.type ?? node_or_nodeType.comfyClass;
+ return ((title) && (title.startsWith("Anything Everywhere") || title==="Seed Everywhere" || title==="Prompts Everywhere"))
+}
+function is_helper(node_or_nodeType) {
+ const title = node_or_nodeType.type ?? node_or_nodeType.comfyClass;
+ return ((title) && (title.startsWith("Simple String")))
+}
+function has_priority_boost(node_or_nodeType) {
+ const title = node_or_nodeType.type ?? node_or_nodeType.comfyClass;
+ return ((title) && (title == "Anything Everywhere?"))
+}
+
+/*
+Inject a call into a method on object with name methodname.
+The injection is added at the end of the existing method (if the method didn't exist, it is created)
+injectionthis and injectionarguments are passed into the apply call (as the this and the arguments)
+*/
+function inject(object, methodname, tracetext, injection, injectionthis, injectionarguments) {
+ const original = object[methodname];
+ object[methodname] = function() {
+ Logger.trace(`${tracetext} hijack`, arguments);
+ original?.apply(this, arguments);
+ injection.apply(injectionthis, injectionarguments);
+ }
+}
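+// Example call shape (hypothetical names): inject(some_object, "some_method", "trace label", extra_fn, extra_this, [arg1, arg2]);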
+
+
+export { node_in_loop, handle_bypass, node_is_live, is_connected, is_UEnode, is_helper, inject, Logger, get_real_node, get_group_node, get_all_nodes_within, has_priority_boost}
+
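+// Like Object.defineProperty, but chains with any existing getter/setter on the instance instead of replacing it.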
+export function defineProperty(instance, property, desc) {
+ const existingDesc = Object.getOwnPropertyDescriptor(instance, property);
+ if (existingDesc?.configurable === false) {
+ throw new Error(`Error: Cannot define un-configurable property "${property}"`);
+ }
+ if (existingDesc?.get && desc.get) {
+ const descGet = desc.get;
+ desc.get = () => {
+ existingDesc.get.apply(instance, []);
+ return descGet.apply(instance, []);
+ };
+ }
+ if (existingDesc?.set && desc.set) {
+ const descSet = desc.set;
+ desc.set = (v) => {
+ existingDesc.set.apply(instance, [v]);
+ return descSet.apply(instance, [v]);
+ };
+ }
+ desc.enumerable = desc.enumerable ?? existingDesc?.enumerable ?? true;
+ desc.configurable = desc.configurable ?? existingDesc?.configurable ?? true;
+ if (!desc.get && !desc.set) {
+ desc.writable = desc.writable ?? existingDesc?.writable ?? true;
+ }
+ return Object.defineProperty(instance, property, desc);
+ }
\ No newline at end of file
diff --git a/cg-use-everywhere/pyproject.toml b/cg-use-everywhere/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..6e81f148996869d90cae87c767c1e3b797cb3f81
--- /dev/null
+++ b/cg-use-everywhere/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "cg-use-everywhere"
+description = "A set of nodes that allow data to be 'broadcast' to some or all unconnected inputs. Greatly reduces link spaghetti."
+version = "5.0.6"
+license = { file = "LICENSE" }
+
+[project.urls]
+Repository = "https://github.com/chrisgoringe/cg-use-everywhere"
+
+[tool.comfy]
+PublisherId = "chrisgoringe"
+DisplayName = "cg-use-everywhere"
+Icon = ""
diff --git a/cg-use-everywhere/tests/compare.png b/cg-use-everywhere/tests/compare.png
new file mode 100644
index 0000000000000000000000000000000000000000..9e374d06099a9fa3085b1411a88a978b5cae9ce6
Binary files /dev/null and b/cg-use-everywhere/tests/compare.png differ
diff --git a/cg-use-everywhere/tests/test.md b/cg-use-everywhere/tests/test.md
new file mode 100644
index 0000000000000000000000000000000000000000..2db2a4211aa7526f6612dc90b64e48d99d6d9227
--- /dev/null
+++ b/cg-use-everywhere/tests/test.md
@@ -0,0 +1,3 @@
+# Testing
+
+Any image in this folder should have its workflow saved with it, and that workflow should generate the same image.
\ No newline at end of file
diff --git a/cg-use-everywhere/tests/test.png b/cg-use-everywhere/tests/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..b642916dcbefde67253b7b0c17c205b10ed023dc
Binary files /dev/null and b/cg-use-everywhere/tests/test.png differ
diff --git a/cg-use-everywhere/tests/test2.png b/cg-use-everywhere/tests/test2.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ddd6017ef71bbecab8412bd4a2cd17ae59ac0ec
Binary files /dev/null and b/cg-use-everywhere/tests/test2.png differ
diff --git a/cg-use-everywhere/use_everywhere.py b/cg-use-everywhere/use_everywhere.py
new file mode 100644
index 0000000000000000000000000000000000000000..ded6ef75f2f3711b1e32aa4a281461e02600401f
--- /dev/null
+++ b/cg-use-everywhere/use_everywhere.py
@@ -0,0 +1,86 @@
+from server import PromptServer
+import torch
+
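+# Send a short, human-readable description of a value (tensor/latent shape, or its string form) to the browser so it can be shown on the node.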
+def message(id,message):
+ if isinstance(message, torch.Tensor):
+ string = f"Tensor shape {message.shape}"
+ elif isinstance(message, dict) and "samples" in message and isinstance(message["samples"], torch.Tensor):
+ string = f"Latent shape {message['samples'].shape}"
+ else:
+ string = f"{message}"
+ PromptServer.instance.send_sync("ue-message-handler", {"id": id, "message":string})
+
+class Base():
+ OUTPUT_NODE = True
+ FUNCTION = "func"
+ CATEGORY = "everywhere"
+ RETURN_TYPES = ()
+
+class SimpleString(Base):
+ OUTPUT_NODE = False
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required":{ "string": ("STRING", {"default": ""}) }}
+ RETURN_TYPES = ("STRING",)
+
+ def func(self,string):
+ return (string,)
+
+class SeedEverywhere(Base):
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required":{ "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}) },
+ "hidden": {"id":"UNIQUE_ID"} }
+
+ RETURN_TYPES = ("INT",)
+
+ def func(self, seed, id):
+ message(id, seed)
+ return (seed,)
+
+class AnythingEverywhere(Base):
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required":{},
+ "optional": { "anything" : ("*", {}), },
+ "hidden": {"id":"UNIQUE_ID"} }
+
+ def func(self, id, **kwargs):
+ for key in kwargs:
+ message(id, kwargs[key],)
+ return ()
+
+class AnythingEverywherePrompts(Base):
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required":{},
+ "optional": { "+ve" : ("*", {}), "-ve" : ("*", {}), } }
+
+ def func(self, **kwargs):
+ return ()
+
+class AnythingEverywhereTriplet(Base):
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required":{},
+ "optional": { "anything" : ("*", {}), "anything2" : ("*", {}), "anything3" : ("*", {}),} }
+
+ def func(self, **kwargs):
+ return ()
+
+class AnythingSomewhere(Base):
+ @classmethod
+ def INPUT_TYPES(s):
+ return {"required":{},
+ "optional": {
+ "anything" : ("*", {}),
+ "title_regex" : ("STRING", {"default":".*"}),
+ "input_regex" : ("STRING", {"default":".*"}),
+ "group_regex" : ("STRING", {"default":".*"}),
+ },
+ "hidden": {"id":"UNIQUE_ID"} }
+
+ def func(self, id, title_regex=None, input_regex=None, group_regex=None, **kwargs):
+ for key in kwargs:
+ message(id, kwargs[key],)
+ return ()
diff --git a/cg-use-everywhere/workflow_fixer.py b/cg-use-everywhere/workflow_fixer.py
new file mode 100644
index 0000000000000000000000000000000000000000..48812442387d182efb2662764a7effe9dea16131
--- /dev/null
+++ b/cg-use-everywhere/workflow_fixer.py
@@ -0,0 +1,22 @@
+import json, sys
+
+INFO = '''
+If you saved a json workflow using 'Anything Everywhere?' nodes before the third regex was added, then you may find that when you load it, the Group Regex widget doesn't correctly default to '.*'.
+
+If so, run python workflow_fixer.py filename.json newname.json to fix it.
+'''
+
+def convert(oldname, newname):
+ with open(oldname) as f: workflow = json.load(f)
+ for node in workflow['nodes']:
+ if node['type'] == "Anything Everywhere?":
+ print(f"Fixing {node['title'] if 'title' in node else 'Untitled AE? node'}...")
+ node['widgets_values'][2] = '.*'
+ with open(newname,'w') as f: print(json.dumps(workflow, indent=2), file=f)
+
+if __name__=='__main__':
+ if len(sys.argv)!=3:
+ print(INFO)
+ else:
+ convert(sys.argv[1], sys.argv[2])
+
diff --git a/comfy-image-saver/.gitignore b/comfy-image-saver/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..0041fb8522b079ede21e882fedc86d1e34f92caf
--- /dev/null
+++ b/comfy-image-saver/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/comfy-image-saver/LICENSE b/comfy-image-saver/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..7753ff236969626f6700a2f995936b04150be817
--- /dev/null
+++ b/comfy-image-saver/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Girish Gopaul
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/comfy-image-saver/README.md b/comfy-image-saver/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f595e08d594660cde89807d492109a4535b4f50a
--- /dev/null
+++ b/comfy-image-saver/README.md
@@ -0,0 +1,27 @@
+# Save image with generation metadata on ComfyUI
+
+All the tools you need to save images with their **generation metadata** on ComfyUI. Compatible with *Civitai* & *Prompthero* geninfo auto-detection. Works with `png`, `jpeg` and `webp`.
+
+You can find the example workflow file named `example-workflow.json`.
+
+![example-workflow](https://github.com/giriss/comfy-image-saver/assets/2811408/e231237b-f91a-4679-b3ae-2618080c8e39)
+
+## How to install?
+
+### Method 1: Easiest (Recommended)
+If you have *ComfyUI-Manager*, you can simply search "**Save Image with Generation Metadata**" and install these custom nodes 🎉
+
+
+### Method 2: Easy
+If you don't have *ComfyUI-Manager*, then:
+- Using CLI, go to the ComfyUI folder
+- `cd custom_nodes`
+- `git clone git@github.com:giriss/comfy-image-saver.git`
+- `cd comfy-image-saver`
+- `pip install -r requirements.txt`
+- Start/restart ComfyUI 🎉
+
+## Autodetection in action
+
+![Screenshot 2023-08-17 at 13 15 18](https://github.com/giriss/comfy-image-saver/assets/2811408/785f2475-8f9a-45c9-9d38-855161a98495)
+
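+## Filename & path placeholders
+
+A quick note based on `make_pathname` in `nodes.py`: the `filename` and `path` inputs of the *Save Image w/Metadata* node appear to substitute these placeholders at save time:
+
+- `%date` – current date (`YYYY-MM-DD`)
+- `%time` – timestamp formatted with the `time_format` input (default `%Y-%m-%d-%H%M%S`)
+- `%model` – checkpoint name
+- `%seed` – the seed value
+- `%counter` – the counter input
+
+For example, the default `%time_%seed` produces names like `2023-08-17-131518_479566252427468.png`.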
diff --git a/comfy-image-saver/__init__.py b/comfy-image-saver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aca0d8c3e99cf8ea6231871a235ac28d956a838c
--- /dev/null
+++ b/comfy-image-saver/__init__.py
@@ -0,0 +1,3 @@
+from .nodes import NODE_CLASS_MAPPINGS
+
+__all__ = ['NODE_CLASS_MAPPINGS']
diff --git a/comfy-image-saver/__pycache__/__init__.cpython-312.pyc b/comfy-image-saver/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d68f764113e5ad1e669bd17654c3763932a57634
Binary files /dev/null and b/comfy-image-saver/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfy-image-saver/__pycache__/nodes.cpython-312.pyc b/comfy-image-saver/__pycache__/nodes.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..afba4a64f7d3a7d6edba8c4f98afbf49324d54cd
Binary files /dev/null and b/comfy-image-saver/__pycache__/nodes.cpython-312.pyc differ
diff --git a/comfy-image-saver/example-workflow.json b/comfy-image-saver/example-workflow.json
new file mode 100644
index 0000000000000000000000000000000000000000..20024c2909239d01ae33d6cf19d190d60133fcb8
--- /dev/null
+++ b/comfy-image-saver/example-workflow.json
@@ -0,0 +1,1295 @@
+{
+ "last_node_id": 40,
+ "last_link_id": 48,
+ "nodes": [
+ {
+ "id": 28,
+ "type": "Sampler Selector",
+ "pos": [
+ 548,
+ -10
+ ],
+ "size": {
+ "0": 263.46875,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "sampler_name",
+ "type": [
+ "euler",
+ "euler_ancestral",
+ "heun",
+ "dpm_2",
+ "dpm_2_ancestral",
+ "lms",
+ "dpm_fast",
+ "dpm_adaptive",
+ "dpmpp_2s_ancestral",
+ "dpmpp_sde",
+ "dpmpp_sde_gpu",
+ "dpmpp_2m",
+ "dpmpp_2m_sde",
+ "dpmpp_2m_sde_gpu",
+ "dpmpp_3m_sde",
+ "dpmpp_3m_sde_gpu",
+ "ddim",
+ "uni_pc",
+ "uni_pc_bh2"
+ ],
+ "links": [
+ 20,
+ 41
+ ],
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Sampler Selector"
+ },
+ "widgets_values": [
+ "euler_ancestral"
+ ]
+ },
+ {
+ "id": 29,
+ "type": "Int Literal",
+ "pos": [
+ 608,
+ 33
+ ],
+ "size": {
+ "0": 210,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 21,
+ 38
+ ],
+ "shape": 3
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "Node name for S&R": "Int Literal"
+ },
+ "widgets_values": [
+ 20
+ ]
+ },
+ {
+ "id": 30,
+ "type": "Cfg Literal",
+ "pos": [
+ 586,
+ 73
+ ],
+ "size": {
+ "0": 210,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 22,
+ 39
+ ],
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Cfg Literal"
+ },
+ "widgets_values": [
+ 7
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 863,
+ 186
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 1
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 4
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 6
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 2
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "link": 15,
+ "widget": {
+ "name": "seed",
+ "config": [
+ "INT",
+ {
+ "default": 0,
+ "min": 0,
+ "max": 18446744073709552000
+ }
+ ]
+ }
+ },
+ {
+ "name": "sampler_name",
+ "type": "euler,euler_ancestral,heun,dpm_2,dpm_2_ancestral,lms,dpm_fast,dpm_adaptive,dpmpp_2s_ancestral,dpmpp_sde,dpmpp_sde_gpu,dpmpp_2m,dpmpp_2m_sde,dpmpp_2m_sde_gpu,dpmpp_3m_sde,dpmpp_3m_sde_gpu,ddim,uni_pc,uni_pc_bh2",
+ "link": 20,
+ "widget": {
+ "name": "sampler_name",
+ "config": [
+ [
+ "euler",
+ "euler_ancestral",
+ "heun",
+ "dpm_2",
+ "dpm_2_ancestral",
+ "lms",
+ "dpm_fast",
+ "dpm_adaptive",
+ "dpmpp_2s_ancestral",
+ "dpmpp_sde",
+ "dpmpp_sde_gpu",
+ "dpmpp_2m",
+ "dpmpp_2m_sde",
+ "dpmpp_2m_sde_gpu",
+ "dpmpp_3m_sde",
+ "dpmpp_3m_sde_gpu",
+ "ddim",
+ "uni_pc",
+ "uni_pc_bh2"
+ ]
+ ]
+ },
+ "slot_index": 5
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "link": 21,
+ "widget": {
+ "name": "steps",
+ "config": [
+ "INT",
+ {
+ "default": 20,
+ "min": 1,
+ "max": 10000
+ }
+ ]
+ },
+ "slot_index": 6
+ },
+ {
+ "name": "cfg",
+ "type": "FLOAT",
+ "link": 22,
+ "widget": {
+ "name": "cfg",
+ "config": [
+ "FLOAT",
+ {
+ "default": 8,
+ "min": 0,
+ "max": 100
+ }
+ ]
+ },
+ "slot_index": 7
+ },
+ {
+ "name": "scheduler",
+ "type": "normal,karras,exponential,sgm_uniform,simple,ddim_uniform",
+ "link": 23,
+ "widget": {
+ "name": "scheduler",
+ "config": [
+ [
+ "normal",
+ "karras",
+ "exponential",
+ "sgm_uniform",
+ "simple",
+ "ddim_uniform"
+ ]
+ ]
+ },
+ "slot_index": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 72524088949694,
+ "randomize",
+ 20,
+ 7,
+ "dpmpp_sde",
+ "karras",
+ 1
+ ]
+ },
+ {
+ "id": 31,
+ "type": "Scheduler Selector",
+ "pos": [
+ 539,
+ 113
+ ],
+ "size": {
+ "0": 210,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "scheduler",
+ "type": [
+ "normal",
+ "karras",
+ "exponential",
+ "sgm_uniform",
+ "simple",
+ "ddim_uniform"
+ ],
+ "links": [
+ 23,
+ 42
+ ],
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Scheduler Selector"
+ },
+ "widgets_values": [
+ "normal"
+ ]
+ },
+ {
+ "id": 7,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 580,
+ 345
+ ],
+ "size": {
+ "0": 425.27801513671875,
+ "1": 180.6060791015625
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 5
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 17,
+ "widget": {
+ "name": "text",
+ "config": [
+ "STRING",
+ {
+ "multiline": true
+ }
+ ]
+ },
+ "slot_index": 1
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 6
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature, multiple people, 2 or more people, more than 1 person"
+ ]
+ },
+ {
+ "id": 6,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 583,
+ 293
+ ],
+ "size": {
+ "0": 422.84503173828125,
+ "1": 164.31304931640625
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 3
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 16,
+ "widget": {
+ "name": "text",
+ "config": [
+ "STRING",
+ {
+ "multiline": true
+ }
+ ]
+ },
+ "slot_index": 1
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 4
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "Sexy girl wandering alone, Lost in thought, scenic advanced alien mega city, seeking solace, wearing sexy deep pink bra, Finding peace within | centered | stunning visual | intricate | highly detailed| breathtaking beauty| precise lineart| vibrant| comprehensive cinematic| anna dittman, full perfect body, dynamic pose, best quality, 8k, clean focus, carne griffths, beautiful lighting, 1 person, close up portrait, hyperrealistic, hyperrealism, full body view, necklace, daylight"
+ ]
+ },
+ {
+ "id": 17,
+ "type": "Seed Generator",
+ "pos": [
+ 551,
+ -54
+ ],
+ "size": {
+ "0": 275.2265625,
+ "1": 82
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 15,
+ 43
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Seed Generator"
+ },
+ "widgets_values": [
+ 479566252427468,
+ "randomize"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 164,
+ 478
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "ckpt_name",
+ "type": "epicrealism_pureEvolutionV5.safetensors",
+ "link": 28,
+ "widget": {
+ "name": "ckpt_name",
+ "config": [
+ [
+ "epicrealism_pureEvolutionV5.safetensors"
+ ]
+ ]
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 1
+ ],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 3,
+ 5
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 8
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "epicrealism_pureEvolutionV5.safetensors"
+ ]
+ },
+ {
+ "id": 22,
+ "type": "String Literal",
+ "pos": [
+ 79,
+ 295
+ ],
+ "size": {
+ "0": 400.109375,
+ "1": 108.55078125
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 17,
+ 45
+ ],
+ "shape": 3
+ }
+ ],
+ "title": "Negative",
+ "properties": {
+ "Node name for S&R": "String Literal"
+ },
+ "widgets_values": [
+ "(worst quality, low quality, illustration, 3d, 2d), open mouth, tooth,ugly face, old face, abnormal hands, watermark, abnormal fingers, extra limbs, ugly eyes, ugly face,"
+ ],
+ "color": "#233",
+ "bgcolor": "#355"
+ },
+ {
+ "id": 19,
+ "type": "String Literal",
+ "pos": [
+ 79,
+ 53
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 16,
+ 44
+ ],
+ "shape": 3
+ }
+ ],
+ "title": "Positive",
+ "properties": {
+ "Node name for S&R": "String Literal"
+ },
+ "widgets_values": [
+ "masterpiece,best quality, renaissance style girl,dark Silver Long waves hair,Bare shoulder,beautiful Bollywood actresse, light skin,DayGlo green translucent saree sari, perty, beauty face, . realistic, perspective, light and shadow, religious or mythological themes, highly detailed, a beautiful painting of the pinnacles, domes and towers of the ancient mayan jungle city, with the night sky with stars above, intricate, elegant, highly detailed, digital painting, artstation, concept art, by krenz cushart and artem demura and alphonse mucha, (colorful) by james jean and by artgerm, by ross tran, ultradetailed, charachter design, concept art, trending on artstation"
+ ],
+ "color": "#233",
+ "bgcolor": "#355"
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 650,
+ 446
+ ],
+ "size": {
+ "0": 210,
+ "1": 78
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 18,
+ "widget": {
+ "name": "width",
+ "config": [
+ "INT",
+ {
+ "default": 512,
+ "min": 64,
+ "max": 8192,
+ "step": 8
+ }
+ ]
+ },
+ "slot_index": 0
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 19,
+ "widget": {
+ "name": "height",
+ "config": [
+ "INT",
+ {
+ "default": 512,
+ "min": 64,
+ "max": 8192,
+ "step": 8
+ }
+ ]
+ },
+ "slot_index": 1
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 2
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 512,
+ 768,
+ 1
+ ]
+ },
+ {
+ "id": 27,
+ "type": "Width/Height Literal",
+ "pos": [
+ 506,
+ 406
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 7,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 18,
+ 46
+ ],
+ "shape": 3
+ }
+ ],
+ "title": "Width",
+ "properties": {
+ "Node name for S&R": "Width/Height Literal"
+ },
+ "widgets_values": [
+ 768
+ ]
+ },
+ {
+ "id": 26,
+ "type": "Width/Height Literal",
+ "pos": [
+ 506,
+ 488
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 8,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 19,
+ 47
+ ],
+ "shape": 3
+ }
+ ],
+ "title": "Height",
+ "properties": {
+ "Node name for S&R": "Width/Height Literal"
+ },
+ "widgets_values": [
+ 1152
+ ]
+ },
+ {
+ "id": 35,
+ "type": "Checkpoint Selector",
+ "pos": [
+ 499,
+ -155
+ ],
+ "size": [
+ 382.49933725757705,
+ 58
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "ckpt_name",
+ "type": [
+ "epicrealism_pureEvolutionV5.safetensors"
+ ],
+ "links": [
+ 28,
+ 40
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Checkpoint Selector"
+ },
+ "widgets_values": [
+ "epicrealism_pureEvolutionV5.safetensors"
+ ]
+ },
+ {
+ "id": 39,
+ "type": "Save Image w/Metadata",
+ "pos": [
+ 1222,
+ -154
+ ],
+ "size": [
+ 349.01775559535145,
+ 646.6896077006018
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 48
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "link": 38,
+ "widget": {
+ "name": "steps",
+ "config": [
+ "INT",
+ {
+ "default": 20,
+ "min": 1,
+ "max": 10000
+ }
+ ]
+ },
+ "slot_index": 1
+ },
+ {
+ "name": "cfg",
+ "type": "FLOAT",
+ "link": 39,
+ "widget": {
+ "name": "cfg",
+ "config": [
+ "FLOAT",
+ {
+ "default": 8,
+ "min": 0,
+ "max": 100
+ }
+ ]
+ },
+ "slot_index": 2
+ },
+ {
+ "name": "modelname",
+ "type": "epicrealism_pureEvolutionV5.safetensors",
+ "link": 40,
+ "widget": {
+ "name": "modelname",
+ "config": [
+ [
+ "epicrealism_pureEvolutionV5.safetensors"
+ ]
+ ]
+ },
+ "slot_index": 3
+ },
+ {
+ "name": "sampler_name",
+ "type": "euler,euler_ancestral,heun,dpm_2,dpm_2_ancestral,lms,dpm_fast,dpm_adaptive,dpmpp_2s_ancestral,dpmpp_sde,dpmpp_sde_gpu,dpmpp_2m,dpmpp_2m_sde,dpmpp_2m_sde_gpu,dpmpp_3m_sde,dpmpp_3m_sde_gpu,ddim,uni_pc,uni_pc_bh2",
+ "link": 41,
+ "widget": {
+ "name": "sampler_name",
+ "config": [
+ [
+ "euler",
+ "euler_ancestral",
+ "heun",
+ "dpm_2",
+ "dpm_2_ancestral",
+ "lms",
+ "dpm_fast",
+ "dpm_adaptive",
+ "dpmpp_2s_ancestral",
+ "dpmpp_sde",
+ "dpmpp_sde_gpu",
+ "dpmpp_2m",
+ "dpmpp_2m_sde",
+ "dpmpp_2m_sde_gpu",
+ "dpmpp_3m_sde",
+ "dpmpp_3m_sde_gpu",
+ "ddim",
+ "uni_pc",
+ "uni_pc_bh2"
+ ]
+ ]
+ },
+ "slot_index": 4
+ },
+ {
+ "name": "scheduler",
+ "type": "normal,karras,exponential,sgm_uniform,simple,ddim_uniform",
+ "link": 42,
+ "widget": {
+ "name": "scheduler",
+ "config": [
+ [
+ "normal",
+ "karras",
+ "exponential",
+ "sgm_uniform",
+ "simple",
+ "ddim_uniform"
+ ]
+ ]
+ },
+ "slot_index": 5
+ },
+ {
+ "name": "positive",
+ "type": "STRING",
+ "link": 44,
+ "widget": {
+ "name": "positive",
+ "config": [
+ "STRING",
+ {
+ "default": "unknown",
+ "multiline": true
+ }
+ ]
+ },
+ "slot_index": 6
+ },
+ {
+ "name": "negative",
+ "type": "STRING",
+ "link": 45,
+ "widget": {
+ "name": "negative",
+ "config": [
+ "STRING",
+ {
+ "default": "unknown",
+ "multiline": true
+ }
+ ]
+ },
+ "slot_index": 7
+ },
+ {
+ "name": "seed_value",
+ "type": "INT",
+ "link": 43,
+ "widget": {
+ "name": "seed_value",
+ "config": [
+ "INT",
+ {
+ "default": 0,
+ "min": 0,
+ "max": 18446744073709552000
+ }
+ ]
+ },
+ "slot_index": 8
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 46,
+ "widget": {
+ "name": "width",
+ "config": [
+ "INT",
+ {
+ "default": 512,
+ "min": 1,
+ "max": 8192,
+ "step": 8
+ }
+ ]
+ },
+ "slot_index": 9
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 47,
+ "widget": {
+ "name": "height",
+ "config": [
+ "INT",
+ {
+ "default": 512,
+ "min": 1,
+ "max": 8192,
+ "step": 8
+ }
+ ]
+ },
+ "slot_index": 10
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Save Image w/Metadata"
+ },
+ "widgets_values": [
+ "%time_%seed",
+ "",
+ "jpeg",
+ 20,
+ 8,
+ "epicrealism_pureEvolutionV5.safetensors",
+ "euler",
+ "normal",
+ "unknown",
+ "unknown",
+ 0,
+ 512,
+ 512,
+ true,
+ 100,
+ 0,
+ "%Y-%m-%d-%H%M%S"
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 971,
+ -153
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 48
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ }
+ ],
+ "links": [
+ [
+ 1,
+ 4,
+ 0,
+ 3,
+ 0,
+ "MODEL"
+ ],
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ "LATENT"
+ ],
+ [
+ 3,
+ 4,
+ 1,
+ 6,
+ 0,
+ "CLIP"
+ ],
+ [
+ 4,
+ 6,
+ 0,
+ 3,
+ 1,
+ "CONDITIONING"
+ ],
+ [
+ 5,
+ 4,
+ 1,
+ 7,
+ 0,
+ "CLIP"
+ ],
+ [
+ 6,
+ 7,
+ 0,
+ 3,
+ 2,
+ "CONDITIONING"
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ "LATENT"
+ ],
+ [
+ 8,
+ 4,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 15,
+ 17,
+ 0,
+ 3,
+ 4,
+ "INT"
+ ],
+ [
+ 16,
+ 19,
+ 0,
+ 6,
+ 1,
+ "STRING"
+ ],
+ [
+ 17,
+ 22,
+ 0,
+ 7,
+ 1,
+ "STRING"
+ ],
+ [
+ 18,
+ 27,
+ 0,
+ 5,
+ 0,
+ "INT"
+ ],
+ [
+ 19,
+ 26,
+ 0,
+ 5,
+ 1,
+ "INT"
+ ],
+ [
+ 20,
+ 28,
+ 0,
+ 3,
+ 5,
+ "euler,euler_ancestral,heun,dpm_2,dpm_2_ancestral,lms,dpm_fast,dpm_adaptive,dpmpp_2s_ancestral,dpmpp_sde,dpmpp_sde_gpu,dpmpp_2m,dpmpp_2m_sde,dpmpp_2m_sde_gpu,dpmpp_3m_sde,dpmpp_3m_sde_gpu,ddim,uni_pc,uni_pc_bh2"
+ ],
+ [
+ 21,
+ 29,
+ 0,
+ 3,
+ 6,
+ "INT"
+ ],
+ [
+ 22,
+ 30,
+ 0,
+ 3,
+ 7,
+ "FLOAT"
+ ],
+ [
+ 23,
+ 31,
+ 0,
+ 3,
+ 8,
+ "normal,karras,exponential,sgm_uniform,simple,ddim_uniform"
+ ],
+ [
+ 28,
+ 35,
+ 0,
+ 4,
+ 0,
+ "epicrealism_pureEvolutionV5.safetensors"
+ ],
+ [
+ 38,
+ 29,
+ 0,
+ 39,
+ 1,
+ "INT"
+ ],
+ [
+ 39,
+ 30,
+ 0,
+ 39,
+ 2,
+ "FLOAT"
+ ],
+ [
+ 40,
+ 35,
+ 0,
+ 39,
+ 3,
+ "epicrealism_pureEvolutionV5.safetensors"
+ ],
+ [
+ 41,
+ 28,
+ 0,
+ 39,
+ 4,
+ "euler,euler_ancestral,heun,dpm_2,dpm_2_ancestral,lms,dpm_fast,dpm_adaptive,dpmpp_2s_ancestral,dpmpp_sde,dpmpp_sde_gpu,dpmpp_2m,dpmpp_2m_sde,dpmpp_2m_sde_gpu,dpmpp_3m_sde,dpmpp_3m_sde_gpu,ddim,uni_pc,uni_pc_bh2"
+ ],
+ [
+ 42,
+ 31,
+ 0,
+ 39,
+ 5,
+ "normal,karras,exponential,sgm_uniform,simple,ddim_uniform"
+ ],
+ [
+ 43,
+ 17,
+ 0,
+ 39,
+ 8,
+ "INT"
+ ],
+ [
+ 44,
+ 19,
+ 0,
+ 39,
+ 6,
+ "STRING"
+ ],
+ [
+ 45,
+ 22,
+ 0,
+ 39,
+ 7,
+ "STRING"
+ ],
+ [
+ 46,
+ 27,
+ 0,
+ 39,
+ 9,
+ "INT"
+ ],
+ [
+ 47,
+ 26,
+ 0,
+ 39,
+ 10,
+ "INT"
+ ],
+ [
+ 48,
+ 8,
+ 0,
+ 39,
+ 0,
+ "IMAGE"
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
+}
\ No newline at end of file
diff --git a/comfy-image-saver/nodes.py b/comfy-image-saver/nodes.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c8cc0c55bbbfad70b97aa8c2104d78421294de2
--- /dev/null
+++ b/comfy-image-saver/nodes.py
@@ -0,0 +1,279 @@
+import os
+import hashlib
+from datetime import datetime
+import json
+import piexif
+import piexif.helper
+from PIL import Image, ExifTags
+from PIL.PngImagePlugin import PngInfo
+import numpy as np
+import folder_paths
+import comfy.sd
+from nodes import MAX_RESOLUTION
+
+
+def parse_name(ckpt_name):
+ path = ckpt_name
+ filename = path.split("/")[-1]
+ filename = filename.split(".")[:-1]
+ filename = ".".join(filename)
+ return filename
+
+
+def calculate_sha256(file_path):
+ sha256_hash = hashlib.sha256()
+
+ with open(file_path, "rb") as f:
+ # Read the file in chunks to avoid loading the entire file into memory
+ for byte_block in iter(lambda: f.read(4096), b""):
+ sha256_hash.update(byte_block)
+
+ return sha256_hash.hexdigest()
+
+
+def handle_whitespace(string: str):
+ return string.strip().replace("\n", " ").replace("\r", " ").replace("\t", " ")
+
+
+def get_timestamp(time_format):
+ now = datetime.now()
+ try:
+ timestamp = now.strftime(time_format)
+ except:
+ timestamp = now.strftime("%Y-%m-%d-%H%M%S")
+
+ return timestamp
+
+
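+# Substitute the %date, %time, %model, %seed and %counter placeholders in a filename/path pattern.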
+def make_pathname(filename, seed, modelname, counter, time_format):
+ filename = filename.replace("%date", get_timestamp("%Y-%m-%d"))
+ filename = filename.replace("%time", get_timestamp(time_format))
+ filename = filename.replace("%model", modelname)
+ filename = filename.replace("%seed", str(seed))
+ filename = filename.replace("%counter", str(counter))
+ return filename
+
+
+def make_filename(filename, seed, modelname, counter, time_format):
+ filename = make_pathname(filename, seed, modelname, counter, time_format)
+
+ return get_timestamp(time_format) if filename == "" else filename
+
+
+class SeedGenerator:
+ RETURN_TYPES = ("INT",)
+ FUNCTION = "get_seed"
+ CATEGORY = "ImageSaverTools/utils"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff})}}
+
+ def get_seed(self, seed):
+ return (seed,)
+
+
+class StringLiteral:
+ RETURN_TYPES = ("STRING",)
+ FUNCTION = "get_string"
+ CATEGORY = "ImageSaverTools/utils"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"string": ("STRING", {"default": "", "multiline": True})}}
+
+ def get_string(self, string):
+ return (string,)
+
+
+class SizeLiteral:
+ RETURN_TYPES = ("INT",)
+ FUNCTION = "get_int"
+ CATEGORY = "ImageSaverTools/utils"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"int": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 8})}}
+
+ def get_int(self, int):
+ return (int,)
+
+
+class IntLiteral:
+ RETURN_TYPES = ("INT",)
+ FUNCTION = "get_int"
+ CATEGORY = "ImageSaverTools/utils"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"int": ("INT", {"default": 0, "min": 0, "max": 1000000})}}
+
+ def get_int(self, int):
+ return (int,)
+
+
+class CfgLiteral:
+ RETURN_TYPES = ("FLOAT",)
+ FUNCTION = "get_float"
+ CATEGORY = "ImageSaverTools/utils"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"float": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0})}}
+
+ def get_float(self, float):
+ return (float,)
+
+
+class CheckpointSelector:
+ CATEGORY = 'ImageSaverTools/utils'
+ RETURN_TYPES = (folder_paths.get_filename_list("checkpoints"),)
+ RETURN_NAMES = ("ckpt_name",)
+ FUNCTION = "get_names"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"ckpt_name": (folder_paths.get_filename_list("checkpoints"), ),}}
+
+ def get_names(self, ckpt_name):
+ return (ckpt_name,)
+
+
+class SamplerSelector:
+ CATEGORY = 'ImageSaverTools/utils'
+ RETURN_TYPES = (comfy.samplers.KSampler.SAMPLERS,)
+ RETURN_NAMES = ("sampler_name",)
+ FUNCTION = "get_names"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"sampler_name": (comfy.samplers.KSampler.SAMPLERS,)}}
+
+ def get_names(self, sampler_name):
+ return (sampler_name,)
+
+
+class SchedulerSelector:
+ CATEGORY = 'ImageSaverTools/utils'
+ RETURN_TYPES = (comfy.samplers.KSampler.SCHEDULERS,)
+ RETURN_NAMES = ("scheduler",)
+ FUNCTION = "get_names"
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {"required": {"scheduler": (comfy.samplers.KSampler.SCHEDULERS,)}}
+
+ def get_names(self, scheduler):
+ return (scheduler,)
+
+
+class ImageSaveWithMetadata:
+ def __init__(self):
+ self.output_dir = folder_paths.output_directory
+
+ @classmethod
+ def INPUT_TYPES(cls):
+ return {
+ "required": {
+ "images": ("IMAGE", ),
+ "filename": ("STRING", {"default": f'%time_%seed', "multiline": False}),
+ "path": ("STRING", {"default": '', "multiline": False}),
+ "extension": (['png', 'jpeg', 'webp'],),
+ "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
+ "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0}),
+ "modelname": (folder_paths.get_filename_list("checkpoints"),),
+ "sampler_name": (comfy.samplers.KSampler.SAMPLERS,),
+ "scheduler": (comfy.samplers.KSampler.SCHEDULERS,),
+ },
+ "optional": {
+ "positive": ("STRING", {"default": 'unknown', "multiline": True}),
+ "negative": ("STRING", {"default": 'unknown', "multiline": True}),
+ "seed_value": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+ "width": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 8}),
+ "height": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 8}),
+ "lossless_webp": ("BOOLEAN", {"default": True}),
+ "quality_jpeg_or_webp": ("INT", {"default": 100, "min": 1, "max": 100}),
+ "counter": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff }),
+ "time_format": ("STRING", {"default": "%Y-%m-%d-%H%M%S", "multiline": False}),
+ },
+ "hidden": {
+ "prompt": "PROMPT",
+ "extra_pnginfo": "EXTRA_PNGINFO"
+ },
+ }
+
+ RETURN_TYPES = ()
+ FUNCTION = "save_files"
+
+ OUTPUT_NODE = True
+
+ CATEGORY = "ImageSaverTools"
+
+ def save_files(self, images, seed_value, steps, cfg, sampler_name, scheduler, positive, negative, modelname, quality_jpeg_or_webp,
+ lossless_webp, width, height, counter, filename, path, extension, time_format, prompt=None, extra_pnginfo=None):
+ filename = make_filename(filename, seed_value, modelname, counter, time_format)
+ path = make_pathname(path, seed_value, modelname, counter, time_format)
+ ckpt_path = folder_paths.get_full_path("checkpoints", modelname)
+ basemodelname = parse_name(modelname)
+ modelhash = calculate_sha256(ckpt_path)[:10]
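+        # Build the "parameters" string in the A1111-style format that Civitai / Prompthero auto-detect (see README).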
+ comment = f"{handle_whitespace(positive)}\nNegative prompt: {handle_whitespace(negative)}\nSteps: {steps}, Sampler: {sampler_name}{f'_{scheduler}' if scheduler != 'normal' else ''}, CFG Scale: {cfg}, Seed: {seed_value}, Size: {width}x{height}, Model hash: {modelhash}, Model: {basemodelname}, Version: ComfyUI"
+ output_path = os.path.join(self.output_dir, path)
+
+ if output_path.strip() != '':
+ if not os.path.exists(output_path.strip()):
+ print(f'The path `{output_path.strip()}` specified doesn\'t exist! Creating directory.')
+ os.makedirs(output_path, exist_ok=True)
+
+ filenames = self.save_images(images, output_path, filename, comment, extension, quality_jpeg_or_webp, lossless_webp, prompt, extra_pnginfo)
+
+ subfolder = os.path.normpath(path)
+ return {"ui": {"images": map(lambda filename: {"filename": filename, "subfolder": subfolder if subfolder != '.' else '', "type": 'output'}, filenames)}}
+
+ def save_images(self, images, output_path, filename_prefix, comment, extension, quality_jpeg_or_webp, lossless_webp, prompt=None, extra_pnginfo=None) -> list[str]:
+ img_count = 1
+ paths = list()
+ for image in images:
+ i = 255. * image.cpu().numpy()
+ img = Image.fromarray(np.clip(i, 0, 255).astype(np.uint8))
+ if images.size()[0] > 1:
+ filename_prefix += "_{:02d}".format(img_count)
+
+ if extension == 'png':
+ metadata = PngInfo()
+ metadata.add_text("parameters", comment)
+
+ if prompt is not None:
+ metadata.add_text("prompt", json.dumps(prompt))
+ if extra_pnginfo is not None:
+ for x in extra_pnginfo:
+ metadata.add_text(x, json.dumps(extra_pnginfo[x]))
+
+ filename = f"{filename_prefix}.png"
+ img.save(os.path.join(output_path, filename), pnginfo=metadata, optimize=True)
+ else:
+ filename = f"{filename_prefix}.{extension}"
+ file = os.path.join(output_path, filename)
+ img.save(file, optimize=True, quality=quality_jpeg_or_webp, lossless=lossless_webp)
+ exif_bytes = piexif.dump({
+ "Exif": {
+ piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(comment, encoding="unicode")
+ },
+ })
+ piexif.insert(exif_bytes, file)
+
+ paths.append(filename)
+ img_count += 1
+ return paths
+
+
+NODE_CLASS_MAPPINGS = {
+ "Checkpoint Selector": CheckpointSelector,
+ "Save Image w/Metadata": ImageSaveWithMetadata,
+ "Sampler Selector": SamplerSelector,
+ "Scheduler Selector": SchedulerSelector,
+ "Seed Generator": SeedGenerator,
+ "String Literal": StringLiteral,
+ "Width/Height Literal": SizeLiteral,
+ "Cfg Literal": CfgLiteral,
+ "Int Literal": IntLiteral,
+}
diff --git a/comfy-image-saver/requirements.txt b/comfy-image-saver/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dfe1a52d5f0dc4df0b4709bbfa904ac4a967f416
--- /dev/null
+++ b/comfy-image-saver/requirements.txt
@@ -0,0 +1,2 @@
+
+piexif
diff --git a/comfyui_controlnet_aux/.github/workflows/publish.yml b/comfyui_controlnet_aux/.github/workflows/publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6e7201833e56c009e347731016b54e1c6d2254ab
--- /dev/null
+++ b/comfyui_controlnet_aux/.github/workflows/publish.yml
@@ -0,0 +1,21 @@
+name: Publish to Comfy registry
+on:
+ workflow_dispatch:
+ push:
+ branches:
+ - main
+ paths:
+ - "pyproject.toml"
+
+jobs:
+ publish-node:
+ name: Publish Custom Node to registry
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out code
+ uses: actions/checkout@v4
+ - name: Publish Custom Node
+ uses: Comfy-Org/publish-node-action@main
+ with:
+ ## Add your own personal access token to your Github Repository secrets and reference it here.
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/.gitignore b/comfyui_controlnet_aux/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dea9a0eae9e8290e6c764093c5c5a74d0762afea
--- /dev/null
+++ b/comfyui_controlnet_aux/.gitignore
@@ -0,0 +1,183 @@
+# Initially taken from Github's Python gitignore file
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# tests and logs
+tests/fixtures/cached_*_text.txt
+logs/
+lightning_logs/
+lang_code_data/
+tests/outputs
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# vscode
+.vs
+.vscode
+
+# Pycharm
+.idea
+
+# TF code
+tensorflow_code
+
+# Models
+proc_data
+
+# examples
+runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args
+/examples/rag/sweep
+
+# data
+/data
+serialization_dir
+
+# emacs
+*.*~
+debug.env
+
+# vim
+.*.swp
+
+#ctags
+tags
+
+# pre-commit
+.pre-commit*
+
+# .lock
+*.lock
+
+# DS_Store (MacOS)
+.DS_Store
+# RL pipelines may produce mp4 outputs
+*.mp4
+
+# dependencies
+/transformers
+
+# ruff
+.ruff_cache
+
+wandb
+
+ckpts/
+
+test.ipynb
+config.yaml
+test.ipynb
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/LICENSE.txt b/comfyui_controlnet_aux/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..29f81d812f3e768fa89638d1f72920dbfd1413a8
--- /dev/null
+++ b/comfyui_controlnet_aux/LICENSE.txt
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/comfyui_controlnet_aux/NotoSans-Regular.ttf b/comfyui_controlnet_aux/NotoSans-Regular.ttf
new file mode 100644
index 0000000000000000000000000000000000000000..a1b8994edeacd70067de843a4691b15a0ce5921b
Binary files /dev/null and b/comfyui_controlnet_aux/NotoSans-Regular.ttf differ
diff --git a/comfyui_controlnet_aux/README.md b/comfyui_controlnet_aux/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..09c7840ebc9c375c87ad65a3f1c658bc0c1855fd
--- /dev/null
+++ b/comfyui_controlnet_aux/README.md
@@ -0,0 +1,252 @@
+# ComfyUI's ControlNet Auxiliary Preprocessors
+Plug-and-play [ComfyUI](https://github.com/comfyanonymous/ComfyUI) node sets for making [ControlNet](https://github.com/lllyasviel/ControlNet/) hint images
+
+"anime style, a protest in the street, cyberpunk city, a woman with pink hair and golden eyes (looking at the viewer) is holding a sign with the text "ComfyUI ControlNet Aux" in bold, neon pink" on Flux.1 Dev
+
+![](./examples/CNAuxBanner.jpg)
+
+The code is copy-pasted from the respective folders in https://github.com/lllyasviel/ControlNet/tree/main/annotator and connected to [the 🤗 Hub](https://huggingface.co/lllyasviel/Annotators).
+
+All credit & copyright goes to https://github.com/lllyasviel.
+
+# Updates
+Go to the [Update page](./UPDATES.md) to follow updates.
+
+# Installation:
+## Using ComfyUI Manager (recommended):
+Install [ComfyUI Manager](https://github.com/ltdrdata/ComfyUI-Manager) and follow the steps introduced there to install this repo.
+
+## Alternative:
+If you're running on Linux, or under a non-admin account on Windows, you'll want to ensure `/ComfyUI/custom_nodes` and `comfyui_controlnet_aux` have write permissions.
+
+There is now an **install.bat** you can run to install to the portable build if it is detected. Otherwise it will default to the system Python and assume you followed ComfyUI's manual installation steps.
+
+If you can't run **install.bat** (e.g. you are a Linux user), open the CMD/Shell and do the following:
+ - Navigate to your `/ComfyUI/custom_nodes/` folder
+ - Run `git clone https://github.com/Fannovel16/comfyui_controlnet_aux/`
+ - Navigate to your `comfyui_controlnet_aux` folder
+ - Portable/venv:
+ - Run `path/to/ComfyUI/python_embeded/python.exe -s -m pip install -r requirements.txt`
+ - With system Python:
+ - Run `pip install -r requirements.txt`
+ - Start ComfyUI
+
+# Nodes
+Please note that this repo only supports preprocessors that make hint images (e.g. stickman, canny edge, etc.).
+All preprocessors except Inpaint are integrated into the `AIO Aux Preprocessor` node.
+That node lets you quickly switch between preprocessors, but a preprocessor's own threshold parameters can't be set from it.
+You need to use the preprocessor's own node directly to set thresholds.
+
+# Nodes (sections are categories in Comfy menu)
+## Line Extractors
+| Preprocessor Node | sd-webui-controlnet/other | ControlNet/T2I-Adapter |
+|-----------------------------|---------------------------|-------------------------------------------|
+| Binary Lines | binary | control_scribble |
+| Canny Edge | canny | control_v11p_sd15_canny control_canny t2iadapter_canny |
+| HED Soft-Edge Lines | hed | control_v11p_sd15_softedge control_hed |
+| Standard Lineart | standard_lineart | control_v11p_sd15_lineart |
+| Realistic Lineart | lineart (or `lineart_coarse` if `coarse` is enabled) | control_v11p_sd15_lineart |
+| Anime Lineart | lineart_anime | control_v11p_sd15s2_lineart_anime |
+| Manga Lineart | lineart_anime_denoise | control_v11p_sd15s2_lineart_anime |
+| M-LSD Lines | mlsd | control_v11p_sd15_mlsd control_mlsd |
+| PiDiNet Soft-Edge Lines | pidinet | control_v11p_sd15_softedge control_scribble |
+| Scribble Lines | scribble | control_v11p_sd15_scribble control_scribble |
+| Scribble XDoG Lines | scribble_xdog | control_v11p_sd15_scribble control_scribble |
+| Fake Scribble Lines | scribble_hed | control_v11p_sd15_scribble control_scribble |
+| TEED Soft-Edge Lines | teed | [controlnet-sd-xl-1.0-softedge-dexined](https://huggingface.co/SargeZT/controlnet-sd-xl-1.0-softedge-dexined/blob/main/controlnet-sd-xl-1.0-softedge-dexined.safetensors) control_v11p_sd15_softedge (Theoretically) |
+| Scribble PiDiNet Lines | scribble_pidinet | control_v11p_sd15_scribble control_scribble |
+| AnyLine Lineart | | mistoLine_fp16.safetensors mistoLine_rank256 control_v11p_sd15s2_lineart_anime control_v11p_sd15_lineart |
+
+## Normal and Depth Estimators
+| Preprocessor Node | sd-webui-controlnet/other | ControlNet/T2I-Adapter |
+|-----------------------------|---------------------------|-------------------------------------------|
+| MiDaS Depth Map | (normal) depth | control_v11f1p_sd15_depth control_depth t2iadapter_depth |
+| LeReS Depth Map | depth_leres | control_v11f1p_sd15_depth control_depth t2iadapter_depth |
+| Zoe Depth Map | depth_zoe | control_v11f1p_sd15_depth control_depth t2iadapter_depth |
+| MiDaS Normal Map | normal_map | control_normal |
+| BAE Normal Map | normal_bae | control_v11p_sd15_normalbae |
+| MeshGraphormer Hand Refiner ([HandRefinder](https://github.com/wenquanlu/HandRefiner)) | depth_hand_refiner | [control_sd15_inpaint_depth_hand_fp16](https://huggingface.co/hr16/ControlNet-HandRefiner-pruned/blob/main/control_sd15_inpaint_depth_hand_fp16.safetensors) |
+| Depth Anything | depth_anything | [Depth-Anything](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_controlnet/diffusion_pytorch_model.safetensors) |
+| Zoe Depth Anything (Basically Zoe but the encoder is replaced with DepthAnything) | depth_anything | [Depth-Anything](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_controlnet/diffusion_pytorch_model.safetensors) |
+| Normal DSINE | | control_normal/control_v11p_sd15_normalbae |
+| Metric3D Depth | | control_v11f1p_sd15_depth control_depth t2iadapter_depth |
+| Metric3D Normal | | control_v11p_sd15_normalbae |
+| Depth Anything V2 | | [Depth-Anything](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_controlnet/diffusion_pytorch_model.safetensors) |
+
+## Faces and Poses Estimators
+| Preprocessor Node | sd-webui-controlnet/other | ControlNet/T2I-Adapter |
+|-----------------------------|---------------------------|-------------------------------------------|
+| DWPose Estimator | dw_openpose_full | control_v11p_sd15_openpose control_openpose t2iadapter_openpose |
+| OpenPose Estimator | openpose (detect_body) openpose_hand (detect_body + detect_hand) openpose_faceonly (detect_face) openpose_full (detect_hand + detect_body + detect_face) | control_v11p_sd15_openpose control_openpose t2iadapter_openpose |
+| MediaPipe Face Mesh | mediapipe_face | controlnet_sd21_laion_face_v2 |
+| Animal Estimator | animal_openpose | [control_sd15_animal_openpose_fp16](https://huggingface.co/huchenlei/animal_openpose/blob/main/control_sd15_animal_openpose_fp16.pth) |
+
+## Optical Flow Estimators
+| Preprocessor Node | sd-webui-controlnet/other | ControlNet/T2I-Adapter |
+|-----------------------------|---------------------------|-------------------------------------------|
+| Unimatch Optical Flow | | [DragNUWA](https://github.com/ProjectNUWA/DragNUWA) |
+
+### How to get OpenPose-format JSON?
+#### User-side
+This workflow will save images to ComfyUI's output folder (the same location as output images). If you can't find the `Save Pose Keypoints` node, update this extension.
+![](./examples/example_save_kps.png)
+
+#### Dev-side
+An array of [OpenPose-format JSON](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md#json-output-format) corresponding to each frame in an IMAGE batch can be obtained from DWPose and OpenPose using `app.nodeOutputs` on the UI or the `/history` API endpoint. JSON output from AnimalPose uses a format similar to OpenPose JSON (a minimal parsing sketch follows the API examples below):
+```
+[
+ {
+ "version": "ap10k",
+ "animals": [
+ [[x1, y1, 1], [x2, y2, 1],..., [x17, y17, 1]],
+ [[x1, y1, 1], [x2, y2, 1],..., [x17, y17, 1]],
+ ...
+ ],
+ "canvas_height": 512,
+ "canvas_width": 768
+ },
+ ...
+]
+```
+
+For extension developers (e.g. Openpose editor):
+```js
+const poseNodes = app.graph._nodes.filter(node => ["OpenposePreprocessor", "DWPreprocessor", "AnimalPosePreprocessor"].includes(node.type))
+for (const poseNode of poseNodes) {
+ const openposeResults = JSON.parse(app.nodeOutputs[poseNode.id].openpose_json[0])
+ console.log(openposeResults) //An array containing Openpose JSON for each frame
+}
+```
+
+For API users:
+Javascript
+```js
+import fetch from "node-fetch" //Remember to add "type": "module" to "package.json"
+async function main() {
+ const promptId = '792c1905-ecfe-41f4-8114-83e6a4a09a9f' //Too lazy to POST /queue
+ let history = await fetch(`http://127.0.0.1:8188/history/${promptId}`).then(re => re.json())
+ history = history[promptId]
+ const nodeOutputs = Object.values(history.outputs).filter(output => output.openpose_json)
+ for (const nodeOutput of nodeOutputs) {
+ const openposeResults = JSON.parse(nodeOutput.openpose_json[0])
+ console.log(openposeResults) //An array containing Openpose JSON for each frame
+ }
+}
+main()
+```
+
+Python
+```py
+import json, urllib.request
+
+server_address = "127.0.0.1:8188"
+prompt_id = '' #Too lazy to POST /queue
+
+def get_history(prompt_id):
+ with urllib.request.urlopen("http://{}/history/{}".format(server_address, prompt_id)) as response:
+ return json.loads(response.read())
+
+history = get_history(prompt_id)[prompt_id]
+for node_id in history['outputs']:
+    node_output = history['outputs'][node_id]
+    if 'openpose_json' in node_output:
+        print(json.loads(node_output['openpose_json'][0])) #A list containing Openpose JSON for each frame
+```
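+
+As an aside, here is a minimal sketch (not part of this repo) for walking the AnimalPose-format array shown above; `animal_pose.json` is a hypothetical file holding that JSON:
+```py
+import json
+
+# Hypothetical dump of the AnimalPose-format array described above
+with open("animal_pose.json") as f:
+    frames = json.load(f)
+
+for frame_idx, frame in enumerate(frames):
+    print(f"frame {frame_idx}: canvas {frame['canvas_width']}x{frame['canvas_height']}")
+    for animal_idx, keypoints in enumerate(frame["animals"]):
+        visible = sum(1 for x, y, conf in keypoints if conf > 0)
+        print(f"  animal {animal_idx}: {visible}/{len(keypoints)} keypoints with nonzero confidence")
+```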
+## Semantic Segmentation
+| Preprocessor Node | sd-webui-controlnet/other | ControlNet/T2I-Adapter |
+|-----------------------------|---------------------------|-------------------------------------------|
+| OneFormer ADE20K Segmentor | oneformer_ade20k | control_v11p_sd15_seg |
+| OneFormer COCO Segmentor | oneformer_coco | control_v11p_sd15_seg |
+| UniFormer Segmentor | segmentation |control_sd15_seg control_v11p_sd15_seg|
+
+## T2IAdapter-only
+| Preprocessor Node | sd-webui-controlnet/other | ControlNet/T2I-Adapter |
+|-----------------------------|---------------------------|-------------------------------------------|
+| Color Pallete | color | t2iadapter_color |
+| Content Shuffle | shuffle | t2iadapter_style |
+
+## Recolor
+| Preprocessor Node | sd-webui-controlnet/other | ControlNet/T2I-Adapter |
+|-----------------------------|---------------------------|-------------------------------------------|
+| Image Luminance | recolor_luminance | [ioclab_sd15_recolor](https://huggingface.co/lllyasviel/sd_control_collection/resolve/main/ioclab_sd15_recolor.safetensors) [sai_xl_recolor_256lora](https://huggingface.co/lllyasviel/sd_control_collection/resolve/main/sai_xl_recolor_256lora.safetensors) [bdsqlsz_controlllite_xl_recolor_luminance](https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/resolve/main/bdsqlsz_controlllite_xl_recolor_luminance.safetensors) |
+| Image Intensity | recolor_intensity | Idk. Maybe same as above? |
+
+# Examples
+> A picture is worth a thousand words
+
+![](./examples/ExecuteAll1.jpg)
+![](./examples/ExecuteAll2.jpg)
+
+# Testing workflow
+https://github.com/Fannovel16/comfyui_controlnet_aux/blob/main/examples/ExecuteAll.png
+Input image: https://github.com/Fannovel16/comfyui_controlnet_aux/blob/main/examples/comfyui-controlnet-aux-logo.png
+
+# Q&A:
+## Why don't some nodes appear after I installed this repo?
+
+This repo has a new mechanism that skips any custom node which can't be imported. If you hit this case, please create an issue on the [Issues tab](https://github.com/Fannovel16/comfyui_controlnet_aux/issues) with the log from the command line.
+
+## DWPose/AnimalPose only uses CPU, so it's slow. How can I make it use the GPU?
+There are two ways to speed up DWPose: TorchScript checkpoints (.torchscript.pt) or ONNXRuntime checkpoints (.onnx). The TorchScript route is a little slower than ONNXRuntime but doesn't require any additional library and is still far faster than the CPU.
+
+A TorchScript bbox detector is compatible with an ONNX pose estimator and vice versa.
+### TorchScript
+Set `bbox_detector` and `pose_estimator` according to this picture. You can try other bbox detectors ending with `.torchscript.pt` to reduce bbox detection time if the input images are ideal.
+![](./examples/example_torchscript.png)
+### ONNXRuntime
+If onnxruntime is installed successfully and the checkpoint used ends with `.onnx`, it will replace the default cv2 backend to take advantage of the GPU. Note that if you are using an NVidia card, this method currently only works on CUDA 11.8 (ComfyUI_windows_portable_nvidia_cu118_or_cpu.7z) unless you compile onnxruntime yourself.
+
+1. Know your onnxruntime build:
+* * NVidia CUDA 11.x or below/AMD GPU: `onnxruntime-gpu`
+* * NVidia CUDA 12.x: `onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/`
+* * DirectML: `onnxruntime-directml`
+* * OpenVINO: `onnxruntime-openvino`
+
+Note that if this is your first time using ComfyUI, please test whether it can run on your device before doing the next steps.
+
+2. Add it into `requirements.txt`
+
+3. Run `install.bat` or the pip command mentioned in Installation
+
+![](./examples/example_onnx.png)
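+
+To double-check which execution providers your installed onnxruntime build actually exposes (a quick, optional sanity check), you can run:
+```py
+import onnxruntime as ort
+
+# Prints the providers compiled into this onnxruntime build,
+# e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider']
+print(ort.get_available_providers())
+```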
+
+# Assets files of preprocessors
+* anime_face_segment: [bdsqlsz/qinglong_controlnet-lllite/Annotators/UNet.pth](https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/blob/main/Annotators/UNet.pth), [anime-seg/isnetis.ckpt](https://huggingface.co/skytnt/anime-seg/blob/main/isnetis.ckpt)
+* densepose: [LayerNorm/DensePose-TorchScript-with-hint-image/densepose_r50_fpn_dl.torchscript](https://huggingface.co/LayerNorm/DensePose-TorchScript-with-hint-image/blob/main/densepose_r50_fpn_dl.torchscript)
+* dwpose:
+* * bbox_detector: Either [yzd-v/DWPose/yolox_l.onnx](https://huggingface.co/yzd-v/DWPose/blob/main/yolox_l.onnx), [hr16/yolox-onnx/yolox_l.torchscript.pt](https://huggingface.co/hr16/yolox-onnx/blob/main/yolox_l.torchscript.pt), [hr16/yolo-nas-fp16/yolo_nas_l_fp16.onnx](https://huggingface.co/hr16/yolo-nas-fp16/blob/main/yolo_nas_l_fp16.onnx), [hr16/yolo-nas-fp16/yolo_nas_m_fp16.onnx](https://huggingface.co/hr16/yolo-nas-fp16/blob/main/yolo_nas_m_fp16.onnx), [hr16/yolo-nas-fp16/yolo_nas_s_fp16.onnx](https://huggingface.co/hr16/yolo-nas-fp16/blob/main/yolo_nas_s_fp16.onnx)
+* * pose_estimator: Either [hr16/DWPose-TorchScript-BatchSize5/dw-ll_ucoco_384_bs5.torchscript.pt](https://huggingface.co/hr16/DWPose-TorchScript-BatchSize5/blob/main/dw-ll_ucoco_384_bs5.torchscript.pt), [yzd-v/DWPose/dw-ll_ucoco_384.onnx](https://huggingface.co/yzd-v/DWPose/blob/main/dw-ll_ucoco_384.onnx)
+* animal_pose (ap10k):
+* * bbox_detector: Either [yzd-v/DWPose/yolox_l.onnx](https://huggingface.co/yzd-v/DWPose/blob/main/yolox_l.onnx), [hr16/yolox-onnx/yolox_l.torchscript.pt](https://huggingface.co/hr16/yolox-onnx/blob/main/yolox_l.torchscript.pt), [hr16/yolo-nas-fp16/yolo_nas_l_fp16.onnx](https://huggingface.co/hr16/yolo-nas-fp16/blob/main/yolo_nas_l_fp16.onnx), [hr16/yolo-nas-fp16/yolo_nas_m_fp16.onnx](https://huggingface.co/hr16/yolo-nas-fp16/blob/main/yolo_nas_m_fp16.onnx), [hr16/yolo-nas-fp16/yolo_nas_s_fp16.onnx](https://huggingface.co/hr16/yolo-nas-fp16/blob/main/yolo_nas_s_fp16.onnx)
+* * pose_estimator: Either [hr16/DWPose-TorchScript-BatchSize5/rtmpose-m_ap10k_256_bs5.torchscript.pt](https://huggingface.co/hr16/DWPose-TorchScript-BatchSize5/blob/main/rtmpose-m_ap10k_256_bs5.torchscript.pt), [hr16/UnJIT-DWPose/rtmpose-m_ap10k_256.onnx](https://huggingface.co/hr16/UnJIT-DWPose/blob/main/rtmpose-m_ap10k_256.onnx)
+* hed: [lllyasviel/Annotators/ControlNetHED.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/ControlNetHED.pth)
+* leres: [lllyasviel/Annotators/res101.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/res101.pth), [lllyasviel/Annotators/latest_net_G.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/latest_net_G.pth)
+* lineart: [lllyasviel/Annotators/sk_model.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/sk_model.pth), [lllyasviel/Annotators/sk_model2.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/sk_model2.pth)
+* lineart_anime: [lllyasviel/Annotators/netG.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/netG.pth)
+* manga_line: [lllyasviel/Annotators/erika.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/erika.pth)
+* mesh_graphormer: [hr16/ControlNet-HandRefiner-pruned/graphormer_hand_state_dict.bin](https://huggingface.co/hr16/ControlNet-HandRefiner-pruned/blob/main/graphormer_hand_state_dict.bin), [hr16/ControlNet-HandRefiner-pruned/hrnetv2_w64_imagenet_pretrained.pth](https://huggingface.co/hr16/ControlNet-HandRefiner-pruned/blob/main/hrnetv2_w64_imagenet_pretrained.pth)
+* midas: [lllyasviel/Annotators/dpt_hybrid-midas-501f0c75.pt](https://huggingface.co/lllyasviel/Annotators/blob/main/dpt_hybrid-midas-501f0c75.pt)
+* mlsd: [lllyasviel/Annotators/mlsd_large_512_fp32.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/mlsd_large_512_fp32.pth)
+* normalbae: [lllyasviel/Annotators/scannet.pt](https://huggingface.co/lllyasviel/Annotators/blob/main/scannet.pt)
+* oneformer: [lllyasviel/Annotators/250_16_swin_l_oneformer_ade20k_160k.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/250_16_swin_l_oneformer_ade20k_160k.pth)
+* open_pose: [lllyasviel/Annotators/body_pose_model.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/body_pose_model.pth), [lllyasviel/Annotators/hand_pose_model.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/hand_pose_model.pth), [lllyasviel/Annotators/facenet.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/facenet.pth)
+* pidi: [lllyasviel/Annotators/table5_pidinet.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/table5_pidinet.pth)
+* sam: [dhkim2810/MobileSAM/mobile_sam.pt](https://huggingface.co/dhkim2810/MobileSAM/blob/main/mobile_sam.pt)
+* uniformer: [lllyasviel/Annotators/upernet_global_small.pth](https://huggingface.co/lllyasviel/Annotators/blob/main/upernet_global_small.pth)
+* zoe: [lllyasviel/Annotators/ZoeD_M12_N.pt](https://huggingface.co/lllyasviel/Annotators/blob/main/ZoeD_M12_N.pt)
+* teed: [bdsqlsz/qinglong_controlnet-lllite/7_model.pth](https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/blob/main/Annotators/7_model.pth)
+* depth_anything: Either [LiheYoung/Depth-Anything/checkpoints/depth_anything_vitl14.pth](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitl14.pth), [LiheYoung/Depth-Anything/checkpoints/depth_anything_vitb14.pth](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitb14.pth) or [LiheYoung/Depth-Anything/checkpoints/depth_anything_vits14.pth](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vits14.pth)
+* diffusion_edge: Either [hr16/Diffusion-Edge/diffusion_edge_indoor.pt](https://huggingface.co/hr16/Diffusion-Edge/blob/main/diffusion_edge_indoor.pt), [hr16/Diffusion-Edge/diffusion_edge_urban.pt](https://huggingface.co/hr16/Diffusion-Edge/blob/main/diffusion_edge_urban.pt) or [hr16/Diffusion-Edge/diffusion_edge_natrual.pt](https://huggingface.co/hr16/Diffusion-Edge/blob/main/diffusion_edge_natrual.pt)
+* unimatch: Either [hr16/Unimatch/gmflow-scale2-regrefine6-mixdata.pth](https://huggingface.co/hr16/Unimatch/blob/main/gmflow-scale2-regrefine6-mixdata.pth), [hr16/Unimatch/gmflow-scale2-mixdata.pth](https://huggingface.co/hr16/Unimatch/blob/main/gmflow-scale2-mixdata.pth) or [hr16/Unimatch/gmflow-scale1-mixdata.pth](https://huggingface.co/hr16/Unimatch/blob/main/gmflow-scale1-mixdata.pth)
+* zoe_depth_anything: Either [LiheYoung/Depth-Anything/checkpoints_metric_depth/depth_anything_metric_depth_indoor.pt](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_metric_depth/depth_anything_metric_depth_indoor.pt) or [LiheYoung/Depth-Anything/checkpoints_metric_depth/depth_anything_metric_depth_outdoor.pt](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_metric_depth/depth_anything_metric_depth_outdoor.pt)
+# 2000 Stars 😄
+
+Thanks for y'all's support. I never thought the graph for stars would be linear lol.
diff --git a/comfyui_controlnet_aux/UPDATES.md b/comfyui_controlnet_aux/UPDATES.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b42272ec5ffa61f915c96bd45d6ffb34a6e0e4d
--- /dev/null
+++ b/comfyui_controlnet_aux/UPDATES.md
@@ -0,0 +1,44 @@
+* `AIO Aux Preprocessor` integrating all loadable aux preprocessors as dropdown options. Easy to copy, paste and get the preprocessor faster.
+* Added OpenPose-format JSON output from OpenPose Preprocessor and DWPose Preprocessor. Check [here](#faces-and-poses).
+* Fixed wrong model path when downloading DWPose.
+* Make hint images less blurry.
+* Added `resolution` option, `PixelPerfectResolution` and `HintImageEnchance` nodes (TODO: Documentation).
+* Added `RAFT Optical Flow Embedder` for TemporalNet2 (TODO: Workflow example).
+* Fixed opencv's conflicts between this extension, [ReActor](https://github.com/Gourieff/comfyui-reactor-node) and Roop. Thanks `Gourieff` for [the solution](https://github.com/Fannovel16/comfyui_controlnet_aux/issues/7#issuecomment-1734319075)!
+* RAFT is removed as the code behind it doesn't match what the original code does
+* Changed `lineart`'s display name from `Normal Lineart` to `Realistic Lineart`. This change won't affect old workflows
+* Added support for `onnxruntime` to speed-up DWPose (see the Q&A)
+* Fixed TypeError: expected size to be one of int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], but got size with types [, ]: [Issue](https://github.com/Fannovel16/comfyui_controlnet_aux/issues/2), [PR](https://github.com/Fannovel16/comfyui_controlnet_aux/pull/71)
+* Fixed ImageGenResolutionFromImage mishape (https://github.com/Fannovel16/comfyui_controlnet_aux/pull/74)
+* Fixed LeRes and MiDaS's incompatibility with the MPS device
+* Fixed checking the DWPose onnxruntime session multiple times: https://github.com/Fannovel16/comfyui_controlnet_aux/issues/89
+* Added `Anime Face Segmentor` (in `ControlNet Preprocessors/Semantic Segmentation`) for [ControlNet AnimeFaceSegmentV2](https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite#animefacesegmentv2). Check [here](#anime-face-segmentor)
+* Changed download functions and fixed the [download error](https://github.com/Fannovel16/comfyui_controlnet_aux/issues/39): [PR](https://github.com/Fannovel16/comfyui_controlnet_aux/pull/96)
+* Cache the DWPose onnxruntime session during the first use of the DWPose node instead of at ComfyUI startup
+* Added alternative YOLOX models for faster speed when using DWPose
+* Added alternative DWPose models
+* Implemented the preprocessor for [AnimalPose ControlNet](https://github.com/abehonest/ControlNet_AnimalPose/tree/main). Check [Animal Pose AP-10K](#animal-pose-ap-10k)
+* Added YOLO-NAS models which are drop-in replacements of YOLOX
+* Fixed Openpose Face/Hands no longer detecting: https://github.com/Fannovel16/comfyui_controlnet_aux/issues/54
+* Added TorchScript implementation of DWPose and AnimalPose
+* Added TorchScript implementation of DensePose from [Colab notebook](https://colab.research.google.com/drive/16hcaaKs210ivpxjoyGNuvEXZD4eqOOSQ) which doesn't require detectron2. [Example](#densepose). Thanks [@LayerNome](https://github.com/Layer-norm) for fixing related bugs.
+* Added Standard Lineart Preprocessor
+* Fixed OpenPose misplacements in some cases
+* Added Mesh Graphormer - Hand Depth Map & Mask
+* Misaligned hands bug from MeshGraphormer was fixed
+* Added more mask options for MeshGraphormer
+* Added Save Pose Keypoint node for editing
+* Added Unimatch Optical Flow
+* Added Depth Anything & Zoe Depth Anything
+* Removed the resolution field from Unimatch Optical Flow as interpolating optical flow seems unstable
+* Added TEED Soft-Edge Preprocessor
+* Added DiffusionEdge
+* Added Image Luminance and Image Intensity
+* Added Normal DSINE
+* Added TTPlanet Tile (09/05/2024, DD/MM/YYYY)
+* Added AnyLine, Metric3D (18/05/2024)
+* Added Depth Anything V2 (16/06/2024)
+* Added Union model of ControlNet and preprocessors
+![345832280-edf41dab-7619-494c-9f60-60ec1f8789cb](https://github.com/user-attachments/assets/aa55f57c-cad7-48e6-84d3-8f506d847989)
+* Refactored INPUT_TYPES and added an Execute All node while learning about [Execution Model Inversion](https://github.com/comfyanonymous/ComfyUI/pull/2666)
+* Added scale_stick_for_xinsr_cn (https://github.com/Fannovel16/comfyui_controlnet_aux/issues/447) (09/04/2024)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/__init__.py b/comfyui_controlnet_aux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf769583390e3ca91138675efbb482264c728ad4
--- /dev/null
+++ b/comfyui_controlnet_aux/__init__.py
@@ -0,0 +1,214 @@
+import sys, os
+from .utils import here, define_preprocessor_inputs, INPUT
+from pathlib import Path
+import traceback
+import importlib
+from .log import log, blue_text, cyan_text, get_summary, get_label
+from .hint_image_enchance import NODE_CLASS_MAPPINGS as HIE_NODE_CLASS_MAPPINGS
+from .hint_image_enchance import NODE_DISPLAY_NAME_MAPPINGS as HIE_NODE_DISPLAY_NAME_MAPPINGS
+#Ref: https://github.com/comfyanonymous/ComfyUI/blob/76d53c4622fc06372975ed2a43ad345935b8a551/nodes.py#L17
+sys.path.insert(0, str(Path(here, "src").resolve()))
+for pkg_name in ["custom_controlnet_aux", "custom_mmpkg"]:
+ sys.path.append(str(Path(here, "src", pkg_name).resolve()))
+
+#Enable CPU fallback for ops not supported by MPS, like upsample_bicubic2d.out
+#https://github.com/pytorch/pytorch/issues/77764
+#https://github.com/Fannovel16/comfyui_controlnet_aux/issues/2#issuecomment-1763579485
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = os.getenv("PYTORCH_ENABLE_MPS_FALLBACK", '1')
+
+
+def load_nodes():
+ shorted_errors = []
+ full_error_messages = []
+ node_class_mappings = {}
+ node_display_name_mappings = {}
+
+ for filename in (here / "node_wrappers").iterdir():
+ module_name = filename.stem
+ if module_name.startswith('.'): continue #Skip hidden files created by the OS (e.g. [.DS_Store](https://en.wikipedia.org/wiki/.DS_Store))
+ try:
+ module = importlib.import_module(
+ f".node_wrappers.{module_name}", package=__package__
+ )
+ node_class_mappings.update(getattr(module, "NODE_CLASS_MAPPINGS"))
+ if hasattr(module, "NODE_DISPLAY_NAME_MAPPINGS"):
+ node_display_name_mappings.update(getattr(module, "NODE_DISPLAY_NAME_MAPPINGS"))
+
+ log.debug(f"Imported {module_name} nodes")
+
+ except AttributeError:
+ pass # wip nodes
+ except Exception:
+ error_message = traceback.format_exc()
+ full_error_messages.append(error_message)
+ error_message = error_message.splitlines()[-1]
+ shorted_errors.append(
+ f"Failed to import module {module_name} because {error_message}"
+ )
+
+ if len(shorted_errors) > 0:
+ full_err_log = '\n\n'.join(full_error_messages)
+ print(f"\n\nFull error log from comfyui_controlnet_aux: \n{full_err_log}\n\n")
+ log.info(
+ f"Some nodes failed to load:\n\t"
+ + "\n\t".join(shorted_errors)
+ + "\n\n"
+ + "Check that you properly installed the dependencies.\n"
+ + "If you think this is a bug, please report it on the github page (https://github.com/Fannovel16/comfyui_controlnet_aux/issues)"
+ )
+ return node_class_mappings, node_display_name_mappings
+
+AUX_NODE_MAPPINGS, AUX_DISPLAY_NAME_MAPPINGS = load_nodes()
+
+#For nodes that don't map image to image or that have special requirements
+AIO_NOT_SUPPORTED = ["InpaintPreprocessor", "MeshGraphormer+ImpactDetector-DepthMapPreprocessor", "DiffusionEdge_Preprocessor"]
+AIO_NOT_SUPPORTED += ["SavePoseKpsAsJsonFile", "FacialPartColoringFromPoseKps", "UpperBodyTrackingFromPoseKps", "RenderPeopleKps", "RenderAnimalKps"]
+AIO_NOT_SUPPORTED += ["Unimatch_OptFlowPreprocessor", "MaskOptFlow"]
+
+def preprocessor_options():
+ auxs = list(AUX_NODE_MAPPINGS.keys())
+ auxs.insert(0, "none")
+ for name in AIO_NOT_SUPPORTED:
+ if name in auxs:
+ auxs.remove(name)
+ return auxs
+
+
+PREPROCESSOR_OPTIONS = preprocessor_options()
+
+class AIO_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ preprocessor=INPUT.COMBO(PREPROCESSOR_OPTIONS, default="none"),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors"
+
+ def execute(self, preprocessor, image, resolution=512):
+ if preprocessor == "none":
+ return (image, )
+ else:
+ aux_class = AUX_NODE_MAPPINGS[preprocessor]
+ input_types = aux_class.INPUT_TYPES()
+ input_types = {
+ **input_types["required"],
+ **(input_types["optional"] if "optional" in input_types else {})
+ }
+ params = {}
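+            # Build the call kwargs: pass through image/resolution, otherwise use each input's
+            # declared default, falling back to 0 / 0.0 for bare INT / FLOAT inputs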
+ for name, input_type in input_types.items():
+ if name == "image":
+ params[name] = image
+ continue
+
+ if name == "resolution":
+ params[name] = resolution
+ continue
+
+ if len(input_type) == 2 and ("default" in input_type[1]):
+ params[name] = input_type[1]["default"]
+ continue
+
+ default_values = { "INT": 0, "FLOAT": 0.0 }
+ if input_type[0] in default_values:
+ params[name] = default_values[input_type[0]]
+
+ return getattr(aux_class(), aux_class.FUNCTION)(**params)
+
+class ControlNetAuxSimpleAddText:
+ @classmethod
+ def INPUT_TYPES(s):
+ return dict(
+ required=dict(image=INPUT.IMAGE(), text=INPUT.STRING())
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+ CATEGORY = "ControlNet Preprocessors"
+ def execute(self, image, text):
+ from PIL import Image, ImageDraw, ImageFont
+ import numpy as np
+ import torch
+
+ font = ImageFont.truetype(str((here / "NotoSans-Regular.ttf").resolve()), 40)
+        img = Image.fromarray((image[0].cpu().numpy() * 255.).astype(np.uint8))
+ ImageDraw.Draw(img).text((0,0), text, fill=(0,255,0), font=font)
+ return (torch.from_numpy(np.array(img)).unsqueeze(0) / 255.,)
+
+class ExecuteAllControlNetPreprocessors:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors"
+
+ def execute(self, image, resolution=512):
+ try:
+ from comfy_execution.graph_utils import GraphBuilder
+        except ImportError:
+            raise RuntimeError("ExecuteAllControlNetPreprocessors requires [Execution Model Inversion](https://github.com/comfyanonymous/ComfyUI/commit/5cfe38). Update ComfyUI/SwarmUI to get this feature")
+
+ graph = GraphBuilder()
+ curr_outputs = []
+ for preprocc in PREPROCESSOR_OPTIONS:
+ preprocc_node = graph.node("AIO_Preprocessor", preprocessor=preprocc, image=image, resolution=resolution)
+ hint_img = preprocc_node.out(0)
+ add_text_node = graph.node("ControlNetAuxSimpleAddText", image=hint_img, text=preprocc)
+ curr_outputs.append(add_text_node.out(0))
+
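+        # Merge the per-preprocessor results pairwise with ImageBatch nodes until a single batched image remains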
+ while len(curr_outputs) > 1:
+ _outputs = []
+ for i in range(0, len(curr_outputs), 2):
+ if i+1 < len(curr_outputs):
+ image_batch = graph.node("ImageBatch", image1=curr_outputs[i], image2=curr_outputs[i+1])
+ _outputs.append(image_batch.out(0))
+ else:
+ _outputs.append(curr_outputs[i])
+ curr_outputs = _outputs
+
+ return {
+ "result": (curr_outputs[0],),
+ "expand": graph.finalize(),
+ }
+
+class ControlNetPreprocessorSelector:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "preprocessor": (PREPROCESSOR_OPTIONS,),
+ }
+ }
+
+ RETURN_TYPES = (PREPROCESSOR_OPTIONS,)
+ RETURN_NAMES = ("preprocessor",)
+ FUNCTION = "get_preprocessor"
+
+ CATEGORY = "ControlNet Preprocessors"
+
+ def get_preprocessor(self, preprocessor: str):
+ return (preprocessor,)
+
+
+NODE_CLASS_MAPPINGS = {
+ **AUX_NODE_MAPPINGS,
+ "AIO_Preprocessor": AIO_Preprocessor,
+ "ControlNetPreprocessorSelector": ControlNetPreprocessorSelector,
+ **HIE_NODE_CLASS_MAPPINGS,
+ "ExecuteAllControlNetPreprocessors": ExecuteAllControlNetPreprocessors,
+ "ControlNetAuxSimpleAddText": ControlNetAuxSimpleAddText
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+ **AUX_DISPLAY_NAME_MAPPINGS,
+ "AIO_Preprocessor": "AIO Aux Preprocessor",
+ "ControlNetPreprocessorSelector": "Preprocessor Selector",
+ **HIE_NODE_DISPLAY_NAME_MAPPINGS,
+ "ExecuteAllControlNetPreprocessors": "Execute All ControlNet Preprocessors"
+}
diff --git a/comfyui_controlnet_aux/__pycache__/__init__.cpython-312.pyc b/comfyui_controlnet_aux/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22b2b4c762a4b0f63da75fcf7d48da61461ebbcd
Binary files /dev/null and b/comfyui_controlnet_aux/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/__pycache__/hint_image_enchance.cpython-312.pyc b/comfyui_controlnet_aux/__pycache__/hint_image_enchance.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2a7ce80227c6601518740f202561f1d3fdaa304
Binary files /dev/null and b/comfyui_controlnet_aux/__pycache__/hint_image_enchance.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/__pycache__/log.cpython-312.pyc b/comfyui_controlnet_aux/__pycache__/log.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e1408aa7b3a01420fe73f749e754610e8f9c2d4
Binary files /dev/null and b/comfyui_controlnet_aux/__pycache__/log.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/__pycache__/lvminthin.cpython-312.pyc b/comfyui_controlnet_aux/__pycache__/lvminthin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51580c965348dd45f0946f02e635bcf2043905c9
Binary files /dev/null and b/comfyui_controlnet_aux/__pycache__/lvminthin.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/__pycache__/utils.cpython-312.pyc b/comfyui_controlnet_aux/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79b45d5ed2a77b60323371847245d5139397ee5d
Binary files /dev/null and b/comfyui_controlnet_aux/__pycache__/utils.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/.gitignore b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f59ec20aabf5842d237244ece8c81ab184faeac1
--- /dev/null
+++ b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/.gitignore
@@ -0,0 +1 @@
+*
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/body_pose_model.pth.metadata b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/body_pose_model.pth.metadata
new file mode 100644
index 0000000000000000000000000000000000000000..00a5bb32ae1a51c2cd97d6a7e62fcd5fd718523e
--- /dev/null
+++ b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/body_pose_model.pth.metadata
@@ -0,0 +1,3 @@
+982e7edaec38759d914a963c48c4726685de7d96
+25a948c16078b0f08e236bda51a385d855ef4c153598947c28c0d47ed94bb746
+1730988131.7718472
diff --git a/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/facenet.pth.metadata b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/facenet.pth.metadata
new file mode 100644
index 0000000000000000000000000000000000000000..ef938a898e0cbf7c9e5731e9b97df06bf0ed44da
--- /dev/null
+++ b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/facenet.pth.metadata
@@ -0,0 +1,3 @@
+982e7edaec38759d914a963c48c4726685de7d96
+8beb52e548624ffcc4aed12af7aee7dcbfaeea420c75609fee999fe7add79d43
+1730988141.1523578
diff --git a/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/hand_pose_model.pth.metadata b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/hand_pose_model.pth.metadata
new file mode 100644
index 0000000000000000000000000000000000000000..907360f7f2845ed5a2d2e95651782baeca02b774
--- /dev/null
+++ b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/.cache/huggingface/download/hand_pose_model.pth.metadata
@@ -0,0 +1,3 @@
+982e7edaec38759d914a963c48c4726685de7d96
+b76b00d1750901abd07b9f9d8c98cc3385b8fe834a26d4b4f0aad439e75fc600
+1730988136.5126987
diff --git a/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/body_pose_model.pth b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/body_pose_model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9acb77e68f31906a8875f1daef2f3f7ef94acb1e
--- /dev/null
+++ b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/body_pose_model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25a948c16078b0f08e236bda51a385d855ef4c153598947c28c0d47ed94bb746
+size 209267595
diff --git a/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/facenet.pth b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/facenet.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ccfac27ffec2f25eb02dad5f52512872eb3b53e1
--- /dev/null
+++ b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/facenet.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8beb52e548624ffcc4aed12af7aee7dcbfaeea420c75609fee999fe7add79d43
+size 153718792
diff --git a/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/hand_pose_model.pth b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/hand_pose_model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f23ccf3413cc8ac8581a82338a3037bc10d573f0
--- /dev/null
+++ b/comfyui_controlnet_aux/ckpts/lllyasviel/Annotators/hand_pose_model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b76b00d1750901abd07b9f9d8c98cc3385b8fe834a26d4b4f0aad439e75fc600
+size 147341049
diff --git a/comfyui_controlnet_aux/config.example.yaml b/comfyui_controlnet_aux/config.example.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ddc9ff17f91558fce904e2386914aa5e7a53705a
--- /dev/null
+++ b/comfyui_controlnet_aux/config.example.yaml
@@ -0,0 +1,20 @@
+# this is an example for config.yaml file, you can rename it to config.yaml if you want to use it
+# ###############################################################################################
+# This path is the base folder for custom preprocessor models. Default is "./ckpts".
+# You can also use absolute paths like "/root/ComfyUI/custom_nodes/comfyui_controlnet_aux/ckpts" or "D:\\ComfyUI\\custom_nodes\\comfyui_controlnet_aux\\ckpts"
+annotator_ckpts_path: "./ckpts"
+# ###############################################################################################
+# This path is for downloading temporary files.
+# You SHOULD use an absolute path for this, like "D:\\temp". DO NOT use relative paths. Empty for default.
+custom_temp_path:
+# ###############################################################################################
+# if you have already downloaded ckpts via huggingface hub into the default cache path (e.g. ~/.cache/huggingface/hub), you can set this to True to use symlinks to save space
+USE_SYMLINKS: False
+# ###############################################################################################
+# EP_list is a list of execution providers for onnxruntime; if one of them is not available or not working well, you can delete that provider from here (config.yaml)
+# you can find all available providers here: https://onnxruntime.ai/docs/execution-providers
+# for example, if you have CUDA installed, you can set it to: ["CUDAExecutionProvider", "CPUExecutionProvider"]
+# empty list or only keep ["CPUExecutionProvider"] means you use cv2.dnn.readNetFromONNX to load onnx models
+# if your onnx models can only run on the CPU or have other issues, we recommend using pt model instead.
+# default value is ["CUDAExecutionProvider", "DirectMLExecutionProvider", "OpenVINOExecutionProvider", "ROCMExecutionProvider", "CPUExecutionProvider"]
+EP_list: ["CUDAExecutionProvider", "DirectMLExecutionProvider", "OpenVINOExecutionProvider", "ROCMExecutionProvider", "CPUExecutionProvider"]
diff --git a/comfyui_controlnet_aux/dev_interface.py b/comfyui_controlnet_aux/dev_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..802992142f4326197a2246ee44c29132a7bf2a5d
--- /dev/null
+++ b/comfyui_controlnet_aux/dev_interface.py
@@ -0,0 +1,6 @@
+from pathlib import Path
+from utils import here
+import sys
+sys.path.append(str(Path(here, "src")))
+
+from custom_controlnet_aux import *
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/examples/CNAuxBanner.jpg b/comfyui_controlnet_aux/examples/CNAuxBanner.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3d02d5774493cb9bfc9e3d3e52b6a18fb6c84405
Binary files /dev/null and b/comfyui_controlnet_aux/examples/CNAuxBanner.jpg differ
diff --git a/comfyui_controlnet_aux/examples/ExecuteAll.png b/comfyui_controlnet_aux/examples/ExecuteAll.png
new file mode 100644
index 0000000000000000000000000000000000000000..238e96e327b8a0a2c738591f91a0a77b96a7909c
--- /dev/null
+++ b/comfyui_controlnet_aux/examples/ExecuteAll.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9e9d1f3cb4d13005818cf7c14e04be0635b44e180776dce7f02f715e246d18e
+size 10007102
diff --git a/comfyui_controlnet_aux/examples/ExecuteAll1.jpg b/comfyui_controlnet_aux/examples/ExecuteAll1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c01db1275c3e6d8a334d6d5b38f9c6cafbd4af1d
--- /dev/null
+++ b/comfyui_controlnet_aux/examples/ExecuteAll1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6750043f6866ec52ae348ce108f8b7361c6e30a744cef162f289bbb2296cdad9
+size 1171712
diff --git a/comfyui_controlnet_aux/examples/ExecuteAll2.jpg b/comfyui_controlnet_aux/examples/ExecuteAll2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..03796e2d40734f108d454bfaa073f7dfe82d239a
--- /dev/null
+++ b/comfyui_controlnet_aux/examples/ExecuteAll2.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:281478f1d39ab9d2ba8b0aa5d3ab33a34c4dc993e078b3bcca9ffbf024f2505b
+size 1021442
diff --git a/comfyui_controlnet_aux/examples/comfyui-controlnet-aux-logo.png b/comfyui_controlnet_aux/examples/comfyui-controlnet-aux-logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..b23a4aea9b728d40475b54330e79b541e1416ca8
Binary files /dev/null and b/comfyui_controlnet_aux/examples/comfyui-controlnet-aux-logo.png differ
diff --git a/comfyui_controlnet_aux/examples/example_animal_pose.png b/comfyui_controlnet_aux/examples/example_animal_pose.png
new file mode 100644
index 0000000000000000000000000000000000000000..11443aff62ef27bfe924be12c7b23a666fa00ff4
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_animal_pose.png differ
diff --git a/comfyui_controlnet_aux/examples/example_anime_face_segmentor.png b/comfyui_controlnet_aux/examples/example_anime_face_segmentor.png
new file mode 100644
index 0000000000000000000000000000000000000000..047d07b2fc65a1a10d72d489d8bd73fb586403a4
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_anime_face_segmentor.png differ
diff --git a/comfyui_controlnet_aux/examples/example_anyline.png b/comfyui_controlnet_aux/examples/example_anyline.png
new file mode 100644
index 0000000000000000000000000000000000000000..c3a936d42efba451bb191fe4d9d4f2f7a627410b
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_anyline.png differ
diff --git a/comfyui_controlnet_aux/examples/example_densepose.png b/comfyui_controlnet_aux/examples/example_densepose.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d971ea70922e273d2d19c669a43319cf6ef2e9c
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_densepose.png differ
diff --git a/comfyui_controlnet_aux/examples/example_depth_anything.png b/comfyui_controlnet_aux/examples/example_depth_anything.png
new file mode 100644
index 0000000000000000000000000000000000000000..c882681cbf223985fa6c373d3a1906707c0d6a81
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_depth_anything.png differ
diff --git a/comfyui_controlnet_aux/examples/example_depth_anything_v2.png b/comfyui_controlnet_aux/examples/example_depth_anything_v2.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc7a2250b8ef258f6f1a20be904f2b4751e5d328
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_depth_anything_v2.png differ
diff --git a/comfyui_controlnet_aux/examples/example_dsine.png b/comfyui_controlnet_aux/examples/example_dsine.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab7a7c5aaf5f32bbac671dfc35df646d37c667ed
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_dsine.png differ
diff --git a/comfyui_controlnet_aux/examples/example_marigold.png b/comfyui_controlnet_aux/examples/example_marigold.png
new file mode 100644
index 0000000000000000000000000000000000000000..067191fb2ccaca09b61a4f49159c16c4377e1932
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_marigold.png differ
diff --git a/comfyui_controlnet_aux/examples/example_marigold_flat.jpg b/comfyui_controlnet_aux/examples/example_marigold_flat.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ca1013c4d80c6d322e992c3a721176054e137676
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_marigold_flat.jpg differ
diff --git a/comfyui_controlnet_aux/examples/example_mesh_graphormer.png b/comfyui_controlnet_aux/examples/example_mesh_graphormer.png
new file mode 100644
index 0000000000000000000000000000000000000000..90b162d7c91beda88ccf81a4358ba44914596fd2
--- /dev/null
+++ b/comfyui_controlnet_aux/examples/example_mesh_graphormer.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7692c5df1ee107b95c02455eea4e88f878b59313a98b5447736c9c417c0e182
+size 5481152
diff --git a/comfyui_controlnet_aux/examples/example_metric3d.png b/comfyui_controlnet_aux/examples/example_metric3d.png
new file mode 100644
index 0000000000000000000000000000000000000000..8db42c09e9c2115a81c816d2f4322202ea6a3e37
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_metric3d.png differ
diff --git a/comfyui_controlnet_aux/examples/example_onnx.png b/comfyui_controlnet_aux/examples/example_onnx.png
new file mode 100644
index 0000000000000000000000000000000000000000..f3f9ad5a45e2ce33b03883446b55fb487c059a00
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_onnx.png differ
diff --git a/comfyui_controlnet_aux/examples/example_recolor.png b/comfyui_controlnet_aux/examples/example_recolor.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab94512f2fd62b69ac9602890a037bcbd9d7cb59
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_recolor.png differ
diff --git a/comfyui_controlnet_aux/examples/example_save_kps.png b/comfyui_controlnet_aux/examples/example_save_kps.png
new file mode 100644
index 0000000000000000000000000000000000000000..fca7a6327d04c322532b3a63d205d7ed0e7dd7da
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_save_kps.png differ
diff --git a/comfyui_controlnet_aux/examples/example_teed.png b/comfyui_controlnet_aux/examples/example_teed.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba77307c7e0716a96afd26e7b26ad7819ab3bf2a
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_teed.png differ
diff --git a/comfyui_controlnet_aux/examples/example_torchscript.png b/comfyui_controlnet_aux/examples/example_torchscript.png
new file mode 100644
index 0000000000000000000000000000000000000000..0a685f9cea265c5bc2057f567da0d93614f8ce9a
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_torchscript.png differ
diff --git a/comfyui_controlnet_aux/examples/example_unimatch.png b/comfyui_controlnet_aux/examples/example_unimatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..cd51e57a195c6a66c5c003472df6f0e8283785b6
Binary files /dev/null and b/comfyui_controlnet_aux/examples/example_unimatch.png differ
diff --git a/comfyui_controlnet_aux/hint_image_enchance.py b/comfyui_controlnet_aux/hint_image_enchance.py
new file mode 100644
index 0000000000000000000000000000000000000000..b931e9dad3d159ad5412a9960c7a482e5621b3b9
--- /dev/null
+++ b/comfyui_controlnet_aux/hint_image_enchance.py
@@ -0,0 +1,233 @@
+from .log import log
+from .utils import ResizeMode, safe_numpy
+import numpy as np
+import torch
+import cv2
+from .utils import get_unique_axis0
+from .lvminthin import nake_nms, lvmin_thin
+
+MAX_IMAGEGEN_RESOLUTION = 8192 #https://github.com/comfyanonymous/ComfyUI/blob/c910b4a01ca58b04e5d4ab4c747680b996ada02b/nodes.py#L42
+RESIZE_MODES = [ResizeMode.RESIZE.value, ResizeMode.INNER_FIT.value, ResizeMode.OUTER_FIT.value]
+
+#Port from https://github.com/Mikubill/sd-webui-controlnet/blob/e67e017731aad05796b9615dc6eadce911298ea1/internal_controlnet/external_code.py#L89
+class PixelPerfectResolution:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "original_image": ("IMAGE", ),
+ "image_gen_width": ("INT", {"default": 512, "min": 64, "max": MAX_IMAGEGEN_RESOLUTION, "step": 8}),
+ "image_gen_height": ("INT", {"default": 512, "min": 64, "max": MAX_IMAGEGEN_RESOLUTION, "step": 8}),
+ #https://github.com/comfyanonymous/ComfyUI/blob/c910b4a01ca58b04e5d4ab4c747680b996ada02b/nodes.py#L854
+ "resize_mode": (RESIZE_MODES, {"default": ResizeMode.RESIZE.value})
+ }
+ }
+
+ RETURN_TYPES = ("INT",)
+ RETURN_NAMES = ("RESOLUTION (INT)", )
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors"
+
+ def execute(self, original_image, image_gen_width, image_gen_height, resize_mode):
+ _, raw_H, raw_W, _ = original_image.shape
+
+ k0 = float(image_gen_height) / float(raw_H)
+ k1 = float(image_gen_width) / float(raw_W)
+
+ if resize_mode == ResizeMode.OUTER_FIT.value:
+ estimation = min(k0, k1) * float(min(raw_H, raw_W))
+ else:
+ estimation = max(k0, k1) * float(min(raw_H, raw_W))
+
+ log.debug(f"Pixel Perfect Computation:")
+ log.debug(f"resize_mode = {resize_mode}")
+ log.debug(f"raw_H = {raw_H}")
+ log.debug(f"raw_W = {raw_W}")
+ log.debug(f"target_H = {image_gen_height}")
+ log.debug(f"target_W = {image_gen_width}")
+ log.debug(f"estimation = {estimation}")
+
+ return (int(np.round(estimation)), )
+
+class HintImageEnchance:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "hint_image": ("IMAGE", ),
+ "image_gen_width": ("INT", {"default": 512, "min": 64, "max": MAX_IMAGEGEN_RESOLUTION, "step": 8}),
+ "image_gen_height": ("INT", {"default": 512, "min": 64, "max": MAX_IMAGEGEN_RESOLUTION, "step": 8}),
+ #https://github.com/comfyanonymous/ComfyUI/blob/c910b4a01ca58b04e5d4ab4c747680b996ada02b/nodes.py#L854
+ "resize_mode": (RESIZE_MODES, {"default": ResizeMode.RESIZE.value})
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors"
+ def execute(self, hint_image, image_gen_width, image_gen_height, resize_mode):
+ outs = []
+ for single_hint_image in hint_image:
+ np_hint_image = np.asarray(single_hint_image * 255., dtype=np.uint8)
+
+ if resize_mode == ResizeMode.RESIZE.value:
+ np_hint_image = self.execute_resize(np_hint_image, image_gen_width, image_gen_height)
+ elif resize_mode == ResizeMode.OUTER_FIT.value:
+ np_hint_image = self.execute_outer_fit(np_hint_image, image_gen_width, image_gen_height)
+ else:
+ np_hint_image = self.execute_inner_fit(np_hint_image, image_gen_width, image_gen_height)
+
+ outs.append(torch.from_numpy(np_hint_image.astype(np.float32) / 255.0))
+
+ return (torch.stack(outs, dim=0),)
+
+ def execute_resize(self, detected_map, w, h):
+ detected_map = self.high_quality_resize(detected_map, (w, h))
+ detected_map = safe_numpy(detected_map)
+ return detected_map
+
+ def execute_outer_fit(self, detected_map, w, h):
+ old_h, old_w, _ = detected_map.shape
+ old_w = float(old_w)
+ old_h = float(old_h)
+ k0 = float(h) / old_h
+ k1 = float(w) / old_w
+ safeint = lambda x: int(np.round(x))
+ k = min(k0, k1)
+
+ borders = np.concatenate([detected_map[0, :, :], detected_map[-1, :, :], detected_map[:, 0, :], detected_map[:, -1, :]], axis=0)
+ high_quality_border_color = np.median(borders, axis=0).astype(detected_map.dtype)
+ if len(high_quality_border_color) == 4:
+ # Inpaint hijack
+ high_quality_border_color[3] = 255
+ high_quality_background = np.tile(high_quality_border_color[None, None], [h, w, 1])
+ detected_map = self.high_quality_resize(detected_map, (safeint(old_w * k), safeint(old_h * k)))
+ new_h, new_w, _ = detected_map.shape
+ pad_h = max(0, (h - new_h) // 2)
+ pad_w = max(0, (w - new_w) // 2)
+ high_quality_background[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = detected_map
+ detected_map = high_quality_background
+ detected_map = safe_numpy(detected_map)
+ return detected_map
+
+ def execute_inner_fit(self, detected_map, w, h):
+ old_h, old_w, _ = detected_map.shape
+ old_w = float(old_w)
+ old_h = float(old_h)
+ k0 = float(h) / old_h
+ k1 = float(w) / old_w
+ safeint = lambda x: int(np.round(x))
+ k = max(k0, k1)
+
+ detected_map = self.high_quality_resize(detected_map, (safeint(old_w * k), safeint(old_h * k)))
+ new_h, new_w, _ = detected_map.shape
+ pad_h = max(0, (new_h - h) // 2)
+ pad_w = max(0, (new_w - w) // 2)
+ detected_map = detected_map[pad_h:pad_h+h, pad_w:pad_w+w]
+ detected_map = safe_numpy(detected_map)
+ return detected_map
+
+ def high_quality_resize(self, x, size):
+ # Written by lvmin
+ # Super high-quality control map up-scaling, considering binary, seg, and one-pixel edges
+
+ inpaint_mask = None
+ if x.ndim == 3 and x.shape[2] == 4:
+ inpaint_mask = x[:, :, 3]
+ x = x[:, :, 0:3]
+
+ if x.shape[0] != size[1] or x.shape[1] != size[0]:
+ new_size_is_smaller = (size[0] * size[1]) < (x.shape[0] * x.shape[1])
+ new_size_is_bigger = (size[0] * size[1]) > (x.shape[0] * x.shape[1])
+ unique_color_count = len(get_unique_axis0(x.reshape(-1, x.shape[2])))
+ is_one_pixel_edge = False
+ is_binary = False
+ if unique_color_count == 2:
+ is_binary = np.min(x) < 16 and np.max(x) > 240
+ if is_binary:
+ xc = x
+ xc = cv2.erode(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
+ xc = cv2.dilate(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
+ one_pixel_edge_count = np.where(xc < x)[0].shape[0]
+ all_edge_count = np.where(x > 127)[0].shape[0]
+ is_one_pixel_edge = one_pixel_edge_count * 2 > all_edge_count
+
+ if 2 < unique_color_count < 200:
+ interpolation = cv2.INTER_NEAREST
+ elif new_size_is_smaller:
+ interpolation = cv2.INTER_AREA
+ else:
+ interpolation = cv2.INTER_CUBIC # Must be CUBIC because we now use nms. NEVER CHANGE THIS
+
+ y = cv2.resize(x, size, interpolation=interpolation)
+ if inpaint_mask is not None:
+ inpaint_mask = cv2.resize(inpaint_mask, size, interpolation=interpolation)
+
+ if is_binary:
+ y = np.mean(y.astype(np.float32), axis=2).clip(0, 255).astype(np.uint8)
+ if is_one_pixel_edge:
+ y = nake_nms(y)
+ _, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+ y = lvmin_thin(y, prunings=new_size_is_bigger)
+ else:
+ _, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+ y = np.stack([y] * 3, axis=2)
+ else:
+ y = x
+
+ if inpaint_mask is not None:
+ inpaint_mask = (inpaint_mask > 127).astype(np.float32) * 255.0
+ inpaint_mask = inpaint_mask[:, :, None].clip(0, 255).astype(np.uint8)
+ y = np.concatenate([y, inpaint_mask], axis=2)
+
+ return y
+
+
+class ImageGenResolutionFromLatent:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": { "latent": ("LATENT", ) }
+ }
+
+ RETURN_TYPES = ("INT", "INT")
+ RETURN_NAMES = ("IMAGE_GEN_WIDTH (INT)", "IMAGE_GEN_HEIGHT (INT)")
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors"
+
+ def execute(self, latent):
+ _, _, H, W = latent["samples"].shape
+ return (W * 8, H * 8)
+
+class ImageGenResolutionFromImage:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": { "image": ("IMAGE", ) }
+ }
+
+ RETURN_TYPES = ("INT", "INT")
+ RETURN_NAMES = ("IMAGE_GEN_WIDTH (INT)", "IMAGE_GEN_HEIGHT (INT)")
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors"
+
+ def execute(self, image):
+ _, H, W, _ = image.shape
+ return (W, H)
+
+NODE_CLASS_MAPPINGS = {
+ "PixelPerfectResolution": PixelPerfectResolution,
+ "ImageGenResolutionFromImage": ImageGenResolutionFromImage,
+ "ImageGenResolutionFromLatent": ImageGenResolutionFromLatent,
+ "HintImageEnchance": HintImageEnchance
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "PixelPerfectResolution": "Pixel Perfect Resolution",
+ "ImageGenResolutionFromImage": "Generation Resolution From Image",
+ "ImageGenResolutionFromLatent": "Generation Resolution From Latent",
+ "HintImageEnchance": "Enchance And Resize Hint Images"
+}
\ No newline at end of file
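For reference, the arithmetic in `PixelPerfectResolution.execute` reduces to a few lines; the sketch below reproduces it outside ComfyUI with made-up example dimensions.

```python
# Standalone sketch of the pixel-perfect estimation above; numbers are illustrative.
import numpy as np

def pixel_perfect(raw_h, raw_w, gen_h, gen_w, outer_fit=False):
    k0 = gen_h / raw_h
    k1 = gen_w / raw_w
    k = min(k0, k1) if outer_fit else max(k0, k1)   # OUTER_FIT uses the smaller scale, the other modes the larger
    return int(np.round(k * min(raw_h, raw_w)))

# A 1024x768 source targeted at 512x512:
print(pixel_perfect(1024, 768, 512, 512))                  # 512 (resize / inner fit)
print(pixel_perfect(1024, 768, 512, 512, outer_fit=True))  # 384 (outer fit)
```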
diff --git a/comfyui_controlnet_aux/install.bat b/comfyui_controlnet_aux/install.bat
new file mode 100644
index 0000000000000000000000000000000000000000..15242549a498254ee78c6b00d390ee9d9848cbb8
--- /dev/null
+++ b/comfyui_controlnet_aux/install.bat
@@ -0,0 +1,20 @@
+@echo off
+
+set "requirements_txt=%~dp0\requirements.txt"
+set "python_exec=..\..\..\python_embeded\python.exe"
+
+echo Installing ComfyUI's ControlNet Auxiliary Preprocessors..
+
+if exist "%python_exec%" (
+ echo Installing with ComfyUI Portable
+ for /f "delims=" %%i in (%requirements_txt%) do (
+ %python_exec% -s -m pip install "%%i"
+ )
+) else (
+ echo Installing with system Python
+ for /f "delims=" %%i in (%requirements_txt%) do (
+ pip install "%%i"
+ )
+)
+
+pause
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/log.py b/comfyui_controlnet_aux/log.py
new file mode 100644
index 0000000000000000000000000000000000000000..19e51e070a165c2fdde7998b1e4f0e9ce11757d3
--- /dev/null
+++ b/comfyui_controlnet_aux/log.py
@@ -0,0 +1,80 @@
+# Credit: https://github.com/melMass/comfy_mtb/blob/main/log.py
+import logging
+import re
+import os
+
+base_log_level = logging.INFO
+
+
+# Custom object that discards the output
+class NullWriter:
+ def write(self, text):
+ pass
+
+
+class Formatter(logging.Formatter):
+ grey = "\x1b[38;20m"
+ cyan = "\x1b[36;20m"
+ purple = "\x1b[35;20m"
+ yellow = "\x1b[33;20m"
+ red = "\x1b[31;20m"
+ bold_red = "\x1b[31;1m"
+ reset = "\x1b[0m"
+ # format = "%(asctime)s - [%(name)s] - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"
+ format = "[%(name)s] | %(levelname)s -> %(message)s"
+
+ FORMATS = {
+ logging.DEBUG: purple + format + reset,
+ logging.INFO: cyan + format + reset,
+ logging.WARNING: yellow + format + reset,
+ logging.ERROR: red + format + reset,
+ logging.CRITICAL: bold_red + format + reset,
+ }
+
+ def format(self, record):
+ log_fmt = self.FORMATS.get(record.levelno)
+ formatter = logging.Formatter(log_fmt)
+ return formatter.format(record)
+
+
+def mklog(name, level=base_log_level):
+ logger = logging.getLogger(name)
+ logger.setLevel(level)
+
+ for handler in logger.handlers:
+ logger.removeHandler(handler)
+
+ ch = logging.StreamHandler()
+ ch.setLevel(level)
+ ch.setFormatter(Formatter())
+ logger.addHandler(ch)
+
+ # Disable log propagation
+ logger.propagate = False
+
+ return logger
+
+
+# - The main app logger
+log = mklog(__package__, base_log_level)
+
+
+def log_user(arg):
+ print("\033[34mComfyUI ControlNet AUX:\033[0m {arg}")
+
+
+def get_summary(docstring):
+ return docstring.strip().split("\n\n", 1)[0]
+
+
+def blue_text(text):
+ return f"\033[94m{text}\033[0m"
+
+
+def cyan_text(text):
+ return f"\033[96m{text}\033[0m"
+
+
+def get_label(label):
+ words = re.findall(r"(?:^|[A-Z])[a-z]*", label)
+ return " ".join(words).strip()
\ No newline at end of file
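A minimal usage sketch for the logger above; the import path is an assumption and depends on how this package ends up on sys.path.

```python
# Sketch: each level is rendered with its own ANSI color via Formatter.FORMATS.
import logging
from comfyui_controlnet_aux.log import mklog   # import path assumed

logger = mklog("my_preprocessor", logging.DEBUG)
logger.debug("purple")
logger.info("cyan")
logger.warning("yellow")
logger.error("red")
```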
diff --git a/comfyui_controlnet_aux/lvminthin.py b/comfyui_controlnet_aux/lvminthin.py
new file mode 100644
index 0000000000000000000000000000000000000000..f512a46bdce9ed461d1e1d61331f9819053e4edc
--- /dev/null
+++ b/comfyui_controlnet_aux/lvminthin.py
@@ -0,0 +1,87 @@
+# High Quality Edge Thinning using Pure Python
+# Written by Lvmin Zhang
+# 2023 April
+# Stanford University
+# If you use this, please Cite "High Quality Edge Thinning using Pure Python", Lvmin Zhang, In Mikubill/sd-webui-controlnet.
+
+
+import cv2
+import numpy as np
+
+
+lvmin_kernels_raw = [
+ np.array([
+ [-1, -1, -1],
+ [0, 1, 0],
+ [1, 1, 1]
+ ], dtype=np.int32),
+ np.array([
+ [0, -1, -1],
+ [1, 1, -1],
+ [0, 1, 0]
+ ], dtype=np.int32)
+]
+
+lvmin_kernels = []
+lvmin_kernels += [np.rot90(x, k=0, axes=(0, 1)) for x in lvmin_kernels_raw]
+lvmin_kernels += [np.rot90(x, k=1, axes=(0, 1)) for x in lvmin_kernels_raw]
+lvmin_kernels += [np.rot90(x, k=2, axes=(0, 1)) for x in lvmin_kernels_raw]
+lvmin_kernels += [np.rot90(x, k=3, axes=(0, 1)) for x in lvmin_kernels_raw]
+
+lvmin_prunings_raw = [
+ np.array([
+ [-1, -1, -1],
+ [-1, 1, -1],
+ [0, 0, -1]
+ ], dtype=np.int32),
+ np.array([
+ [-1, -1, -1],
+ [-1, 1, -1],
+ [-1, 0, 0]
+ ], dtype=np.int32)
+]
+
+lvmin_prunings = []
+lvmin_prunings += [np.rot90(x, k=0, axes=(0, 1)) for x in lvmin_prunings_raw]
+lvmin_prunings += [np.rot90(x, k=1, axes=(0, 1)) for x in lvmin_prunings_raw]
+lvmin_prunings += [np.rot90(x, k=2, axes=(0, 1)) for x in lvmin_prunings_raw]
+lvmin_prunings += [np.rot90(x, k=3, axes=(0, 1)) for x in lvmin_prunings_raw]
+
+
+def remove_pattern(x, kernel):
+ objects = cv2.morphologyEx(x, cv2.MORPH_HITMISS, kernel)
+ objects = np.where(objects > 127)
+ x[objects] = 0
+ return x, objects[0].shape[0] > 0
+
+
+def thin_one_time(x, kernels):
+ y = x
+ is_done = True
+ for k in kernels:
+ y, has_update = remove_pattern(y, k)
+ if has_update:
+ is_done = False
+ return y, is_done
+
+
+def lvmin_thin(x, prunings=True):
+ y = x
+ for i in range(32):
+ y, is_done = thin_one_time(y, lvmin_kernels)
+ if is_done:
+ break
+ if prunings:
+ y, _ = thin_one_time(y, lvmin_prunings)
+ return y
+
+
+def nake_nms(x):
+ f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
+ f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
+ f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
+ f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
+ y = np.zeros_like(x)
+ for f in [f1, f2, f3, f4]:
+ np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
+ return y
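As a rough usage sketch, the helpers above are meant to be chained on an 8-bit binary edge map, much like the `is_binary` branch of `high_quality_resize` does; the input below is synthetic and the import path is assumed.

```python
import cv2
import numpy as np
from comfyui_controlnet_aux.lvminthin import nake_nms, lvmin_thin  # path assumed

edges = np.zeros((64, 64), dtype=np.uint8)
cv2.line(edges, (5, 5), (60, 60), 255, thickness=3)   # a deliberately fat 3px edge

y = nake_nms(edges)                                    # keep directional ridge pixels
_, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
y = lvmin_thin(y, prunings=True)                       # hit-or-miss thinning, at most 32 passes
print((edges > 127).sum(), "->", (y > 127).sum())      # pixel count drops as the edge thins
```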
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/anime_face_segment.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/anime_face_segment.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5b0d94ef5a2e5bdc0576c9bdef7d00b737ad72d
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/anime_face_segment.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/anyline.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/anyline.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d13c2714198e1317e00471e6cbefbe5de5c660c
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/anyline.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/binary.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/binary.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..933b4b314f2111144be85658f3fa3a68009cd785
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/binary.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/canny.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/canny.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c0333cc60d6cbd86f5cc72a463ce8be0d51a95b
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/canny.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/color.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/color.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3258317d998743c2f2ab4e16f6eb2b598a187cb
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/color.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/densepose.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/densepose.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b61b8262cb525d26837b9ea69e965707a1e9a169
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/densepose.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/depth_anything.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/depth_anything.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef01f8919f2269d6b7738f9afcb770f043a23666
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/depth_anything.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/depth_anything_v2.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/depth_anything_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd95e08ddd03d06a41a6ece9f3345d64ff1f9de5
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/depth_anything_v2.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/diffusion_edge.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/diffusion_edge.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d866c6980146cfaaadee67aa08754bcc498281ec
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/diffusion_edge.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/dsine.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/dsine.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69242a935be2a6e66ad40bbaa664c83bd18acf88
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/dsine.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/dwpose.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/dwpose.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..695c0dcdac43bf2ec2f197b7c75e01e51ab454c6
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/dwpose.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/hed.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/hed.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c15baf22a99463bfef4969af0bb3dd0558b8fa18
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/hed.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/inpaint.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/inpaint.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7eea9c378d50ea7a0d5436b303595f08314ea3b
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/inpaint.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/leres.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/leres.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2b4277d36df8e1739d0321102662f16a8e3a418
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/leres.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..919b85b3a06e7c270a1b5abd45ee9a022642ccfe
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart_anime.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart_anime.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66654aaa9025a6cfca65b81cdc1ba7ffb6eeeff5
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart_anime.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart_standard.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart_standard.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26d7caa0337bb9b491525a3ea38b5c3b503ab647
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/lineart_standard.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/manga_line.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/manga_line.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bfca805601073f059784d33f2cee50dc70a2ac83
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/manga_line.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/mediapipe_face.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/mediapipe_face.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21241574cbe336e7b2d2213926d14e3a34939045
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/mediapipe_face.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/mesh_graphormer.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/mesh_graphormer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2517358667c7a587058f2b6cdf5b7e57c49c195e
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/mesh_graphormer.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/metric3d.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/metric3d.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80193ba4f3f5196fff5ca485c04e916fc3901ef6
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/metric3d.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/midas.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/midas.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cccc58b67035d0f2b014feb1a042a1dc8d55167
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/midas.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/mlsd.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/mlsd.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a2cb33941a851b5c8dd4fcbf558fb059304209c
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/mlsd.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/normalbae.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/normalbae.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee6d96d991467cf70ec8fc4d2640fa4d42974398
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/normalbae.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/oneformer.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/oneformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4861f3a1714a70ca27ed74b7669d0eb909422775
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/oneformer.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/openpose.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/openpose.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a385d6097c69886ff0cd18225d0cc038a8e3693a
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/openpose.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/pidinet.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/pidinet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2eae318d4ccc5eeb3375be8861ad09022e97f5c4
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/pidinet.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/pose_keypoint_postprocess.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/pose_keypoint_postprocess.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11b8e33c9b301917115abc89256694ea69f67a09
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/pose_keypoint_postprocess.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/pyracanny.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/pyracanny.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5577ecd41991538e67f587c7e2a5319974890ee
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/pyracanny.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/recolor.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/recolor.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..beca3ee562ab5f55f496697e81dbe88fcf05a2cf
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/recolor.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/scribble.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/scribble.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac3a5c60e54bd526f593f7b933a342eeb6cb7b22
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/scribble.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/segment_anything.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/segment_anything.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88505c30160062718f5a98cb9be5b1ddbecdc0a1
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/segment_anything.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/shuffle.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/shuffle.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5291577de89097f5899cf7668a976dea2e1bde1a
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/shuffle.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/teed.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/teed.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09f6a489ddfb5625599f4585473b624e6495b731
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/teed.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/tile.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/tile.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cdd47d5532fef46a43905e6236ee416506b1a6f
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/tile.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/uniformer.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/uniformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19747f0b776b5cac7bd0795476dda0ccc55ebe55
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/uniformer.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/unimatch.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/unimatch.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51444d1ddac88317923dab2a8e44fecd2b13e5c7
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/unimatch.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/__pycache__/zoe.cpython-312.pyc b/comfyui_controlnet_aux/node_wrappers/__pycache__/zoe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6960af307390794af2fa96bf9059cf9c85c7995
Binary files /dev/null and b/comfyui_controlnet_aux/node_wrappers/__pycache__/zoe.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/node_wrappers/anime_face_segment.py b/comfyui_controlnet_aux/node_wrappers/anime_face_segment.py
new file mode 100644
index 0000000000000000000000000000000000000000..869548d1ae47fec4bb914e10422011c8ec5d3390
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/anime_face_segment.py
@@ -0,0 +1,43 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+import torch
+from einops import rearrange
+
+class AnimeFace_SemSegPreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ #This preprocessor is only trained on 512x resolution
+ #https://github.com/siyeong0/Anime-Face-Segmentation/blob/main/predict.py#L25
+ return define_preprocessor_inputs(
+ remove_background_using_abg=INPUT.BOOLEAN(True),
+ resolution=INPUT.RESOLUTION(default=512, min=512, max=512)
+ )
+
+ RETURN_TYPES = ("IMAGE", "MASK")
+ RETURN_NAMES = ("IMAGE", "ABG_CHARACTER_MASK (MASK)")
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Semantic Segmentation"
+
+ def execute(self, image, remove_background_using_abg=True, resolution=512, **kwargs):
+ from custom_controlnet_aux.anime_face_segment import AnimeFaceSegmentor
+
+ model = AnimeFaceSegmentor.from_pretrained().to(model_management.get_torch_device())
+ if remove_background_using_abg:
+ out_image_with_mask = common_annotator_call(model, image, resolution=resolution, remove_background=True)
+ out_image = out_image_with_mask[..., :3]
+ mask = out_image_with_mask[..., 3:]
+ mask = rearrange(mask, "n h w c -> n c h w")
+ else:
+ out_image = common_annotator_call(model, image, resolution=resolution, remove_background=False)
+ N, H, W, C = out_image.shape
+ mask = torch.ones(N, C, H, W)
+ del model
+ return (out_image, mask)
+
+NODE_CLASS_MAPPINGS = {
+ "AnimeFace_SemSegPreprocessor": AnimeFace_SemSegPreprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "AnimeFace_SemSegPreprocessor": "Anime Face Segmentor"
+}
\ No newline at end of file
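All of the node wrappers that follow repeat the same ComfyUI node contract (INPUT_TYPES, RETURN_TYPES, FUNCTION, CATEGORY, plus the two mapping dicts). The stripped-down sketch below shows just that contract; the class and node names are invented for illustration and are not part of this package.

```python
# Minimal ComfyUI node skeleton (illustrative names only).
class Invert_Preprocessor_Example:
    @classmethod
    def INPUT_TYPES(s):
        # "required" maps widget names to (type, options) tuples
        return {"required": {"image": ("IMAGE",)}}

    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "execute"                  # name of the method ComfyUI calls
    CATEGORY = "ControlNet Preprocessors/Examples"

    def execute(self, image):
        return (1.0 - image,)             # IMAGE is an N x H x W x C float tensor in [0, 1]

# ComfyUI discovers nodes through these two module-level dicts.
NODE_CLASS_MAPPINGS = {"Invert_Preprocessor_Example": Invert_Preprocessor_Example}
NODE_DISPLAY_NAME_MAPPINGS = {"Invert_Preprocessor_Example": "Invert (example)"}
```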
diff --git a/comfyui_controlnet_aux/node_wrappers/anyline.py b/comfyui_controlnet_aux/node_wrappers/anyline.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4b008a390b307a2e2826e3c1e7d5e6c62051d46
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/anyline.py
@@ -0,0 +1,87 @@
+import torch
+import numpy as np
+import comfy.model_management as model_management
+import comfy.utils
+
+# Requires comfyui_controlnet_aux functions and classes
+from ..utils import common_annotator_call, INPUT, define_preprocessor_inputs
+
+def get_intensity_mask(image_array, lower_bound, upper_bound):
+ mask = image_array[:, :, 0]
+ mask = np.where((mask >= lower_bound) & (mask <= upper_bound), mask, 0)
+ mask = np.expand_dims(mask, 2).repeat(3, axis=2)
+ return mask
+
+def combine_layers(base_layer, top_layer):
+ mask = top_layer.astype(bool)
+ temp = 1 - (1 - top_layer) * (1 - base_layer)
+ result = base_layer * (~mask) + temp * mask
+ return result
+
+class AnyLinePreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ merge_with_lineart=INPUT.COMBO(["lineart_standard", "lineart_realisitic", "lineart_anime", "manga_line"], default="lineart_standard"),
+ resolution=INPUT.RESOLUTION(default=1280, step=8),
+ lineart_lower_bound=INPUT.FLOAT(default=0),
+ lineart_upper_bound=INPUT.FLOAT(default=1),
+ object_min_size=INPUT.INT(default=36, min=1),
+ object_connectivity=INPUT.INT(default=1, min=1)
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ RETURN_NAMES = ("image",)
+
+ FUNCTION = "get_anyline"
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def __init__(self):
+ self.device = model_management.get_torch_device()
+
+ def get_anyline(self, image, merge_with_lineart="lineart_standard", resolution=512, lineart_lower_bound=0, lineart_upper_bound=1, object_min_size=36, object_connectivity=1):
+ from custom_controlnet_aux.teed import TEDDetector
+ from skimage import morphology
+ pbar = comfy.utils.ProgressBar(3)
+
+ # Process the image with MTEED model
+ mteed_model = TEDDetector.from_pretrained("TheMistoAI/MistoLine", "MTEED.pth", subfolder="Anyline").to(self.device)
+ mteed_result = common_annotator_call(mteed_model, image, resolution=resolution, show_pbar=False)
+ mteed_result = mteed_result.numpy()
+ del mteed_model
+ pbar.update(1)
+
+ # Process the image with the lineart standard preprocessor
+ if merge_with_lineart == "lineart_standard":
+ from custom_controlnet_aux.lineart_standard import LineartStandardDetector
+ lineart_standard_detector = LineartStandardDetector()
+ lineart_result = common_annotator_call(lineart_standard_detector, image, guassian_sigma=2, intensity_threshold=3, resolution=resolution, show_pbar=False).numpy()
+ del lineart_standard_detector
+ else:
+ from custom_controlnet_aux.lineart import LineartDetector
+ from custom_controlnet_aux.lineart_anime import LineartAnimeDetector
+ from custom_controlnet_aux.manga_line import LineartMangaDetector
+ lineart_detector = dict(lineart_realisitic=LineartDetector, lineart_anime=LineartAnimeDetector, manga_line=LineartMangaDetector)[merge_with_lineart]
+ lineart_detector = lineart_detector.from_pretrained().to(self.device)
+ lineart_result = common_annotator_call(lineart_detector, image, resolution=resolution, show_pbar=False).numpy()
+ del lineart_detector
+ pbar.update(1)
+
+ final_result = []
+ for i in range(len(image)):
+ _lineart_result = get_intensity_mask(lineart_result[i], lower_bound=lineart_lower_bound, upper_bound=lineart_upper_bound)
+ _cleaned = morphology.remove_small_objects(_lineart_result.astype(bool), min_size=object_min_size, connectivity=object_connectivity)
+ _lineart_result = _lineart_result * _cleaned
+ _mteed_result = mteed_result[i]
+
+ # Combine the results
+ final_result.append(torch.from_numpy(combine_layers(_mteed_result, _lineart_result)))
+ pbar.update(1)
+ return (torch.stack(final_result),)
+
+NODE_CLASS_MAPPINGS = {
+ "AnyLineArtPreprocessor_aux": AnyLinePreprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "AnyLineArtPreprocessor_aux": "AnyLine Lineart"
+}
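A tiny numeric sketch of the blend `get_anyline` performs: wherever the masked lineart layer has ink, it is screen-blended over the MTEED result; elsewhere the MTEED result passes through unchanged. The pixel values below are made up.

```python
import numpy as np

base = np.array([[[0.2, 0.2, 0.2]]])    # MTEED pixel
top  = np.array([[[0.5, 0.5, 0.5]]])    # masked lineart pixel
mask = top.astype(bool)
screen = 1 - (1 - top) * (1 - base)     # 1 - 0.5 * 0.8 = 0.6
out = base * (~mask) + screen * mask    # same formula as combine_layers
print(out)                              # [[[0.6 0.6 0.6]]]
```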
diff --git a/comfyui_controlnet_aux/node_wrappers/binary.py b/comfyui_controlnet_aux/node_wrappers/binary.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a2ee277fe7c807f72f873b8b29d45a91fe8a458
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/binary.py
@@ -0,0 +1,29 @@
+from ..utils import common_annotator_call, INPUT, define_preprocessor_inputs
+import comfy.model_management as model_management
+
+class Binary_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ bin_threshold=INPUT.INT(default=100, max=255),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, bin_threshold=100, resolution=512, **kwargs):
+ from custom_controlnet_aux.binary import BinaryDetector
+
+ return (common_annotator_call(BinaryDetector(), image, bin_threshold=bin_threshold, resolution=resolution), )
+
+
+
+NODE_CLASS_MAPPINGS = {
+ "BinaryPreprocessor": Binary_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "BinaryPreprocessor": "Binary Lines"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/canny.py b/comfyui_controlnet_aux/node_wrappers/canny.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbeeb63a5c0d26d40291cbf5db949e3c395c203
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/canny.py
@@ -0,0 +1,30 @@
+from ..utils import common_annotator_call, INPUT, define_preprocessor_inputs
+import comfy.model_management as model_management
+
+class Canny_Edge_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ low_threshold=INPUT.INT(default=100, max=255),
+ high_threshold=INPUT.INT(default=200, max=255),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, low_threshold=100, high_threshold=200, resolution=512, **kwargs):
+ from custom_controlnet_aux.canny import CannyDetector
+
+ return (common_annotator_call(CannyDetector(), image, low_threshold=low_threshold, high_threshold=high_threshold, resolution=resolution), )
+
+
+
+NODE_CLASS_MAPPINGS = {
+ "CannyEdgePreprocessor": Canny_Edge_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "CannyEdgePreprocessor": "Canny Edge"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/color.py b/comfyui_controlnet_aux/node_wrappers/color.py
new file mode 100644
index 0000000000000000000000000000000000000000..35f887959ac94aa2ec7ee7f3e908bc1c1e7bf046
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/color.py
@@ -0,0 +1,26 @@
+from ..utils import common_annotator_call, INPUT, define_preprocessor_inputs
+import comfy.model_management as model_management
+
+class Color_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/T2IAdapter-only"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.color import ColorDetector
+
+ return (common_annotator_call(ColorDetector(), image, resolution=resolution), )
+
+
+
+NODE_CLASS_MAPPINGS = {
+ "ColorPreprocessor": Color_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "ColorPreprocessor": "Color Pallete"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/densepose.py b/comfyui_controlnet_aux/node_wrappers/densepose.py
new file mode 100644
index 0000000000000000000000000000000000000000..d13bac907d220abc24339a975c92cc0a161f7989
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/densepose.py
@@ -0,0 +1,31 @@
+from ..utils import common_annotator_call, INPUT, define_preprocessor_inputs
+import comfy.model_management as model_management
+
+class DensePose_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ model=INPUT.COMBO(["densepose_r50_fpn_dl.torchscript", "densepose_r101_fpn_dl.torchscript"]),
+ cmap=INPUT.COMBO(["Viridis (MagicAnimate)", "Parula (CivitAI)"]),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Faces and Poses Estimators"
+
+ def execute(self, image, model="densepose_r50_fpn_dl.torchscript", cmap="Viridis (MagicAnimate)", resolution=512):
+ from custom_controlnet_aux.densepose import DenseposeDetector
+ model = DenseposeDetector \
+ .from_pretrained(filename=model) \
+ .to(model_management.get_torch_device())
+ return (common_annotator_call(model, image, cmap="viridis" if "Viridis" in cmap else "parula", resolution=resolution), )
+
+
+NODE_CLASS_MAPPINGS = {
+ "DensePosePreprocessor": DensePose_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "DensePosePreprocessor": "DensePose Estimator"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/depth_anything.py b/comfyui_controlnet_aux/node_wrappers/depth_anything.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6a87eca2d419fdf219d323a0d3459616d4fa316
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/depth_anything.py
@@ -0,0 +1,55 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class Depth_Anything_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ ckpt_name=INPUT.COMBO(
+ ["depth_anything_vitl14.pth", "depth_anything_vitb14.pth", "depth_anything_vits14.pth"]
+ ),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, ckpt_name="depth_anything_vitl14.pth", resolution=512, **kwargs):
+ from custom_controlnet_aux.depth_anything import DepthAnythingDetector
+
+ model = DepthAnythingDetector.from_pretrained(filename=ckpt_name).to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out, )
+
+class Zoe_Depth_Anything_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ environment=INPUT.COMBO(["indoor", "outdoor"]),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, environment="indoor", resolution=512, **kwargs):
+ from custom_controlnet_aux.zoe import ZoeDepthAnythingDetector
+ ckpt_name = "depth_anything_metric_depth_indoor.pt" if environment == "indoor" else "depth_anything_metric_depth_outdoor.pt"
+ model = ZoeDepthAnythingDetector.from_pretrained(filename=ckpt_name).to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "DepthAnythingPreprocessor": Depth_Anything_Preprocessor,
+ "Zoe_DepthAnythingPreprocessor": Zoe_Depth_Anything_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "DepthAnythingPreprocessor": "Depth Anything",
+ "Zoe_DepthAnythingPreprocessor": "Zoe Depth Anything"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/depth_anything_v2.py b/comfyui_controlnet_aux/node_wrappers/depth_anything_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e6a65905d27ecb0e5b2a9f7bd7846c39f55b107
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/depth_anything_v2.py
@@ -0,0 +1,56 @@
+from ..utils import common_annotator_call, INPUT, define_preprocessor_inputs
+import comfy.model_management as model_management
+
+class Depth_Anything_V2_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ ckpt_name=INPUT.COMBO(
+ ["depth_anything_v2_vitg.pth", "depth_anything_v2_vitl.pth", "depth_anything_v2_vitb.pth", "depth_anything_v2_vits.pth"],
+ default="depth_anything_v2_vitl.pth"
+ ),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, ckpt_name="depth_anything_v2_vitl.pth", resolution=512, **kwargs):
+ from custom_controlnet_aux.depth_anything_v2 import DepthAnythingV2Detector
+
+ model = DepthAnythingV2Detector.from_pretrained(filename=ckpt_name).to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, max_depth=1)
+ del model
+ return (out, )
+
+""" class Depth_Anything_Metric_V2_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return create_node_input_types(
+ environment=(["indoor", "outdoor"], {"default": "indoor"}),
+ max_depth=("FLOAT", {"min": 0, "max": 100, "default": 20.0, "step": 0.01})
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, environment, resolution=512, max_depth=20.0, **kwargs):
+ from custom_controlnet_aux.depth_anything_v2 import DepthAnythingV2Detector
+ filename = dict(indoor="depth_anything_v2_metric_hypersim_vitl.pth", outdoor="depth_anything_v2_metric_vkitti_vitl.pth")[environment]
+ model = DepthAnythingV2Detector.from_pretrained(filename=filename).to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, max_depth=max_depth)
+ del model
+ return (out, ) """
+
+NODE_CLASS_MAPPINGS = {
+ "DepthAnythingV2Preprocessor": Depth_Anything_V2_Preprocessor,
+ #"Metric_DepthAnythingV2Preprocessor": Depth_Anything_Metric_V2_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "DepthAnythingV2Preprocessor": "Depth Anything V2 - Relative",
+ #"Metric_DepthAnythingV2Preprocessor": "Depth Anything V2 - Metric"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/diffusion_edge.py b/comfyui_controlnet_aux/node_wrappers/diffusion_edge.py
new file mode 100644
index 0000000000000000000000000000000000000000..3026083b288d42becee7435c628bb26d711db406
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/diffusion_edge.py
@@ -0,0 +1,41 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT, run_script
+import comfy.model_management as model_management
+import sys
+
+def install_deps():
+ try:
+ import sklearn
+ except:
+ run_script([sys.executable, '-s', '-m', 'pip', 'install', 'scikit-learn'])
+
+class DiffusionEdge_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ environment=INPUT.COMBO(["indoor", "urban", "natrual"]),
+ patch_batch_size=INPUT.INT(default=4, min=1, max=16),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, environment="indoor", patch_batch_size=4, resolution=512, **kwargs):
+ install_deps()
+ from custom_controlnet_aux.diffusion_edge import DiffusionEdgeDetector
+
+ model = DiffusionEdgeDetector \
+ .from_pretrained(filename = f"diffusion_edge_{environment}.pt") \
+ .to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, patch_batch_size=patch_batch_size)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "DiffusionEdge_Preprocessor": DiffusionEdge_Preprocessor,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "DiffusionEdge_Preprocessor": "Diffusion Edge (batch size ↑ => speed ↑, VRAM ↑)",
+}
\ No newline at end of file
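The `install_deps` pattern above (import on demand, pip-install into the running interpreter only when the import fails) generalizes to the small helper sketched below; `ensure` is an illustrative name, not part of this package.

```python
# Sketch of the lazy-install pattern used by install_deps() above.
import importlib
import subprocess
import sys

def ensure(module, pip_name=None):
    try:
        return importlib.import_module(module)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or module])
        return importlib.import_module(module)

# e.g. sklearn = ensure("sklearn", "scikit-learn")
```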
diff --git a/comfyui_controlnet_aux/node_wrappers/dsine.py b/comfyui_controlnet_aux/node_wrappers/dsine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c1b04fe2d1bcedfb71164fcc9cd7e8edf7f693
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/dsine.py
@@ -0,0 +1,31 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class DSINE_Normal_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ fov=INPUT.FLOAT(max=365.0, default=60.0),
+ iterations=INPUT.INT(min=1, max=20, default=5),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, fov=60.0, iterations=5, resolution=512, **kwargs):
+ from custom_controlnet_aux.dsine import DsineDetector
+
+ model = DsineDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, fov=fov, iterations=iterations, resolution=resolution)
+ del model
+ return (out,)
+
+NODE_CLASS_MAPPINGS = {
+ "DSINE-NormalMapPreprocessor": DSINE_Normal_Map_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "DSINE-NormalMapPreprocessor": "DSINE Normal Map"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/dwpose.py b/comfyui_controlnet_aux/node_wrappers/dwpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b7c06938bb722ed026e0b39259cf40e9f8b7cac
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/dwpose.py
@@ -0,0 +1,162 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+import numpy as np
+import warnings
+from custom_controlnet_aux.dwpose import DwposeDetector, AnimalposeDetector
+import os
+import json
+
+DWPOSE_MODEL_NAME = "yzd-v/DWPose"
+#Trigger startup caching for onnxruntime
+GPU_PROVIDERS = ["CUDAExecutionProvider", "DirectMLExecutionProvider", "OpenVINOExecutionProvider", "ROCMExecutionProvider", "CoreMLExecutionProvider"]
+def check_ort_gpu():
+ try:
+ import onnxruntime as ort
+ for provider in GPU_PROVIDERS:
+ if provider in ort.get_available_providers():
+ return True
+ return False
+ except:
+ return False
+
+if not os.environ.get("DWPOSE_ONNXRT_CHECKED"):
+ if check_ort_gpu():
+ print("DWPose: Onnxruntime with acceleration providers detected")
+ else:
+ warnings.warn("DWPose: Onnxruntime not found or doesn't come with acceleration providers, switch to OpenCV with CPU device. DWPose might run very slowly")
+ os.environ['AUX_ORT_PROVIDERS'] = ''
+ os.environ["DWPOSE_ONNXRT_CHECKED"] = '1'
+
+class DWPose_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ detect_hand=INPUT.COMBO(["enable", "disable"]),
+ detect_body=INPUT.COMBO(["enable", "disable"]),
+ detect_face=INPUT.COMBO(["enable", "disable"]),
+ resolution=INPUT.RESOLUTION(),
+ bbox_detector=INPUT.COMBO(
+ ["yolox_l.torchscript.pt", "yolox_l.onnx", "yolo_nas_l_fp16.onnx", "yolo_nas_m_fp16.onnx", "yolo_nas_s_fp16.onnx"],
+ default="yolox_l.onnx"
+ ),
+ pose_estimator=INPUT.COMBO(
+ ["dw-ll_ucoco_384_bs5.torchscript.pt", "dw-ll_ucoco_384.onnx", "dw-ll_ucoco.onnx"],
+ default="dw-ll_ucoco_384_bs5.torchscript.pt"
+ ),
+ scale_stick_for_xinsr_cn=INPUT.COMBO(["disable", "enable"])
+ )
+
+ RETURN_TYPES = ("IMAGE", "POSE_KEYPOINT")
+ FUNCTION = "estimate_pose"
+
+ CATEGORY = "ControlNet Preprocessors/Faces and Poses Estimators"
+
+ def estimate_pose(self, image, detect_hand="enable", detect_body="enable", detect_face="enable", resolution=512, bbox_detector="yolox_l.onnx", pose_estimator="dw-ll_ucoco_384.onnx", scale_stick_for_xinsr_cn="disable", **kwargs):
+ if bbox_detector == "yolox_l.onnx":
+ yolo_repo = DWPOSE_MODEL_NAME
+ elif "yolox" in bbox_detector:
+ yolo_repo = "hr16/yolox-onnx"
+ elif "yolo_nas" in bbox_detector:
+ yolo_repo = "hr16/yolo-nas-fp16"
+ else:
+ raise NotImplementedError(f"Download mechanism for {bbox_detector}")
+
+ if pose_estimator == "dw-ll_ucoco_384.onnx":
+ pose_repo = DWPOSE_MODEL_NAME
+ elif pose_estimator.endswith(".onnx"):
+ pose_repo = "hr16/UnJIT-DWPose"
+ elif pose_estimator.endswith(".torchscript.pt"):
+ pose_repo = "hr16/DWPose-TorchScript-BatchSize5"
+ else:
+ raise NotImplementedError(f"Download mechanism for {pose_estimator}")
+
+ model = DwposeDetector.from_pretrained(
+ pose_repo,
+ yolo_repo,
+ det_filename=bbox_detector, pose_filename=pose_estimator,
+ torchscript_device=model_management.get_torch_device()
+ )
+ detect_hand = detect_hand == "enable"
+ detect_body = detect_body == "enable"
+ detect_face = detect_face == "enable"
+ scale_stick_for_xinsr_cn = scale_stick_for_xinsr_cn == "enable"
+ self.openpose_dicts = []
+ def func(image, **kwargs):
+ pose_img, openpose_dict = model(image, **kwargs)
+ self.openpose_dicts.append(openpose_dict)
+ return pose_img
+
+ out = common_annotator_call(func, image, include_hand=detect_hand, include_face=detect_face, include_body=detect_body, image_and_json=True, resolution=resolution, xinsr_stick_scaling=scale_stick_for_xinsr_cn)
+ del model
+ return {
+ 'ui': { "openpose_json": [json.dumps(self.openpose_dicts, indent=4)] },
+ "result": (out, self.openpose_dicts)
+ }
+
+class AnimalPose_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ bbox_detector = INPUT.COMBO(
+ ["yolox_l.torchscript.pt", "yolox_l.onnx", "yolo_nas_l_fp16.onnx", "yolo_nas_m_fp16.onnx", "yolo_nas_s_fp16.onnx"],
+ default="yolox_l.torchscript.pt"
+ ),
+ pose_estimator = INPUT.COMBO(
+ ["rtmpose-m_ap10k_256_bs5.torchscript.pt", "rtmpose-m_ap10k_256.onnx"],
+ default="rtmpose-m_ap10k_256_bs5.torchscript.pt"
+ ),
+ resolution = INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE", "POSE_KEYPOINT")
+ FUNCTION = "estimate_pose"
+
+ CATEGORY = "ControlNet Preprocessors/Faces and Poses Estimators"
+
+ def estimate_pose(self, image, resolution=512, bbox_detector="yolox_l.onnx", pose_estimator="rtmpose-m_ap10k_256.onnx", **kwargs):
+ if bbox_detector == "yolox_l.onnx":
+ yolo_repo = DWPOSE_MODEL_NAME
+ elif "yolox" in bbox_detector:
+ yolo_repo = "hr16/yolox-onnx"
+ elif "yolo_nas" in bbox_detector:
+ yolo_repo = "hr16/yolo-nas-fp16"
+ else:
+ raise NotImplementedError(f"Download mechanism for {bbox_detector}")
+
+ if pose_estimator == "dw-ll_ucoco_384.onnx":
+ pose_repo = DWPOSE_MODEL_NAME
+ elif pose_estimator.endswith(".onnx"):
+ pose_repo = "hr16/UnJIT-DWPose"
+ elif pose_estimator.endswith(".torchscript.pt"):
+ pose_repo = "hr16/DWPose-TorchScript-BatchSize5"
+ else:
+ raise NotImplementedError(f"Download mechanism for {pose_estimator}")
+
+ model = AnimalposeDetector.from_pretrained(
+ pose_repo,
+ yolo_repo,
+ det_filename=bbox_detector, pose_filename=pose_estimator,
+ torchscript_device=model_management.get_torch_device()
+ )
+
+ self.openpose_dicts = []
+ def func(image, **kwargs):
+ pose_img, openpose_dict = model(image, **kwargs)
+ self.openpose_dicts.append(openpose_dict)
+ return pose_img
+
+ out = common_annotator_call(func, image, image_and_json=True, resolution=resolution)
+ del model
+ return {
+ 'ui': { "openpose_json": [json.dumps(self.openpose_dicts, indent=4)] },
+ "result": (out, self.openpose_dicts)
+ }
+
+NODE_CLASS_MAPPINGS = {
+ "DWPreprocessor": DWPose_Preprocessor,
+ "AnimalPosePreprocessor": AnimalPose_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "DWPreprocessor": "DWPose Estimator",
+ "AnimalPosePreprocessor": "AnimalPose Estimator (AP10K)"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/hed.py b/comfyui_controlnet_aux/node_wrappers/hed.py
new file mode 100644
index 0000000000000000000000000000000000000000..e89ba460c05e67f7db4a9c1149383f927a5f95d5
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/hed.py
@@ -0,0 +1,53 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class HED_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ safe=INPUT.COMBO(["enable", "disable"]),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.hed import HEDdetector
+
+ model = HEDdetector.from_pretrained().to(model_management.get_torch_device())
+        out = common_annotator_call(model, image, resolution=resolution, safe=kwargs.get("safe", "enable") == "enable")
+ del model
+ return (out, )
+
+class Fake_Scribble_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ safe=INPUT.COMBO(["enable", "disable"]),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.hed import HEDdetector
+
+ model = HEDdetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, scribble=True, safe=kwargs["safe"]=="enable")
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "HEDPreprocessor": HED_Preprocessor,
+ "FakeScribblePreprocessor": Fake_Scribble_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "HEDPreprocessor": "HED Soft-Edge Lines",
+ "FakeScribblePreprocessor": "Fake Scribble Lines (aka scribble_hed)"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/inpaint.py b/comfyui_controlnet_aux/node_wrappers/inpaint.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f220650f1377efa2d18b9b2ca0d73c8886f7b55
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/inpaint.py
@@ -0,0 +1,32 @@
+import torch
+from ..utils import INPUT
+
+class InpaintPreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return dict(
+ required=dict(image=INPUT.IMAGE(), mask=INPUT.MASK()),
+ optional=dict(black_pixel_for_xinsir_cn=INPUT.BOOLEAN(False))
+ )
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "preprocess"
+
+ CATEGORY = "ControlNet Preprocessors/others"
+
+ def preprocess(self, image, mask, black_pixel_for_xinsir_cn=False):
+ mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(image.shape[1], image.shape[2]), mode="bilinear")
+ mask = mask.movedim(1,-1).expand((-1,-1,-1,3))
+ image = image.clone()
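+        # Regular inpaint ControlNets mark masked pixels with -1.0; the toggle switches to plain black (0.0), which xinsir union ControlNets reportedly expect.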
+ if black_pixel_for_xinsir_cn:
+ masked_pixel = 0.0
+ else:
+ masked_pixel = -1.0
+ image[mask > 0.5] = masked_pixel
+ return (image,)
+
+NODE_CLASS_MAPPINGS = {
+ "InpaintPreprocessor": InpaintPreprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "InpaintPreprocessor": "Inpaint Preprocessor"
+}
diff --git a/comfyui_controlnet_aux/node_wrappers/leres.py b/comfyui_controlnet_aux/node_wrappers/leres.py
new file mode 100644
index 0000000000000000000000000000000000000000..404f0f77b3c306721365e9f8ef0809ef501721d3
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/leres.py
@@ -0,0 +1,32 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class LERES_Depth_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ rm_nearest=INPUT.FLOAT(max=100.0),
+ rm_background=INPUT.FLOAT(max=100.0),
+ boost=INPUT.COMBO(["disable", "enable"]),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, rm_nearest=0, rm_background=0, resolution=512, boost="disable", **kwargs):
+ from custom_controlnet_aux.leres import LeresDetector
+
+ model = LeresDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, thr_a=rm_nearest, thr_b=rm_background, boost=boost == "enable")
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "LeReS-DepthMapPreprocessor": LERES_Depth_Map_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "LeReS-DepthMapPreprocessor": "LeReS Depth Map (enable boost for leres++)"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/lineart.py b/comfyui_controlnet_aux/node_wrappers/lineart.py
new file mode 100644
index 0000000000000000000000000000000000000000..174dca00b15a82e98f8561669a4f92c8a05803a5
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/lineart.py
@@ -0,0 +1,30 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class LineArt_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ coarse=INPUT.COMBO((["disable", "enable"])),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.lineart import LineartDetector
+
+ model = LineartDetector.from_pretrained().to(model_management.get_torch_device())
+        out = common_annotator_call(model, image, resolution=resolution, coarse=kwargs.get("coarse", "disable") == "enable")
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "LineArtPreprocessor": LineArt_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "LineArtPreprocessor": "Realistic Lineart"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/lineart_anime.py b/comfyui_controlnet_aux/node_wrappers/lineart_anime.py
new file mode 100644
index 0000000000000000000000000000000000000000..31b3cd5595a42ff5790d13dbb5065c1f8063af4f
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/lineart_anime.py
@@ -0,0 +1,27 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class AnimeLineArt_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.lineart_anime import LineartAnimeDetector
+
+ model = LineartAnimeDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "AnimeLineArtPreprocessor": AnimeLineArt_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "AnimeLineArtPreprocessor": "Anime Lineart"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/lineart_standard.py b/comfyui_controlnet_aux/node_wrappers/lineart_standard.py
new file mode 100644
index 0000000000000000000000000000000000000000..74afcd1857d8b9d4c157350ed33f9832a9c3fd94
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/lineart_standard.py
@@ -0,0 +1,27 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class Lineart_Standard_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ guassian_sigma=INPUT.FLOAT(default=6.0, max=100.0),
+ intensity_threshold=INPUT.INT(default=8, max=16),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, guassian_sigma=6, intensity_threshold=8, resolution=512, **kwargs):
+ from custom_controlnet_aux.lineart_standard import LineartStandardDetector
+ return (common_annotator_call(LineartStandardDetector(), image, guassian_sigma=guassian_sigma, intensity_threshold=intensity_threshold, resolution=resolution), )
+
+NODE_CLASS_MAPPINGS = {
+ "LineartStandardPreprocessor": Lineart_Standard_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "LineartStandardPreprocessor": "Standard Lineart"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/manga_line.py b/comfyui_controlnet_aux/node_wrappers/manga_line.py
new file mode 100644
index 0000000000000000000000000000000000000000..4635d05bbd8b785443bdee72a19d61cbedc7c7cb
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/manga_line.py
@@ -0,0 +1,27 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class Manga2Anime_LineArt_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.manga_line import LineartMangaDetector
+
+ model = LineartMangaDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "Manga2Anime_LineArt_Preprocessor": Manga2Anime_LineArt_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "Manga2Anime_LineArt_Preprocessor": "Manga Lineart (aka lineart_anime_denoise)"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/mediapipe_face.py b/comfyui_controlnet_aux/node_wrappers/mediapipe_face.py
new file mode 100644
index 0000000000000000000000000000000000000000..5676ef4df28d027b5280044fc3b7ff66af2f5fa5
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/mediapipe_face.py
@@ -0,0 +1,39 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT, run_script
+import comfy.model_management as model_management
+import os, sys
+import subprocess, threading
+
+def install_deps():
+ try:
+ import mediapipe
+ except ImportError:
+ run_script([sys.executable, '-s', '-m', 'pip', 'install', 'mediapipe'])
+ run_script([sys.executable, '-s', '-m', 'pip', 'install', '--upgrade', 'protobuf'])
+
+class Media_Pipe_Face_Mesh_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ max_faces=INPUT.INT(default=10, min=1, max=50), #Which image has more than 50 detectable faces?
+ min_confidence=INPUT.FLOAT(default=0.5, min=0.1),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "detect"
+
+ CATEGORY = "ControlNet Preprocessors/Faces and Poses Estimators"
+
+ def detect(self, image, max_faces=10, min_confidence=0.5, resolution=512):
+ #Ref: https://github.com/Fannovel16/comfy_controlnet_preprocessors/issues/70#issuecomment-1677967369
+ install_deps()
+ from custom_controlnet_aux.mediapipe_face import MediapipeFaceDetector
+ return (common_annotator_call(MediapipeFaceDetector(), image, max_faces=max_faces, min_confidence=min_confidence, resolution=resolution), )
+
+NODE_CLASS_MAPPINGS = {
+ "MediaPipe-FaceMeshPreprocessor": Media_Pipe_Face_Mesh_Preprocessor
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "MediaPipe-FaceMeshPreprocessor": "MediaPipe Face Mesh"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/mesh_graphormer.py b/comfyui_controlnet_aux/node_wrappers/mesh_graphormer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e36798a618ca4c73731f17901492e6eeec404230
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/mesh_graphormer.py
@@ -0,0 +1,158 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT, MAX_RESOLUTION, run_script
+import comfy.model_management as model_management
+import numpy as np
+import torch
+from einops import rearrange
+import os, sys
+import subprocess, threading
+import scipy.ndimage
+import cv2
+import torch.nn.functional as F
+
+def install_deps():
+ try:
+ import mediapipe
+ except ImportError:
+ run_script([sys.executable, '-s', '-m', 'pip', 'install', 'mediapipe'])
+ run_script([sys.executable, '-s', '-m', 'pip', 'install', '--upgrade', 'protobuf'])
+
+ try:
+ import trimesh
+ except ImportError:
+ run_script([sys.executable, '-s', '-m', 'pip', 'install', 'trimesh[easy]'])
+
+#Sauce: https://github.com/comfyanonymous/ComfyUI/blob/8c6493578b3dda233e9b9a953feeaf1e6ca434ad/comfy_extras/nodes_mask.py#L309
+def expand_mask(mask, expand, tapered_corners):
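+    #Grey-dilate (expand > 0) or grey-erode (expand < 0) each mask |expand| times with a 3x3 kernel; tapered_corners zeroes the corner taps.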
+ c = 0 if tapered_corners else 1
+ kernel = np.array([[c, 1, c],
+ [1, 1, 1],
+ [c, 1, c]])
+ mask = mask.reshape((-1, mask.shape[-2], mask.shape[-1]))
+ out = []
+ for m in mask:
+ output = m.numpy()
+ for _ in range(abs(expand)):
+ if expand < 0:
+ output = scipy.ndimage.grey_erosion(output, footprint=kernel)
+ else:
+ output = scipy.ndimage.grey_dilation(output, footprint=kernel)
+ output = torch.from_numpy(output)
+ out.append(output)
+ return torch.stack(out, dim=0)
+
+class Mesh_Graphormer_Depth_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ mask_bbox_padding=("INT", {"default": 30, "min": 0, "max": 100}),
+ resolution=INPUT.RESOLUTION(),
+ mask_type=INPUT.COMBO(["based_on_depth", "tight_bboxes", "original"]),
+ mask_expand=INPUT.INT(default=5, min=-MAX_RESOLUTION, max=MAX_RESOLUTION),
+ rand_seed=INPUT.INT(default=88, min=0, max=0xffffffffffffffff),
+ detect_thr=INPUT.FLOAT(default=0.6, min=0.1),
+ presence_thr=INPUT.FLOAT(default=0.6, min=0.1)
+ )
+
+ RETURN_TYPES = ("IMAGE", "MASK")
+ RETURN_NAMES = ("IMAGE", "INPAINTING_MASK")
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, mask_bbox_padding=30, mask_type="based_on_depth", mask_expand=5, resolution=512, rand_seed=88, detect_thr=0.6, presence_thr=0.6, **kwargs):
+ install_deps()
+ from custom_controlnet_aux.mesh_graphormer import MeshGraphormerDetector
+ model = kwargs["model"] if "model" in kwargs \
+ else MeshGraphormerDetector.from_pretrained(detect_thr=detect_thr, presence_thr=presence_thr).to(model_management.get_torch_device())
+
+ depth_map_list = []
+ mask_list = []
+ for single_image in image:
+ np_image = np.asarray(single_image.cpu() * 255., dtype=np.uint8)
+ depth_map, mask, info = model(np_image, output_type="np", detect_resolution=resolution, mask_bbox_padding=mask_bbox_padding, seed=rand_seed)
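+            # based_on_depth: mask every pixel with non-zero depth; tight_bboxes: fill the detected hand boxes; original: keep the detector's own mask.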
+ if mask_type == "based_on_depth":
+ H, W = mask.shape[:2]
+ mask = cv2.resize(depth_map.copy(), (W, H))
+ mask[mask > 0] = 255
+
+ elif mask_type == "tight_bboxes":
+ mask = np.zeros_like(mask)
+ hand_bboxes = (info or {}).get("abs_boxes") or []
+ for hand_bbox in hand_bboxes:
+ x_min, x_max, y_min, y_max = hand_bbox
+ mask[y_min:y_max+1, x_min:x_max+1, :] = 255 #HWC
+
+ mask = mask[:, :, :1]
+ depth_map_list.append(torch.from_numpy(depth_map.astype(np.float32) / 255.0))
+ mask_list.append(torch.from_numpy(mask.astype(np.float32) / 255.0))
+ depth_maps, masks = torch.stack(depth_map_list, dim=0), rearrange(torch.stack(mask_list, dim=0), "n h w 1 -> n 1 h w")
+ return depth_maps, expand_mask(masks, mask_expand, tapered_corners=True)
+
+def normalize_size_base_64(w, h):
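+    #Round the short side up to the nearest multiple of 64 (e.g. 500 -> 512; 512 stays 512).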
+ short_side = min(w, h)
+ remainder = short_side % 64
+ return short_side - remainder + (64 if remainder > 0 else 0)
+
+class Mesh_Graphormer_With_ImpactDetector_Depth_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ types = define_preprocessor_inputs(
+ # Impact pack
+ bbox_threshold=INPUT.FLOAT(default=0.5, min=0.1),
+ bbox_dilation=INPUT.INT(default=10, min=-512, max=512),
+ bbox_crop_factor=INPUT.FLOAT(default=3.0, min=1.0, max=10.0),
+ drop_size=INPUT.INT(default=10, min=1, max=MAX_RESOLUTION),
+ # Mesh Graphormer
+ mask_bbox_padding=INPUT.INT(default=30, min=0, max=100),
+ mask_type=INPUT.COMBO(["based_on_depth", "tight_bboxes", "original"]),
+ mask_expand=INPUT.INT(default=5, min=-MAX_RESOLUTION, max=MAX_RESOLUTION),
+ rand_seed=INPUT.INT(default=88, min=0, max=0xffffffffffffffff),
+ resolution=INPUT.RESOLUTION()
+ )
+ types["required"]["bbox_detector"] = ("BBOX_DETECTOR", )
+ return types
+
+ RETURN_TYPES = ("IMAGE", "MASK")
+ RETURN_NAMES = ("IMAGE", "INPAINTING_MASK")
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, bbox_detector, bbox_threshold=0.5, bbox_dilation=10, bbox_crop_factor=3.0, drop_size=10, resolution=512, **mesh_graphormer_kwargs):
+ install_deps()
+ from custom_controlnet_aux.mesh_graphormer import MeshGraphormerDetector
+ mesh_graphormer_node = Mesh_Graphormer_Depth_Map_Preprocessor()
+ model = MeshGraphormerDetector.from_pretrained(detect_thr=0.6, presence_thr=0.6).to(model_management.get_torch_device())
+ mesh_graphormer_kwargs["model"] = model
+
+ frames = image
+ depth_maps, masks = [], []
+ for idx in range(len(frames)):
+ frame = frames[idx:idx+1,...] #Impact Pack's BBOX_DETECTOR only supports single batch image
+ bbox_detector.setAux('face') # make default prompt as 'face' if empty prompt for CLIPSeg
+ _, segs = bbox_detector.detect(frame, bbox_threshold, bbox_dilation, bbox_crop_factor, drop_size)
+ bbox_detector.setAux(None)
+
+ n, h, w, _ = frame.shape
+ depth_map, mask = torch.zeros_like(frame), torch.zeros(n, 1, h, w)
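+            # Paste each hand crop's result back into full-frame buffers (depth_map is NHWC, mask is NCHW).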
+ for i, seg in enumerate(segs):
+ x1, y1, x2, y2 = seg.crop_region
+ cropped_image = frame[:, y1:y2, x1:x2, :] # Never use seg.cropped_image to handle overlapping area
+ mesh_graphormer_kwargs["resolution"] = 0 #Disable resizing
+ sub_depth_map, sub_mask = mesh_graphormer_node.execute(cropped_image, **mesh_graphormer_kwargs)
+ depth_map[:, y1:y2, x1:x2, :] = sub_depth_map
+ mask[:, :, y1:y2, x1:x2] = sub_mask
+
+ depth_maps.append(depth_map)
+ masks.append(mask)
+
+ return (torch.cat(depth_maps), torch.cat(masks))
+
+NODE_CLASS_MAPPINGS = {
+ "MeshGraphormer-DepthMapPreprocessor": Mesh_Graphormer_Depth_Map_Preprocessor,
+ "MeshGraphormer+ImpactDetector-DepthMapPreprocessor": Mesh_Graphormer_With_ImpactDetector_Depth_Map_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "MeshGraphormer-DepthMapPreprocessor": "MeshGraphormer Hand Refiner",
+ "MeshGraphormer+ImpactDetector-DepthMapPreprocessor": "MeshGraphormer Hand Refiner With External Detector"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/metric3d.py b/comfyui_controlnet_aux/node_wrappers/metric3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..af6711fd42403322d60df08675b5f16e8e346f10
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/metric3d.py
@@ -0,0 +1,57 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT, MAX_RESOLUTION
+import comfy.model_management as model_management
+
+class Metric3D_Depth_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ backbone=INPUT.COMBO(["vit-small", "vit-large", "vit-giant2"]),
+ fx=INPUT.INT(default=1000, min=1, max=MAX_RESOLUTION),
+ fy=INPUT.INT(default=1000, min=1, max=MAX_RESOLUTION),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, backbone="vit-small", fx=1000, fy=1000, resolution=512):
+ from custom_controlnet_aux.metric3d import Metric3DDetector
+ model = Metric3DDetector.from_pretrained(filename=f"metric_depth_{backbone.replace('-', '_')}_800k.pth").to(model_management.get_torch_device())
+ cb = lambda image, **kwargs: model(image, **kwargs)[0]
+ out = common_annotator_call(cb, image, resolution=resolution, fx=fx, fy=fy, depth_and_normal=True)
+ del model
+ return (out, )
+
+class Metric3D_Normal_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ backbone=INPUT.COMBO(["vit-small", "vit-large", "vit-giant2"]),
+ fx=INPUT.INT(default=1000, min=1, max=MAX_RESOLUTION),
+ fy=INPUT.INT(default=1000, min=1, max=MAX_RESOLUTION),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, backbone="vit-small", fx=1000, fy=1000, resolution=512):
+ from custom_controlnet_aux.metric3d import Metric3DDetector
+ model = Metric3DDetector.from_pretrained(filename=f"metric_depth_{backbone.replace('-', '_')}_800k.pth").to(model_management.get_torch_device())
+ cb = lambda image, **kwargs: model(image, **kwargs)[1]
+ out = common_annotator_call(cb, image, resolution=resolution, fx=fx, fy=fy, depth_and_normal=True)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "Metric3D-DepthMapPreprocessor": Metric3D_Depth_Map_Preprocessor,
+ "Metric3D-NormalMapPreprocessor": Metric3D_Normal_Map_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "Metric3D-DepthMapPreprocessor": "Metric3D Depth Map",
+ "Metric3D-NormalMapPreprocessor": "Metric3D Normal Map"
+}
diff --git a/comfyui_controlnet_aux/node_wrappers/midas.py b/comfyui_controlnet_aux/node_wrappers/midas.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ee59637211a675e2e5ad49e9532c3bf7dea222f
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/midas.py
@@ -0,0 +1,59 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+import numpy as np
+
+class MIDAS_Normal_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ a=INPUT.FLOAT(default=np.pi * 2.0, min=0.0, max=np.pi * 5.0),
+ bg_threshold=INPUT.FLOAT(default=0.1),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, a=np.pi * 2.0, bg_threshold=0.1, resolution=512, **kwargs):
+ from custom_controlnet_aux.midas import MidasDetector
+
+ model = MidasDetector.from_pretrained().to(model_management.get_torch_device())
+ #Dirty hack :))
+        cb = lambda image, **kwargs: model(image, **kwargs)[1]
+ out = common_annotator_call(cb, image, resolution=resolution, a=a, bg_th=bg_threshold, depth_and_normal=True)
+ del model
+ return (out, )
+
+class MIDAS_Depth_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ a=INPUT.FLOAT(default=np.pi * 2.0, min=0.0, max=np.pi * 5.0),
+ bg_threshold=INPUT.FLOAT(default=0.1),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, a=np.pi * 2.0, bg_threshold=0.1, resolution=512, **kwargs):
+ from custom_controlnet_aux.midas import MidasDetector
+
+ # Ref: https://github.com/lllyasviel/ControlNet/blob/main/gradio_depth2image.py
+ model = MidasDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, a=a, bg_th=bg_threshold)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "MiDaS-NormalMapPreprocessor": MIDAS_Normal_Map_Preprocessor,
+ "MiDaS-DepthMapPreprocessor": MIDAS_Depth_Map_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "MiDaS-NormalMapPreprocessor": "MiDaS Normal Map",
+ "MiDaS-DepthMapPreprocessor": "MiDaS Depth Map"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/mlsd.py b/comfyui_controlnet_aux/node_wrappers/mlsd.py
new file mode 100644
index 0000000000000000000000000000000000000000..66688b9d1fb6e239a7cf8a7b89cf3a81b3074b93
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/mlsd.py
@@ -0,0 +1,31 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+import numpy as np
+
+class MLSD_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ score_threshold=INPUT.FLOAT(default=0.1, min=0.01, max=2.0),
+ dist_threshold=INPUT.FLOAT(default=0.1, min=0.01, max=20.0),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+    def execute(self, image, score_threshold=0.1, dist_threshold=0.1, resolution=512, **kwargs):
+ from custom_controlnet_aux.mlsd import MLSDdetector
+
+ model = MLSDdetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, thr_v=score_threshold, thr_d=dist_threshold)
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "M-LSDPreprocessor": MLSD_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "M-LSDPreprocessor": "M-LSD Lines"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/normalbae.py b/comfyui_controlnet_aux/node_wrappers/normalbae.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a326a002dc0baed5ecfe6d2c7552ca83164c705
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/normalbae.py
@@ -0,0 +1,27 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class BAE_Normal_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.normalbae import NormalBaeDetector
+
+ model = NormalBaeDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out,)
+
+NODE_CLASS_MAPPINGS = {
+ "BAE-NormalMapPreprocessor": BAE_Normal_Map_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "BAE-NormalMapPreprocessor": "BAE Normal Map"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/oneformer.py b/comfyui_controlnet_aux/node_wrappers/oneformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..cafcfac8faf91a52b1f2d56004b622566bd16a42
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/oneformer.py
@@ -0,0 +1,50 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class OneFormer_COCO_SemSegPreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "semantic_segmentate"
+
+ CATEGORY = "ControlNet Preprocessors/Semantic Segmentation"
+
+ def semantic_segmentate(self, image, resolution=512):
+ from custom_controlnet_aux.oneformer import OneformerSegmentor
+
+ model = OneformerSegmentor.from_pretrained(filename="150_16_swin_l_oneformer_coco_100ep.pth")
+ model = model.to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out,)
+
+class OneFormer_ADE20K_SemSegPreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "semantic_segmentate"
+
+ CATEGORY = "ControlNet Preprocessors/Semantic Segmentation"
+
+ def semantic_segmentate(self, image, resolution=512):
+ from custom_controlnet_aux.oneformer import OneformerSegmentor
+
+ model = OneformerSegmentor.from_pretrained(filename="250_16_swin_l_oneformer_ade20k_160k.pth")
+ model = model.to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out,)
+
+NODE_CLASS_MAPPINGS = {
+ "OneFormer-COCO-SemSegPreprocessor": OneFormer_COCO_SemSegPreprocessor,
+ "OneFormer-ADE20K-SemSegPreprocessor": OneFormer_ADE20K_SemSegPreprocessor
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "OneFormer-COCO-SemSegPreprocessor": "OneFormer COCO Segmentor",
+ "OneFormer-ADE20K-SemSegPreprocessor": "OneFormer ADE20K Segmentor"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/openpose.py b/comfyui_controlnet_aux/node_wrappers/openpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..08ce98bfa33ccd0e9dd216041d341e281e1ff7c4
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/openpose.py
@@ -0,0 +1,48 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+import json
+
+class OpenPose_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ detect_hand=INPUT.COMBO(["enable", "disable"]),
+ detect_body=INPUT.COMBO(["enable", "disable"]),
+ detect_face=INPUT.COMBO(["enable", "disable"]),
+ resolution=INPUT.RESOLUTION(),
+ scale_stick_for_xinsr_cn=INPUT.COMBO(["disable", "enable"])
+ )
+
+ RETURN_TYPES = ("IMAGE", "POSE_KEYPOINT")
+ FUNCTION = "estimate_pose"
+
+ CATEGORY = "ControlNet Preprocessors/Faces and Poses Estimators"
+
+ def estimate_pose(self, image, detect_hand="enable", detect_body="enable", detect_face="enable", scale_stick_for_xinsr_cn="disable", resolution=512, **kwargs):
+ from custom_controlnet_aux.open_pose import OpenposeDetector
+
+ detect_hand = detect_hand == "enable"
+ detect_body = detect_body == "enable"
+ detect_face = detect_face == "enable"
+ scale_stick_for_xinsr_cn = scale_stick_for_xinsr_cn == "enable"
+
+ model = OpenposeDetector.from_pretrained().to(model_management.get_torch_device())
+ self.openpose_dicts = []
+ def func(image, **kwargs):
+ pose_img, openpose_dict = model(image, **kwargs)
+ self.openpose_dicts.append(openpose_dict)
+ return pose_img
+
+ out = common_annotator_call(func, image, include_hand=detect_hand, include_face=detect_face, include_body=detect_body, image_and_json=True, xinsr_stick_scaling=scale_stick_for_xinsr_cn, resolution=resolution)
+ del model
+ return {
+ 'ui': { "openpose_json": [json.dumps(self.openpose_dicts, indent=4)] },
+ "result": (out, self.openpose_dicts)
+ }
+
+NODE_CLASS_MAPPINGS = {
+ "OpenposePreprocessor": OpenPose_Preprocessor,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "OpenposePreprocessor": "OpenPose Pose",
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/pidinet.py b/comfyui_controlnet_aux/node_wrappers/pidinet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f150de66e1a451e049535fdd26acb81f091df114
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/pidinet.py
@@ -0,0 +1,30 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class PIDINET_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ safe=INPUT.COMBO(["enable", "disable"]),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+    def execute(self, image, safe="enable", resolution=512, **kwargs):
+ from custom_controlnet_aux.pidi import PidiNetDetector
+
+ model = PidiNetDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, safe = safe == "enable")
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "PiDiNetPreprocessor": PIDINET_Preprocessor,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "PiDiNetPreprocessor": "PiDiNet Soft-Edge Lines"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/pose_keypoint_postprocess.py b/comfyui_controlnet_aux/node_wrappers/pose_keypoint_postprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ea4ea47781b626eb8d664990a5fe7883a97c67e
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/pose_keypoint_postprocess.py
@@ -0,0 +1,340 @@
+import folder_paths
+import json
+import os
+import numpy as np
+import cv2
+from PIL import ImageColor
+from einops import rearrange
+import torch
+import itertools
+
+from ..src.custom_controlnet_aux.dwpose import draw_poses, draw_animalposes, decode_json_as_poses
+
+
+"""
+Format of POSE_KEYPOINT (AP10K keypoints):
+[{
+ "version": "ap10k",
+ "animals": [
+ [[x1, y1, 1], [x2, y2, 1],..., [x17, y17, 1]],
+ [[x1, y1, 1], [x2, y2, 1],..., [x17, y17, 1]],
+ ...
+ ],
+ "canvas_height": 512,
+ "canvas_width": 768
+},...]
+Format of POSE_KEYPOINT (OpenPose keypoints):
+[{
+ "people": [
+ {
+            "pose_keypoints_2d": [[x1, y1, 1], [x2, y2, 1],..., [x18, y18, 1]],
+            "face_keypoints_2d": [[x1, y1, 1], [x2, y2, 1],..., [x68, y68, 1]],
+            "hand_left_keypoints_2d": [[x1, y1, 1], [x2, y2, 1],..., [x21, y21, 1]],
+            "hand_right_keypoints_2d": [[x1, y1, 1], [x2, y2, 1],..., [x21, y21, 1]],
+ }
+ ],
+ "canvas_height": canvas_height,
+ "canvas_width": canvas_width,
+},...]
+"""
+
+class SavePoseKpsAsJsonFile:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "pose_kps": ("POSE_KEYPOINT",),
+ "filename_prefix": ("STRING", {"default": "PoseKeypoint"})
+ }
+ }
+ RETURN_TYPES = ()
+ FUNCTION = "save_pose_kps"
+ OUTPUT_NODE = True
+ CATEGORY = "ControlNet Preprocessors/Pose Keypoint Postprocess"
+ def __init__(self):
+ self.output_dir = folder_paths.get_output_directory()
+ self.type = "output"
+ self.prefix_append = ""
+ def save_pose_kps(self, pose_kps, filename_prefix):
+ filename_prefix += self.prefix_append
+ full_output_folder, filename, counter, subfolder, filename_prefix = \
+ folder_paths.get_save_image_path(filename_prefix, self.output_dir, pose_kps[0]["canvas_width"], pose_kps[0]["canvas_height"])
+ file = f"{filename}_{counter:05}.json"
+ with open(os.path.join(full_output_folder, file), 'w') as f:
+ json.dump(pose_kps , f)
+ return {}
+
+#COCO-Wholebody doesn't have eyebrows as it inherits 68 keypoints format
+#Perhaps eyebrows can be estimated tho
+FACIAL_PARTS = ["skin", "left_eye", "right_eye", "nose", "upper_lip", "inner_mouth", "lower_lip"]
+LAPA_COLORS = dict(
+ skin="rgb(0, 153, 255)",
+ left_eye="rgb(0, 204, 153)",
+ right_eye="rgb(255, 153, 0)",
+ nose="rgb(255, 102, 255)",
+ upper_lip="rgb(102, 0, 51)",
+ inner_mouth="rgb(255, 204, 255)",
+ lower_lip="rgb(255, 0, 102)"
+)
+
+#One-based index
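+#e.g. kps_idxs(1, 17) -> [0, 1, ..., 16]; kps_idxs(27, 18) -> [26, 25, ..., 17]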
+def kps_idxs(start, end):
+ step = -1 if start > end else 1
+    return list(range(start - 1, end + step - 1, step))
+
+#Source: https://www.researchgate.net/profile/Fabrizio-Falchi/publication/338048224/figure/fig1/AS:837860722741255@1576772971540/68-facial-landmarks.jpg
+FACIAL_PART_RANGES = dict(
+ skin=kps_idxs(1, 17) + kps_idxs(27, 18),
+ nose=kps_idxs(28, 36),
+ left_eye=kps_idxs(37, 42),
+ right_eye=kps_idxs(43, 48),
+ upper_lip=kps_idxs(49, 55) + kps_idxs(65, 61),
+ lower_lip=kps_idxs(61, 68),
+ inner_mouth=kps_idxs(61, 65) + kps_idxs(55, 49)
+)
+
+def is_normalized(keypoints) -> bool:
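+    #True when every present keypoint lies in [0, 1], i.e. coordinates are normalized rather than pixel-space.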
+ point_normalized = [
+ 0 <= np.abs(k[0]) <= 1 and 0 <= np.abs(k[1]) <= 1
+ for k in keypoints
+ if k is not None
+ ]
+ if not point_normalized:
+ return False
+ return np.all(point_normalized)
+
+class FacialPartColoringFromPoseKps:
+ @classmethod
+ def INPUT_TYPES(s):
+ input_types = {
+ "required": {"pose_kps": ("POSE_KEYPOINT",), "mode": (["point", "polygon"], {"default": "polygon"})}
+ }
+ for facial_part in FACIAL_PARTS:
+ input_types["required"][facial_part] = ("STRING", {"default": LAPA_COLORS[facial_part], "multiline": False})
+ return input_types
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "colorize"
+ CATEGORY = "ControlNet Preprocessors/Pose Keypoint Postprocess"
+ def colorize(self, pose_kps, mode, **facial_part_colors):
+ pose_frames = pose_kps
+ np_frames = [self.draw_kps(pose_frame, mode, **facial_part_colors) for pose_frame in pose_frames]
+ np_frames = np.stack(np_frames, axis=0)
+ return (torch.from_numpy(np_frames).float() / 255.,)
+
+ def draw_kps(self, pose_frame, mode, **facial_part_colors):
+ width, height = pose_frame["canvas_width"], pose_frame["canvas_height"]
+ canvas = np.zeros((height, width, 3), dtype=np.uint8)
+ for person, part_name in itertools.product(pose_frame["people"], FACIAL_PARTS):
+ n = len(person["face_keypoints_2d"]) // 3
+ facial_kps = rearrange(np.array(person["face_keypoints_2d"]), "(n c) -> n c", n=n, c=3)[:, :2]
+ if is_normalized(facial_kps):
+ facial_kps *= (width, height)
+ facial_kps = facial_kps.astype(np.int32)
+ part_color = ImageColor.getrgb(facial_part_colors[part_name])[:3]
+ part_contours = facial_kps[FACIAL_PART_RANGES[part_name], :]
+ if mode == "point":
+ for pt in part_contours:
+ cv2.circle(canvas, pt, radius=2, color=part_color, thickness=-1)
+ else:
+ cv2.fillPoly(canvas, pts=[part_contours], color=part_color)
+ return canvas
+
+# https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/openpose/master/.github/media/keypoints_pose_18.png
+BODY_PART_INDEXES = {
+ "Head": (16, 14, 0, 15, 17),
+ "Neck": (0, 1),
+ "Shoulder": (2, 5),
+ "Torso": (2, 5, 8, 11),
+ "RArm": (2, 3),
+ "RForearm": (3, 4),
+ "LArm": (5, 6),
+ "LForearm": (6, 7),
+ "RThigh": (8, 9),
+ "RLeg": (9, 10),
+ "LThigh": (11, 12),
+ "LLeg": (12, 13)
+}
+BODY_PART_DEFAULT_W_H = {
+ "Head": "256, 256",
+ "Neck": "100, 100",
+ "Shoulder": '',
+ "Torso": "350, 450",
+ "RArm": "128, 256",
+ "RForearm": "128, 256",
+ "LArm": "128, 256",
+ "LForearm": "128, 256",
+ "RThigh": "128, 256",
+ "RLeg": "128, 256",
+ "LThigh": "128, 256",
+ "LLeg": "128, 256"
+}
+
+class SinglePersonProcess:
+ @classmethod
+ def sort_and_get_max_people(s, pose_kps):
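+        # Sort each frame's people left-to-right by their first body-keypoint x so person_idx stays consistent across frames.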
+ for idx in range(len(pose_kps)):
+ pose_kps[idx]["people"] = sorted(pose_kps[idx]["people"], key=lambda person:person["pose_keypoints_2d"][0])
+ return pose_kps, max(len(frame["people"]) for frame in pose_kps)
+
+ def __init__(self, pose_kps, person_idx=0) -> None:
+ self.width, self.height = pose_kps[0]["canvas_width"], pose_kps[0]["canvas_height"]
+ self.poses = [
+ self.normalize(pose_frame["people"][person_idx]["pose_keypoints_2d"])
+ if person_idx < len(pose_frame["people"])
+ else None
+ for pose_frame in pose_kps
+ ]
+
+ def normalize(self, pose_kps_2d):
+ n = len(pose_kps_2d) // 3
+ pose_kps_2d = rearrange(np.array(pose_kps_2d), "(n c) -> n c", n=n, c=3)
+ pose_kps_2d[np.argwhere(pose_kps_2d[:,2]==0), :] = np.iinfo(np.int32).max // 2 #Safe large value
+ pose_kps_2d = pose_kps_2d[:, :2]
+ if is_normalized(pose_kps_2d):
+ pose_kps_2d *= (self.width, self.height)
+ return pose_kps_2d
+
+ def get_xyxy_bboxes(self, part_name, bbox_size=(128, 256)):
+ width, height = bbox_size
+ xyxy_bboxes = {}
+ for idx, pose in enumerate(self.poses):
+ if pose is None:
+ xyxy_bboxes[idx] = (np.iinfo(np.int32).max // 2,) * 4
+ continue
+ pts = pose[BODY_PART_INDEXES[part_name], :]
+
+ #top_left = np.min(pts[:,0]), np.min(pts[:,1])
+ #bottom_right = np.max(pts[:,0]), np.max(pts[:,1])
+ #pad_width = np.maximum(width - (bottom_right[0]-top_left[0]), 0) / 2
+ #pad_height = np.maximum(height - (bottom_right[1]-top_left[1]), 0) / 2
+ #xyxy_bboxes.append((
+ # top_left[0] - pad_width, top_left[1] - pad_height,
+ # bottom_right[0] + pad_width, bottom_right[1] + pad_height,
+ #))
+
+ x_mid, y_mid = np.mean(pts[:, 0]), np.mean(pts[:, 1])
+ xyxy_bboxes[idx] = (
+ x_mid - width/2, y_mid - height/2,
+ x_mid + width/2, y_mid + height/2
+ )
+ return xyxy_bboxes
+
+class UpperBodyTrackingFromPoseKps:
+ PART_NAMES = ["Head", "Neck", "Shoulder", "Torso", "RArm", "RForearm", "LArm", "LForearm"]
+
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "pose_kps": ("POSE_KEYPOINT",),
+ "id_include": ("STRING", {"default": '', "multiline": False}),
+ **{part_name + "_width_height": ("STRING", {"default": BODY_PART_DEFAULT_W_H[part_name], "multiline": False}) for part_name in s.PART_NAMES}
+ }
+ }
+
+ RETURN_TYPES = ("TRACKING", "STRING")
+ RETURN_NAMES = ("tracking", "prompt")
+ FUNCTION = "convert"
+ CATEGORY = "ControlNet Preprocessors/Pose Keypoint Postprocess"
+
+ def convert(self, pose_kps, id_include, **parts_width_height):
+ parts_width_height = {part_name.replace("_width_height", ''): value for part_name, value in parts_width_height.items()}
+ enabled_part_names = [part_name for part_name in self.PART_NAMES if len(parts_width_height[part_name].strip())]
+ tracked = {part_name: {} for part_name in enabled_part_names}
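+        # tracked[part_name][person_idx][frame_idx] = (x1, y1, x2, y2, canvas_width, canvas_height)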
+ id_include = id_include.strip()
+ id_include = list(map(int, id_include.split(','))) if len(id_include) else []
+ prompt_string = ''
+ pose_kps, max_people = SinglePersonProcess.sort_and_get_max_people(pose_kps)
+
+ for person_idx in range(max_people):
+ if len(id_include) and person_idx not in id_include:
+ continue
+ processor = SinglePersonProcess(pose_kps, person_idx)
+ for part_name in enabled_part_names:
+ bbox_size = tuple(map(int, parts_width_height[part_name].split(',')))
+ part_bboxes = processor.get_xyxy_bboxes(part_name, bbox_size)
+ id_coordinates = {idx: part_bbox+(processor.width, processor.height) for idx, part_bbox in part_bboxes.items()}
+ tracked[part_name][person_idx] = id_coordinates
+
+ for class_name, class_data in tracked.items():
+ for class_id in class_data.keys():
+ class_id_str = str(class_id)
+ # Use the incoming prompt for each class name and ID
+ _class_name = class_name.replace('L', '').replace('R', '').lower()
+ prompt_string += f'"{class_id_str}.{class_name}": "({_class_name})",\n'
+
+ return (tracked, prompt_string)
+
+
+def numpy2torch(np_image: np.ndarray) -> torch.Tensor:
+ """ [H, W, C] => [B=1, H, W, C]"""
+ return torch.from_numpy(np_image.astype(np.float32) / 255).unsqueeze(0)
+
+
+class RenderPeopleKps:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "kps": ("POSE_KEYPOINT",),
+ "render_body": ("BOOLEAN", {"default": True}),
+ "render_hand": ("BOOLEAN", {"default": True}),
+ "render_face": ("BOOLEAN", {"default": True}),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "render"
+ CATEGORY = "ControlNet Preprocessors/Pose Keypoint Postprocess"
+
+    def render(self, kps, render_body, render_hand, render_face) -> tuple[torch.Tensor]:
+ if isinstance(kps, list):
+ kps = kps[0]
+
+ poses, _, height, width = decode_json_as_poses(kps)
+ np_image = draw_poses(
+ poses,
+ height,
+ width,
+ render_body,
+ render_hand,
+ render_face,
+ )
+ return (numpy2torch(np_image),)
+
+class RenderAnimalKps:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": {
+ "kps": ("POSE_KEYPOINT",),
+ }
+ }
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "render"
+ CATEGORY = "ControlNet Preprocessors/Pose Keypoint Postprocess"
+
+    def render(self, kps) -> tuple[torch.Tensor]:
+ if isinstance(kps, list):
+ kps = kps[0]
+
+ _, poses, height, width = decode_json_as_poses(kps)
+ np_image = draw_animalposes(poses, height, width)
+ return (numpy2torch(np_image),)
+
+
+NODE_CLASS_MAPPINGS = {
+ "SavePoseKpsAsJsonFile": SavePoseKpsAsJsonFile,
+ "FacialPartColoringFromPoseKps": FacialPartColoringFromPoseKps,
+ "UpperBodyTrackingFromPoseKps": UpperBodyTrackingFromPoseKps,
+ "RenderPeopleKps": RenderPeopleKps,
+ "RenderAnimalKps": RenderAnimalKps,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "SavePoseKpsAsJsonFile": "Save Pose Keypoints",
+ "FacialPartColoringFromPoseKps": "Colorize Facial Parts from PoseKPS",
+ "UpperBodyTrackingFromPoseKps": "Upper Body Tracking From PoseKps (InstanceDiffusion)",
+ "RenderPeopleKps": "Render Pose JSON (Human)",
+ "RenderAnimalKps": "Render Pose JSON (Animal)",
+}
diff --git a/comfyui_controlnet_aux/node_wrappers/pyracanny.py b/comfyui_controlnet_aux/node_wrappers/pyracanny.py
new file mode 100644
index 0000000000000000000000000000000000000000..aae78918941672bb4d2bbd861f23b20f723e44a4
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/pyracanny.py
@@ -0,0 +1,30 @@
+from ..utils import common_annotator_call, INPUT, define_preprocessor_inputs
+import comfy.model_management as model_management
+
+class PyraCanny_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ low_threshold=INPUT.INT(default=64, max=255),
+ high_threshold=INPUT.INT(default=128, max=255),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, low_threshold=64, high_threshold=128, resolution=512, **kwargs):
+ from custom_controlnet_aux.pyracanny import PyraCannyDetector
+
+ return (common_annotator_call(PyraCannyDetector(), image, low_threshold=low_threshold, high_threshold=high_threshold, resolution=resolution), )
+
+
+
+NODE_CLASS_MAPPINGS = {
+ "PyraCannyPreprocessor": PyraCanny_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "PyraCannyPreprocessor": "PyraCanny"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/recolor.py b/comfyui_controlnet_aux/node_wrappers/recolor.py
new file mode 100644
index 0000000000000000000000000000000000000000..859cee413b1fd8cd774ed76bce6fa497b72ca2b2
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/recolor.py
@@ -0,0 +1,46 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+
+class ImageLuminanceDetector:
+ @classmethod
+ def INPUT_TYPES(s):
+ #https://github.com/Mikubill/sd-webui-controlnet/blob/416c345072c9c2066101e225964e3986abe6945e/scripts/processor.py#L1229
+ return define_preprocessor_inputs(
+ gamma_correction=INPUT.FLOAT(default=1.0, min=0.1, max=2.0),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Recolor"
+
+ def execute(self, image, gamma_correction=1.0, resolution=512, **kwargs):
+ from custom_controlnet_aux.recolor import Recolorizer
+ return (common_annotator_call(Recolorizer(), image, mode="luminance", gamma_correction=gamma_correction , resolution=resolution), )
+
+class ImageIntensityDetector:
+ @classmethod
+ def INPUT_TYPES(s):
+ #https://github.com/Mikubill/sd-webui-controlnet/blob/416c345072c9c2066101e225964e3986abe6945e/scripts/processor.py#L1229
+ return define_preprocessor_inputs(
+ gamma_correction=INPUT.FLOAT(default=1.0, min=0.1, max=2.0),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Recolor"
+
+ def execute(self, image, gamma_correction=1.0, resolution=512, **kwargs):
+ from custom_controlnet_aux.recolor import Recolorizer
+ return (common_annotator_call(Recolorizer(), image, mode="intensity", gamma_correction=gamma_correction , resolution=resolution), )
+
+NODE_CLASS_MAPPINGS = {
+ "ImageLuminanceDetector": ImageLuminanceDetector,
+ "ImageIntensityDetector": ImageIntensityDetector
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "ImageLuminanceDetector": "Image Luminance",
+ "ImageIntensityDetector": "Image Intensity"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/scribble.py b/comfyui_controlnet_aux/node_wrappers/scribble.py
new file mode 100644
index 0000000000000000000000000000000000000000..154a12ea1d4c69a57dddb114d26382ca9d25d457
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/scribble.py
@@ -0,0 +1,74 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT, nms
+import comfy.model_management as model_management
+import cv2
+
+class Scribble_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.scribble import ScribbleDetector
+
+ model = ScribbleDetector()
+ return (common_annotator_call(model, image, resolution=resolution), )
+
+class Scribble_XDoG_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ threshold=INPUT.INT(default=32, min=1, max=64),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, threshold=32, resolution=512, **kwargs):
+ from custom_controlnet_aux.scribble import ScribbleXDog_Detector
+
+ model = ScribbleXDog_Detector()
+ return (common_annotator_call(model, image, resolution=resolution, thr_a=threshold), )
+
+class Scribble_PiDiNet_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ safe=(["enable", "disable"],),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, safe="enable", resolution=512):
+ def model(img, **kwargs):
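+            # PiDiNet soft edges -> NMS thinning -> Gaussian blur -> hard threshold into binary scribble lines.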
+ from custom_controlnet_aux.pidi import PidiNetDetector
+ pidinet = PidiNetDetector.from_pretrained().to(model_management.get_torch_device())
+ result = pidinet(img, scribble=True, **kwargs)
+ result = nms(result, 127, 3.0)
+ result = cv2.GaussianBlur(result, (0, 0), 3.0)
+ result[result > 4] = 255
+ result[result < 255] = 0
+ return result
+ return (common_annotator_call(model, image, resolution=resolution, safe=safe=="enable"),)
+
+NODE_CLASS_MAPPINGS = {
+ "ScribblePreprocessor": Scribble_Preprocessor,
+ "Scribble_XDoG_Preprocessor": Scribble_XDoG_Preprocessor,
+ "Scribble_PiDiNet_Preprocessor": Scribble_PiDiNet_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "ScribblePreprocessor": "Scribble Lines",
+ "Scribble_XDoG_Preprocessor": "Scribble XDoG Lines",
+ "Scribble_PiDiNet_Preprocessor": "Scribble PiDiNet Lines"
+}
diff --git a/comfyui_controlnet_aux/node_wrappers/segment_anything.py b/comfyui_controlnet_aux/node_wrappers/segment_anything.py
new file mode 100644
index 0000000000000000000000000000000000000000..70d95e9fbda170aed8f76177e1fa5d0106f2bb82
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/segment_anything.py
@@ -0,0 +1,27 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class SAM_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/others"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.sam import SamDetector
+
+ mobile_sam = SamDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(mobile_sam, image, resolution=resolution)
+ del mobile_sam
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "SAMPreprocessor": SAM_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "SAMPreprocessor": "SAM Segmentor"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/shuffle.py b/comfyui_controlnet_aux/node_wrappers/shuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bfa5dd854eacbf4573012d0e53839d88de12c40
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/shuffle.py
@@ -0,0 +1,27 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT, MAX_RESOLUTION
+import comfy.model_management as model_management
+
+class Shuffle_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ resolution=INPUT.RESOLUTION(),
+ seed=INPUT.SEED()
+ )
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "preprocess"
+
+ CATEGORY = "ControlNet Preprocessors/T2IAdapter-only"
+
+ def preprocess(self, image, resolution=512, seed=0):
+ from custom_controlnet_aux.shuffle import ContentShuffleDetector
+
+ return (common_annotator_call(ContentShuffleDetector(), image, resolution=resolution, seed=seed), )
+
+NODE_CLASS_MAPPINGS = {
+ "ShufflePreprocessor": Shuffle_Preprocessor
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "ShufflePreprocessor": "Content Shuffle"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/teed.py b/comfyui_controlnet_aux/node_wrappers/teed.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a6feb7a71fb07789826ac3852cee04b214f735d
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/teed.py
@@ -0,0 +1,30 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class TEED_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ safe_steps=INPUT.INT(default=2, max=10),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Line Extractors"
+
+ def execute(self, image, safe_steps=2, resolution=512, **kwargs):
+ from custom_controlnet_aux.teed import TEDDetector
+
+ model = TEDDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution, safe_steps=safe_steps)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "TEEDPreprocessor": TEED_Preprocessor,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "TEED_Preprocessor": "TEED Soft-Edge Lines",
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/tile.py b/comfyui_controlnet_aux/node_wrappers/tile.py
new file mode 100644
index 0000000000000000000000000000000000000000..2de818d56956cd13111f0f11dcae26dd43d15b23
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/tile.py
@@ -0,0 +1,73 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+
+
+class Tile_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ pyrUp_iters=INPUT.INT(default=3, min=1, max=10),
+ resolution=INPUT.RESOLUTION()
+ )
+
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/tile"
+
+    def execute(self, image, pyrUp_iters=3, resolution=512, **kwargs):
+ from custom_controlnet_aux.tile import TileDetector
+
+ return (common_annotator_call(TileDetector(), image, pyrUp_iters=pyrUp_iters, resolution=resolution),)
+
+class TTPlanet_TileGF_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ scale_factor=INPUT.FLOAT(default=1.00, min=1.000, max=8.00),
+ blur_strength=INPUT.FLOAT(default=2.0, min=1.0, max=10.0),
+ radius=INPUT.INT(default=7, min=1, max=20),
+ eps=INPUT.FLOAT(default=0.01, min=0.001, max=0.1, step=0.001),
+ resolution=INPUT.RESOLUTION()
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/tile"
+
+    def execute(self, image, scale_factor=1.0, blur_strength=2.0, radius=7, eps=0.01, **kwargs):
+ from custom_controlnet_aux.tile import TTPlanet_Tile_Detector_GF
+
+ return (common_annotator_call(TTPlanet_Tile_Detector_GF(), image, scale_factor=scale_factor, blur_strength=blur_strength, radius=radius, eps=eps),)
+
+class TTPlanet_TileSimple_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(
+ scale_factor=INPUT.FLOAT(default=1.00, min=1.000, max=8.00),
+ blur_strength=INPUT.FLOAT(default=2.0, min=1.0, max=10.0),
+ )
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/tile"
+
+ def execute(self, image, scale_factor, blur_strength):
+ from custom_controlnet_aux.tile import TTPLanet_Tile_Detector_Simple
+
+ return (common_annotator_call(TTPLanet_Tile_Detector_Simple(), image, scale_factor=scale_factor, blur_strength=blur_strength),)
+
+
+NODE_CLASS_MAPPINGS = {
+ "TilePreprocessor": Tile_Preprocessor,
+ "TTPlanet_TileGF_Preprocessor": TTPlanet_TileGF_Preprocessor,
+ "TTPlanet_TileSimple_Preprocessor": TTPlanet_TileSimple_Preprocessor
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "TilePreprocessor": "Tile",
+ "TTPlanet_TileGF_Preprocessor": "TTPlanet Tile GuidedFilter",
+ "TTPlanet_TileSimple_Preprocessor": "TTPlanet Tile Simple"
+}
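
TileDetector is driven only by pyrUp_iters; the usual idea behind this kind of tile hint (an assumption about the implementation, not a quote of custom_controlnet_aux.tile) is to discard fine detail while keeping the colour layout, for example by downscaling and pyramid-upscaling back:

    import cv2
    import numpy as np

    def tile_hint_sketch(img: np.ndarray, pyrUp_iters: int = 3) -> np.ndarray:
        # Downscale by 2**pyrUp_iters, then repeatedly cv2.pyrUp: a cheap blur that
        # keeps the colour layout but removes the detail the ControlNet should re-invent.
        h, w = img.shape[:2]
        small = cv2.resize(img, (w // 2 ** pyrUp_iters, h // 2 ** pyrUp_iters),
                           interpolation=cv2.INTER_AREA)
        for _ in range(pyrUp_iters):
            small = cv2.pyrUp(small)
        return cv2.resize(small, (w, h), interpolation=cv2.INTER_LINEAR)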
diff --git a/comfyui_controlnet_aux/node_wrappers/uniformer.py b/comfyui_controlnet_aux/node_wrappers/uniformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0af01f9d95e1e564035045ddd01a2ba64cf4e296
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/uniformer.py
@@ -0,0 +1,29 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class Uniformer_SemSegPreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "semantic_segmentate"
+
+ CATEGORY = "ControlNet Preprocessors/Semantic Segmentation"
+
+ def semantic_segmentate(self, image, resolution=512):
+ from custom_controlnet_aux.uniformer import UniformerSegmentor
+
+ model = UniformerSegmentor.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "UniFormer-SemSegPreprocessor": Uniformer_SemSegPreprocessor,
+ "SemSegPreprocessor": Uniformer_SemSegPreprocessor,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "UniFormer-SemSegPreprocessor": "UniFormer Segmentor",
+ "SemSegPreprocessor": "Semantic Segmentor (legacy, alias for UniFormer)",
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/node_wrappers/unimatch.py b/comfyui_controlnet_aux/node_wrappers/unimatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..b15ecb24f32cb7deed9a812b8afe0bf1f6e3ffed
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/unimatch.py
@@ -0,0 +1,75 @@
+from ..utils import common_annotator_call
+import comfy.model_management as model_management
+import torch
+import numpy as np
+from einops import rearrange
+import torch.nn.functional as F
+
+class Unimatch_OptFlowPreprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": dict(
+ image=("IMAGE",),
+ ckpt_name=(
+ ["gmflow-scale1-mixdata.pth", "gmflow-scale2-mixdata.pth", "gmflow-scale2-regrefine6-mixdata.pth"],
+ {"default": "gmflow-scale2-regrefine6-mixdata.pth"}
+ ),
+ backward_flow=("BOOLEAN", {"default": False}),
+ bidirectional_flow=("BOOLEAN", {"default": False})
+ )
+ }
+
+ RETURN_TYPES = ("OPTICAL_FLOW", "IMAGE")
+ RETURN_NAMES = ("OPTICAL_FLOW", "PREVIEW_IMAGE")
+ FUNCTION = "estimate"
+
+ CATEGORY = "ControlNet Preprocessors/Optical Flow"
+
+ def estimate(self, image, ckpt_name, backward_flow=False, bidirectional_flow=False):
+        assert len(image) > 1, "[Unimatch] At least two frames are required for optical flow estimation. Only use this node on video input."
+ from custom_controlnet_aux.unimatch import UnimatchDetector
+ tensor_images = image
+ model = UnimatchDetector.from_pretrained(filename=ckpt_name).to(model_management.get_torch_device())
+ flows, vis_flows = [], []
+ for i in range(len(tensor_images) - 1):
+ image0, image1 = np.asarray(image[i:i+2].cpu() * 255., dtype=np.uint8)
+ flow, vis_flow = model(image0, image1, output_type="np", pred_bwd_flow=backward_flow, pred_bidir_flow=bidirectional_flow)
+ flows.append(torch.from_numpy(flow).float())
+ vis_flows.append(torch.from_numpy(vis_flow).float() / 255.)
+ del model
+ return (torch.stack(flows, dim=0), torch.stack(vis_flows, dim=0))
+
+class MaskOptFlow:
+ @classmethod
+ def INPUT_TYPES(s):
+ return {
+ "required": dict(optical_flow=("OPTICAL_FLOW",), mask=("MASK",))
+ }
+
+ RETURN_TYPES = ("OPTICAL_FLOW", "IMAGE")
+ RETURN_NAMES = ("OPTICAL_FLOW", "PREVIEW_IMAGE")
+ FUNCTION = "mask_opt_flow"
+
+ CATEGORY = "ControlNet Preprocessors/Optical Flow"
+
+ def mask_opt_flow(self, optical_flow, mask):
+ from custom_controlnet_aux.unimatch import flow_to_image
+ assert len(mask) >= len(optical_flow), f"Not enough masks to mask optical flow: {len(mask)} vs {len(optical_flow)}"
+        mask = mask[:optical_flow.shape[0]]
+        if mask.dim() == 3:  # ComfyUI masks are typically (n, h, w); add a channel dim for interpolation
+            mask = mask.unsqueeze(1)
+        mask = F.interpolate(mask, optical_flow.shape[1:3])
+        mask = rearrange(mask, "n 1 h w -> n h w 1")
+ vis_flows = torch.stack([torch.from_numpy(flow_to_image(flow)).float() / 255. for flow in optical_flow.numpy()], dim=0)
+ vis_flows *= mask
+ optical_flow *= mask
+ return (optical_flow, vis_flows)
+
+
+NODE_CLASS_MAPPINGS = {
+ "Unimatch_OptFlowPreprocessor": Unimatch_OptFlowPreprocessor,
+ "MaskOptFlow": MaskOptFlow
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "Unimatch_OptFlowPreprocessor": "Unimatch Optical Flow",
+ "MaskOptFlow": "Mask Optical Flow (DragNUWA)"
+}
\ No newline at end of file
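
For N input frames the estimator above returns N-1 flow fields, so the OPTICAL_FLOW output is always one element shorter than the IMAGE batch; each flow is presumably an (H, W, 2) displacement map plus an RGB preview in [0, 1]. A shape-only illustration with dummy tensors (not a real video):

    import torch

    frames = torch.rand(8, 256, 256, 3)                   # ComfyUI images: (N, H, W, C) in [0, 1]
    flows = torch.zeros(len(frames) - 1, 256, 256, 2)     # one flow per consecutive frame pair
    previews = torch.zeros(len(frames) - 1, 256, 256, 3)  # flow visualisations in [0, 1]
    assert flows.shape[0] == previews.shape[0] == len(frames) - 1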
diff --git a/comfyui_controlnet_aux/node_wrappers/zoe.py b/comfyui_controlnet_aux/node_wrappers/zoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8a0175b5d15644628a5639a32c1061f0d604783
--- /dev/null
+++ b/comfyui_controlnet_aux/node_wrappers/zoe.py
@@ -0,0 +1,27 @@
+from ..utils import common_annotator_call, define_preprocessor_inputs, INPUT
+import comfy.model_management as model_management
+
+class Zoe_Depth_Map_Preprocessor:
+ @classmethod
+ def INPUT_TYPES(s):
+ return define_preprocessor_inputs(resolution=INPUT.RESOLUTION())
+
+ RETURN_TYPES = ("IMAGE",)
+ FUNCTION = "execute"
+
+ CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"
+
+ def execute(self, image, resolution=512, **kwargs):
+ from custom_controlnet_aux.zoe import ZoeDetector
+
+ model = ZoeDetector.from_pretrained().to(model_management.get_torch_device())
+ out = common_annotator_call(model, image, resolution=resolution)
+ del model
+ return (out, )
+
+NODE_CLASS_MAPPINGS = {
+ "Zoe-DepthMapPreprocessor": Zoe_Depth_Map_Preprocessor
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "Zoe-DepthMapPreprocessor": "Zoe Depth Map"
+}
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/pyproject.toml b/comfyui_controlnet_aux/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..53379a777e21fba0a44f12f0bbb6c94f54d50ccb
--- /dev/null
+++ b/comfyui_controlnet_aux/pyproject.toml
@@ -0,0 +1,14 @@
+[project]
+name = "comfyui_controlnet_aux"
+description = "Plug-and-play ComfyUI node sets for making ControlNet hint images"
+
+version = "1.0.5"
+dependencies = ["torch", "importlib_metadata", "huggingface_hub", "scipy", "opencv-python>=4.7.0.72", "filelock", "numpy", "Pillow", "einops", "torchvision", "pyyaml", "scikit-image", "python-dateutil", "mediapipe", "svglib", "fvcore", "yapf", "omegaconf", "ftfy", "addict", "yacs", "trimesh[easy]", "albumentations", "scikit-learn", "matplotlib"]
+
+[project.urls]
+Repository = "https://github.com/Fannovel16/comfyui_controlnet_aux"
+
+[tool.comfy]
+PublisherId = "fannovel16"
+DisplayName = "comfyui_controlnet_aux"
+Icon = ""
diff --git a/comfyui_controlnet_aux/requirements.txt b/comfyui_controlnet_aux/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..05530ac03fa1525cd6fe05ae219d56127469c256
--- /dev/null
+++ b/comfyui_controlnet_aux/requirements.txt
@@ -0,0 +1,25 @@
+torch
+importlib_metadata
+huggingface_hub
+scipy
+opencv-python>=4.7.0.72
+filelock
+numpy
+Pillow
+einops
+torchvision
+pyyaml
+scikit-image
+python-dateutil
+mediapipe
+svglib
+fvcore
+yapf
+omegaconf
+ftfy
+addict
+yacs
+trimesh[easy]
+albumentations
+scikit-learn
+matplotlib
diff --git a/comfyui_controlnet_aux/search_hf_assets.py b/comfyui_controlnet_aux/search_hf_assets.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf291c0b00425c96484043aed7af1be4165089d
--- /dev/null
+++ b/comfyui_controlnet_aux/search_hf_assets.py
@@ -0,0 +1,56 @@
+from pathlib import Path
+import os
+import re
+#Thanks ChatGPT
+pattern = r'\bfrom_pretrained\(.*?pretrained_model_or_path\s*=\s*(.*?)(?:,|\))|filename\s*=\s*(.*?)(?:,|\))|(\w+_filename)\s*=\s*(.*?)(?:,|\))'
+aux_dir = Path(__file__).parent / 'src' / 'custom_controlnet_aux'
+VAR_DICT = dict(
+ HF_MODEL_NAME = "lllyasviel/Annotators",
+ DWPOSE_MODEL_NAME = "yzd-v/DWPose",
+ BDS_MODEL_NAME = "bdsqlsz/qinglong_controlnet-lllite",
+ DENSEPOSE_MODEL_NAME = "LayerNorm/DensePose-TorchScript-with-hint-image",
+ MESH_GRAPHORMER_MODEL_NAME = "hr16/ControlNet-HandRefiner-pruned",
+ SAM_MODEL_NAME = "dhkim2810/MobileSAM",
+ UNIMATCH_MODEL_NAME = "hr16/Unimatch",
+ DEPTH_ANYTHING_MODEL_NAME = "LiheYoung/Depth-Anything", #HF Space
+ DIFFUSION_EDGE_MODEL_NAME = "hr16/Diffusion-Edge"
+)
+re_result_dict = {}
+for preprocc in os.listdir(aux_dir):
+ if preprocc in ["__pycache__", 'tests']: continue
+ if '.py' in preprocc: continue
+    with open(aux_dir / preprocc / '__init__.py', 'r') as f:
+        code = f.read()
+    matches = re.findall(pattern, code)
+    result = [match[0] or match[1] or match[3] for match in matches]
+    if not len(result):
+        print(preprocc)
+        continue
+    result = [el.replace("'", '').replace('"', '') for el in result]
+    result = [VAR_DICT.get(el, el) for el in result]
+    re_result_dict[preprocc] = result
+
+def print_assets(preprocc, model_name, filenames):
+    assets_md = ', '.join([f"[{model_name}/{filename}](https://huggingface.co/{model_name}/blob/main/{filename})" for filename in filenames])
+    print(f"* {preprocc}: ", end=' ')
+    print(assets_md)
+
+for preprocc, re_result in re_result_dict.items():
+    print_assets(preprocc, re_result[0], re_result[1:])
+
+print_assets("dwpose", VAR_DICT['DWPOSE_MODEL_NAME'], ["yolox_l.onnx", "dw-ll_ucoco_384.onnx"])
+print_assets("yolo-nas", "hr16/yolo-nas-fp16", ["yolo_nas_l_fp16.onnx", "yolo_nas_m_fp16.onnx", "yolo_nas_s_fp16.onnx"])
+print_assets("dwpose-torchscript", "hr16/DWPose-TorchScript-BatchSize5", ["dw-ll_ucoco_384_bs5.torchscript.pt", "rtmpose-m_ap10k_256_bs5.torchscript.pt"])
\ No newline at end of file
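
The regex at the top pulls the repository id and checkpoint filenames straight out of each preprocessor's from_pretrained(...) call; a worked example on an illustrative line of source (not an actual file from the package):

    import re

    pattern = r'\bfrom_pretrained\(.*?pretrained_model_or_path\s*=\s*(.*?)(?:,|\))|filename\s*=\s*(.*?)(?:,|\))|(\w+_filename)\s*=\s*(.*?)(?:,|\))'
    code = 'model = FakeDetector.from_pretrained(pretrained_model_or_path=HF_MODEL_NAME, filename="ckpt.pth")'
    matches = re.findall(pattern, code)
    # -> [('HF_MODEL_NAME', '', '', ''), ('', '"ckpt.pth"', '', '')]
    picked = [m[0] or m[1] or m[3] for m in matches]
    assert picked == ['HF_MODEL_NAME', '"ckpt.pth"']

The script then strips the quotes and maps HF_MODEL_NAME through VAR_DICT to the real repo id (lllyasviel/Annotators) before printing the Markdown links.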
diff --git a/comfyui_controlnet_aux/src/__init__.py b/comfyui_controlnet_aux/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e7a7f594ef441479257c788e4c0d6e08657fc8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/__init__.py
@@ -0,0 +1 @@
+#Dummy file ensuring this package will be recognized
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/__pycache__/__init__.cpython-312.pyc b/comfyui_controlnet_aux/src/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7b9f80b9bac8ed3100ccfc2d90127da461041b6
Binary files /dev/null and b/comfyui_controlnet_aux/src/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/LICENSE b/comfyui_controlnet_aux/src/custom_albumentations/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..cd34eb58b47616f81977e7e247aae235feaff1b6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Buslaev Alexander, Alexander Parinov, Vladimir Iglovikov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..65c1addc65d05727e88da6a9b60191a25b7616ae
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/__init__.py
@@ -0,0 +1,15 @@
+from __future__ import absolute_import
+
+__version__ = "1.3.1"
+
+from .augmentations import *
+from .core.composition import *
+from .core.serialization import *
+from .core.transforms_interface import *
+
+try:
+ from .imgaug.transforms import * # type: ignore
+except ImportError:
+ # imgaug is not installed by default, so we import stubs.
+    # Run `pip install -U albumentations[imgaug]` if you need augmentations from imgaug.
+ from .imgaug.stubs import * # type: ignore
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eaf982febe219a154ebba1d2de9e34d655bea32
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/__init__.py
@@ -0,0 +1,21 @@
+# Common classes
+from .blur.functional import *
+from .blur.transforms import *
+from .crops.functional import *
+from .crops.transforms import *
+
+# New transformations go into the individual files listed below
+from .domain_adaptation import *
+from .dropout.channel_dropout import *
+from .dropout.coarse_dropout import *
+from .dropout.cutout import *
+from .dropout.functional import *
+from .dropout.grid_dropout import *
+from .dropout.mask_dropout import *
+from .functional import *
+from .geometric.functional import *
+from .geometric.resize import *
+from .geometric.rotate import *
+from .geometric.transforms import *
+from .transforms import *
+from .utils import *
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1913670ea85ddef4691eb595383aec9607efb827
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/__init__.py
@@ -0,0 +1,2 @@
+from .functional import *
+from .transforms import *
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/functional.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..d29590645fe11e3c0ceffe8f0d3164f1ca9e739c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/functional.py
@@ -0,0 +1,106 @@
+from itertools import product
+from math import ceil
+from typing import Sequence, Union
+
+import cv2
+import numpy as np
+
+from custom_albumentations.augmentations.functional import convolve
+from custom_albumentations.augmentations.geometric.functional import scale
+from custom_albumentations.augmentations.utils import (
+ _maybe_process_in_chunks,
+ clipped,
+ preserve_shape,
+)
+
+__all__ = ["blur", "median_blur", "gaussian_blur", "glass_blur"]
+
+
+@preserve_shape
+def blur(img: np.ndarray, ksize: int) -> np.ndarray:
+ blur_fn = _maybe_process_in_chunks(cv2.blur, ksize=(ksize, ksize))
+ return blur_fn(img)
+
+
+@preserve_shape
+def median_blur(img: np.ndarray, ksize: int) -> np.ndarray:
+ if img.dtype == np.float32 and ksize not in {3, 5}:
+ raise ValueError(f"Invalid ksize value {ksize}. For a float32 image the only valid ksize values are 3 and 5")
+
+ blur_fn = _maybe_process_in_chunks(cv2.medianBlur, ksize=ksize)
+ return blur_fn(img)
+
+
+@preserve_shape
+def gaussian_blur(img: np.ndarray, ksize: int, sigma: float = 0) -> np.ndarray:
+ # When sigma=0, it is computed as `sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8`
+ blur_fn = _maybe_process_in_chunks(cv2.GaussianBlur, ksize=(ksize, ksize), sigmaX=sigma)
+ return blur_fn(img)
+
+
+@preserve_shape
+def glass_blur(
+ img: np.ndarray, sigma: float, max_delta: int, iterations: int, dxy: np.ndarray, mode: str
+) -> np.ndarray:
+ x = cv2.GaussianBlur(np.array(img), sigmaX=sigma, ksize=(0, 0))
+
+ if mode == "fast":
+ hs = np.arange(img.shape[0] - max_delta, max_delta, -1)
+ ws = np.arange(img.shape[1] - max_delta, max_delta, -1)
+ h: Union[int, np.ndarray] = np.tile(hs, ws.shape[0])
+ w: Union[int, np.ndarray] = np.repeat(ws, hs.shape[0])
+
+ for i in range(iterations):
+ dy = dxy[:, i, 0]
+ dx = dxy[:, i, 1]
+ x[h, w], x[h + dy, w + dx] = x[h + dy, w + dx], x[h, w]
+
+ elif mode == "exact":
+ for ind, (i, h, w) in enumerate(
+ product(
+ range(iterations),
+ range(img.shape[0] - max_delta, max_delta, -1),
+ range(img.shape[1] - max_delta, max_delta, -1),
+ )
+ ):
+ ind = ind if ind < len(dxy) else ind % len(dxy)
+ dy = dxy[ind, i, 0]
+ dx = dxy[ind, i, 1]
+ x[h, w], x[h + dy, w + dx] = x[h + dy, w + dx], x[h, w]
+ else:
+ ValueError(f"Unsupported mode `{mode}`. Supports only `fast` and `exact`.")
+
+ return cv2.GaussianBlur(x, sigmaX=sigma, ksize=(0, 0))
+
+
+def defocus(img: np.ndarray, radius: int, alias_blur: float) -> np.ndarray:
+ length = np.arange(-max(8, radius), max(8, radius) + 1)
+ ksize = 3 if radius <= 8 else 5
+
+ x, y = np.meshgrid(length, length)
+ aliased_disk = np.array((x**2 + y**2) <= radius**2, dtype=np.float32)
+ aliased_disk /= np.sum(aliased_disk)
+
+ kernel = gaussian_blur(aliased_disk, ksize, sigma=alias_blur)
+ return convolve(img, kernel=kernel)
+
+
+def central_zoom(img: np.ndarray, zoom_factor: int) -> np.ndarray:
+ h, w = img.shape[:2]
+ h_ch, w_ch = ceil(h / zoom_factor), ceil(w / zoom_factor)
+ h_top, w_top = (h - h_ch) // 2, (w - w_ch) // 2
+
+ img = scale(img[h_top : h_top + h_ch, w_top : w_top + w_ch], zoom_factor, cv2.INTER_LINEAR)
+ h_trim_top, w_trim_top = (img.shape[0] - h) // 2, (img.shape[1] - w) // 2
+ return img[h_trim_top : h_trim_top + h, w_trim_top : w_trim_top + w]
+
+
+@clipped
+def zoom_blur(img: np.ndarray, zoom_factors: Union[np.ndarray, Sequence[int]]) -> np.ndarray:
+ out = np.zeros_like(img, dtype=np.float32)
+ for zoom_factor in zoom_factors:
+ out += central_zoom(img, zoom_factor)
+
+ img = ((img + out) / (len(zoom_factors) + 1)).astype(img.dtype)
+
+ return img
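
These functional kernels keep the input shape; a quick smoke test on a dummy uint8 image (illustrative only, assuming src/ is on sys.path so the vendored package imports as custom_albumentations):

    import numpy as np
    from custom_albumentations.augmentations.blur import functional as F

    img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
    assert F.blur(img, ksize=3).shape == img.shape
    assert F.median_blur(img, ksize=5).shape == img.shape
    assert F.gaussian_blur(img, ksize=5, sigma=1.2).shape == img.shape
    # zoom_blur averages the image with progressively zoomed-in copies of itself
    assert F.zoom_blur(img, zoom_factors=np.arange(1.0, 1.3, 0.05)).shape == img.shape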
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/transforms.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f380cff402f84aa617dfb76a663748ead13f87
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/blur/transforms.py
@@ -0,0 +1,486 @@
+import random
+import warnings
+from typing import Any, Dict, List, Sequence, Tuple
+
+import cv2
+import numpy as np
+
+from custom_albumentations import random_utils
+from custom_albumentations.augmentations import functional as FMain
+from custom_albumentations.augmentations.blur import functional as F
+from custom_albumentations.core.transforms_interface import (
+ ImageOnlyTransform,
+ ScaleFloatType,
+ ScaleIntType,
+ to_tuple,
+)
+
+__all__ = ["Blur", "MotionBlur", "GaussianBlur", "GlassBlur", "AdvancedBlur", "MedianBlur", "Defocus", "ZoomBlur"]
+
+
+class Blur(ImageOnlyTransform):
+ """Blur the input image using a random-sized kernel.
+
+ Args:
+ blur_limit (int, (int, int)): maximum kernel size for blurring the input image.
+ Should be in range [3, inf). Default: (3, 7).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, blur_limit: ScaleIntType = 7, always_apply: bool = False, p: float = 0.5):
+ super().__init__(always_apply, p)
+ self.blur_limit = to_tuple(blur_limit, 3)
+
+ def apply(self, img: np.ndarray, ksize: int = 3, **params) -> np.ndarray:
+ return F.blur(img, ksize)
+
+ def get_params(self) -> Dict[str, Any]:
+ return {"ksize": int(random.choice(list(range(self.blur_limit[0], self.blur_limit[1] + 1, 2))))}
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return ("blur_limit",)
+
+
+class MotionBlur(Blur):
+ """Apply motion blur to the input image using a random-sized kernel.
+
+ Args:
+ blur_limit (int): maximum kernel size for blurring the input image.
+ Should be in range [3, inf). Default: (3, 7).
+        allow_shifted (bool): if set to True, allows randomly shifted (off-center) kernels;
+            if False, only kernels that pass through the kernel center are generated. Default: True.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ blur_limit: ScaleIntType = 7,
+ allow_shifted: bool = True,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(blur_limit=blur_limit, always_apply=always_apply, p=p)
+ self.allow_shifted = allow_shifted
+
+        if not allow_shifted and (self.blur_limit[0] % 2 != 1 or self.blur_limit[1] % 2 != 1):
+ raise ValueError(f"Blur limit must be odd when centered=True. Got: {self.blur_limit}")
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return super().get_transform_init_args_names() + ("allow_shifted",)
+
+ def apply(self, img: np.ndarray, kernel: np.ndarray = None, **params) -> np.ndarray: # type: ignore
+ return FMain.convolve(img, kernel=kernel)
+
+ def get_params(self) -> Dict[str, Any]:
+ ksize = random.choice(list(range(self.blur_limit[0], self.blur_limit[1] + 1, 2)))
+ if ksize <= 2:
+ raise ValueError("ksize must be > 2. Got: {}".format(ksize))
+ kernel = np.zeros((ksize, ksize), dtype=np.uint8)
+ x1, x2 = random.randint(0, ksize - 1), random.randint(0, ksize - 1)
+ if x1 == x2:
+ y1, y2 = random.sample(range(ksize), 2)
+ else:
+ y1, y2 = random.randint(0, ksize - 1), random.randint(0, ksize - 1)
+
+ def make_odd_val(v1, v2):
+ len_v = abs(v1 - v2) + 1
+ if len_v % 2 != 1:
+ if v2 > v1:
+ v2 -= 1
+ else:
+ v1 -= 1
+ return v1, v2
+
+ if not self.allow_shifted:
+ x1, x2 = make_odd_val(x1, x2)
+ y1, y2 = make_odd_val(y1, y2)
+
+ xc = (x1 + x2) / 2
+ yc = (y1 + y2) / 2
+
+ center = ksize / 2 - 0.5
+ dx = xc - center
+ dy = yc - center
+ x1, x2 = [int(i - dx) for i in [x1, x2]]
+ y1, y2 = [int(i - dy) for i in [y1, y2]]
+
+ cv2.line(kernel, (x1, y1), (x2, y2), 1, thickness=1)
+
+ # Normalize kernel
+ return {"kernel": kernel.astype(np.float32) / np.sum(kernel)}
+
+
+class MedianBlur(Blur):
+ """Blur the input image using a median filter with a random aperture linear size.
+
+ Args:
+ blur_limit (int): maximum aperture linear size for blurring the input image.
+ Must be odd and in range [3, inf). Default: (3, 7).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, blur_limit: ScaleIntType = 7, always_apply: bool = False, p: float = 0.5):
+ super().__init__(blur_limit, always_apply, p)
+
+ if self.blur_limit[0] % 2 != 1 or self.blur_limit[1] % 2 != 1:
+ raise ValueError("MedianBlur supports only odd blur limits.")
+
+ def apply(self, img: np.ndarray, ksize: int = 3, **params) -> np.ndarray:
+ return F.median_blur(img, ksize)
+
+
+class GaussianBlur(ImageOnlyTransform):
+ """Blur the input image using a Gaussian filter with a random kernel size.
+
+ Args:
+ blur_limit (int, (int, int)): maximum Gaussian kernel size for blurring the input image.
+ Must be zero or odd and in range [0, inf). If set to 0 it will be computed from sigma
+ as `round(sigma * (3 if img.dtype == np.uint8 else 4) * 2 + 1) + 1`.
+ If set single value `blur_limit` will be in range (0, blur_limit).
+ Default: (3, 7).
+ sigma_limit (float, (float, float)): Gaussian kernel standard deviation. Must be in range [0, inf).
+ If set single value `sigma_limit` will be in range (0, sigma_limit).
+ If set to 0 sigma will be computed as `sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8`. Default: 0.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ blur_limit: ScaleIntType = (3, 7),
+ sigma_limit: ScaleFloatType = 0,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply, p)
+ self.blur_limit = to_tuple(blur_limit, 0)
+ self.sigma_limit = to_tuple(sigma_limit if sigma_limit is not None else 0, 0)
+
+ if self.blur_limit[0] == 0 and self.sigma_limit[0] == 0:
+ self.blur_limit = 3, max(3, self.blur_limit[1])
+ warnings.warn(
+ "blur_limit and sigma_limit minimum value can not be both equal to 0. "
+ "blur_limit minimum value changed to 3."
+ )
+
+ if (self.blur_limit[0] != 0 and self.blur_limit[0] % 2 != 1) or (
+ self.blur_limit[1] != 0 and self.blur_limit[1] % 2 != 1
+ ):
+ raise ValueError("GaussianBlur supports only odd blur limits.")
+
+ def apply(self, img: np.ndarray, ksize: int = 3, sigma: float = 0, **params) -> np.ndarray:
+ return F.gaussian_blur(img, ksize, sigma=sigma)
+
+ def get_params(self) -> Dict[str, float]:
+ ksize = random.randrange(self.blur_limit[0], self.blur_limit[1] + 1)
+ if ksize != 0 and ksize % 2 != 1:
+ ksize = (ksize + 1) % (self.blur_limit[1] + 1)
+
+ return {"ksize": ksize, "sigma": random.uniform(*self.sigma_limit)}
+
+ def get_transform_init_args_names(self) -> Tuple[str, str]:
+ return ("blur_limit", "sigma_limit")
+
+
+class GlassBlur(Blur):
+ """Apply glass noise to the input image.
+
+ Args:
+ sigma (float): standard deviation for Gaussian kernel.
+ max_delta (int): max distance between pixels which are swapped.
+ iterations (int): number of repeats.
+ Should be in range [1, inf). Default: (2).
+ mode (str): mode of computation: fast or exact. Default: "fast".
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+
+ Reference:
+ | https://arxiv.org/abs/1903.12261
+ | https://github.com/hendrycks/robustness/blob/master/ImageNet-C/create_c/make_imagenet_c.py
+ """
+
+ def __init__(
+ self,
+ sigma: float = 0.7,
+ max_delta: int = 4,
+ iterations: int = 2,
+ always_apply: bool = False,
+ mode: str = "fast",
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply=always_apply, p=p)
+ if iterations < 1:
+ raise ValueError(f"Iterations should be more or equal to 1, but we got {iterations}")
+
+ if mode not in ["fast", "exact"]:
+ raise ValueError(f"Mode should be 'fast' or 'exact', but we got {mode}")
+
+ self.sigma = sigma
+ self.max_delta = max_delta
+ self.iterations = iterations
+ self.mode = mode
+
+ def apply(self, img: np.ndarray, dxy: np.ndarray = None, **params) -> np.ndarray: # type: ignore
+ assert dxy is not None
+ return F.glass_blur(img, self.sigma, self.max_delta, self.iterations, dxy, self.mode)
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, np.ndarray]:
+ img = params["image"]
+
+ # generate array containing all necessary values for transformations
+ width_pixels = img.shape[0] - self.max_delta * 2
+ height_pixels = img.shape[1] - self.max_delta * 2
+ total_pixels = width_pixels * height_pixels
+ dxy = random_utils.randint(-self.max_delta, self.max_delta, size=(total_pixels, self.iterations, 2))
+
+ return {"dxy": dxy}
+
+ def get_transform_init_args_names(self) -> Tuple[str, str, str]:
+ return ("sigma", "max_delta", "iterations")
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return ["image"]
+
+
+class AdvancedBlur(ImageOnlyTransform):
+ """Blur the input image using a Generalized Normal filter with a randomly selected parameters.
+ This transform also adds multiplicative noise to generated kernel before convolution.
+
+ Args:
+ blur_limit: maximum Gaussian kernel size for blurring the input image.
+ Must be zero or odd and in range [0, inf). If set to 0 it will be computed from sigma
+ as `round(sigma * (3 if img.dtype == np.uint8 else 4) * 2 + 1) + 1`.
+ If set single value `blur_limit` will be in range (0, blur_limit).
+ Default: (3, 7).
+        sigmaX_limit: Gaussian kernel standard deviation along the X axis. Must be in range [0, inf).
+            If set as a single value, `sigmaX_limit` will be in range (0, sigmaX_limit).
+            If set to 0, sigma will be computed as `sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8`. Default: (0.2, 1.0).
+        sigmaY_limit: Same as `sigmaX_limit`, but for the Y axis.
+ rotate_limit: Range from which a random angle used to rotate Gaussian kernel is picked.
+ If limit is a single int an angle is picked from (-rotate_limit, rotate_limit). Default: (-90, 90).
+ beta_limit: Distribution shape parameter, 1 is the normal distribution. Values below 1.0 make distribution
+ tails heavier than normal, values above 1.0 make it lighter than normal. Default: (0.5, 8.0).
+        noise_limit: Multiplicative factor that controls the strength of kernel noise. Must be positive and preferably
+            centered around 1.0. If set as a single value, `noise_limit` will be in range (0, noise_limit).
+            Default: (0.9, 1.1).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Reference:
+ https://arxiv.org/abs/2107.10833
+
+ Targets:
+ image
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ blur_limit: ScaleIntType = (3, 7),
+ sigmaX_limit: ScaleFloatType = (0.2, 1.0),
+ sigmaY_limit: ScaleFloatType = (0.2, 1.0),
+ rotate_limit: ScaleIntType = 90,
+ beta_limit: ScaleFloatType = (0.5, 8.0),
+ noise_limit: ScaleFloatType = (0.9, 1.1),
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply, p)
+ self.blur_limit = to_tuple(blur_limit, 3)
+ self.sigmaX_limit = self.__check_values(to_tuple(sigmaX_limit, 0.0), name="sigmaX_limit")
+ self.sigmaY_limit = self.__check_values(to_tuple(sigmaY_limit, 0.0), name="sigmaY_limit")
+ self.rotate_limit = to_tuple(rotate_limit)
+ self.beta_limit = to_tuple(beta_limit, low=0.0)
+ self.noise_limit = self.__check_values(to_tuple(noise_limit, 0.0), name="noise_limit")
+
+ if (self.blur_limit[0] != 0 and self.blur_limit[0] % 2 != 1) or (
+ self.blur_limit[1] != 0 and self.blur_limit[1] % 2 != 1
+ ):
+ raise ValueError("AdvancedBlur supports only odd blur limits.")
+
+ if self.sigmaX_limit[0] == 0 and self.sigmaY_limit[0] == 0:
+ raise ValueError("sigmaX_limit and sigmaY_limit minimum value can not be both equal to 0.")
+
+ if not (self.beta_limit[0] < 1.0 < self.beta_limit[1]):
+ raise ValueError("Beta limit is expected to include 1.0")
+
+ @staticmethod
+ def __check_values(
+ value: Sequence[float], name: str, bounds: Tuple[float, float] = (0, float("inf"))
+ ) -> Sequence[float]:
+ if not bounds[0] <= value[0] <= value[1] <= bounds[1]:
+ raise ValueError(f"{name} values should be between {bounds}")
+ return value
+
+ def apply(self, img: np.ndarray, kernel: np.ndarray = np.array(None), **params) -> np.ndarray:
+ return FMain.convolve(img, kernel=kernel)
+
+ def get_params(self) -> Dict[str, np.ndarray]:
+ ksize = random.randrange(self.blur_limit[0], self.blur_limit[1] + 1, 2)
+ sigmaX = random.uniform(*self.sigmaX_limit)
+ sigmaY = random.uniform(*self.sigmaY_limit)
+ angle = np.deg2rad(random.uniform(*self.rotate_limit))
+
+ # Split into 2 cases to avoid selection of narrow kernels (beta > 1) too often.
+ if random.random() < 0.5:
+ beta = random.uniform(self.beta_limit[0], 1)
+ else:
+ beta = random.uniform(1, self.beta_limit[1])
+
+ noise_matrix = random_utils.uniform(self.noise_limit[0], self.noise_limit[1], size=[ksize, ksize])
+
+ # Generate mesh grid centered at zero.
+ ax = np.arange(-ksize // 2 + 1.0, ksize // 2 + 1.0)
+ # Shape (ksize, ksize, 2)
+ grid = np.stack(np.meshgrid(ax, ax), axis=-1)
+
+ # Calculate rotated sigma matrix
+ d_matrix = np.array([[sigmaX**2, 0], [0, sigmaY**2]])
+ u_matrix = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]])
+ sigma_matrix = np.dot(u_matrix, np.dot(d_matrix, u_matrix.T))
+
+ inverse_sigma = np.linalg.inv(sigma_matrix)
+ # Described in "Parameter Estimation For Multivariate Generalized Gaussian Distributions"
+ kernel = np.exp(-0.5 * np.power(np.sum(np.dot(grid, inverse_sigma) * grid, 2), beta))
+ # Add noise
+ kernel = kernel * noise_matrix
+
+ # Normalize kernel
+ kernel = kernel.astype(np.float32) / np.sum(kernel)
+ return {"kernel": kernel}
+
+ def get_transform_init_args_names(self) -> Tuple[str, str, str, str, str, str]:
+ return (
+ "blur_limit",
+ "sigmaX_limit",
+ "sigmaY_limit",
+ "rotate_limit",
+ "beta_limit",
+ "noise_limit",
+ )
+
+
+class Defocus(ImageOnlyTransform):
+ """
+ Apply defocus transform. See https://arxiv.org/abs/1903.12261.
+
+ Args:
+ radius ((int, int) or int): range for radius of defocusing.
+ If limit is a single int, the range will be [1, limit]. Default: (3, 10).
+ alias_blur ((float, float) or float): range for alias_blur of defocusing (sigma of gaussian blur).
+ If limit is a single float, the range will be (0, limit). Default: (0.1, 0.5).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ Any
+ """
+
+ def __init__(
+ self,
+ radius: ScaleIntType = (3, 10),
+ alias_blur: ScaleFloatType = (0.1, 0.5),
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply, p)
+ self.radius = to_tuple(radius, low=1)
+ self.alias_blur = to_tuple(alias_blur, low=0)
+
+ if self.radius[0] <= 0:
+ raise ValueError("Parameter radius must be positive")
+
+ if self.alias_blur[0] < 0:
+ raise ValueError("Parameter alias_blur must be non-negative")
+
+ def apply(self, img: np.ndarray, radius: int = 3, alias_blur: float = 0.5, **params) -> np.ndarray:
+ return F.defocus(img, radius, alias_blur)
+
+ def get_params(self) -> Dict[str, Any]:
+ return {
+ "radius": random_utils.randint(self.radius[0], self.radius[1] + 1),
+ "alias_blur": random_utils.uniform(self.alias_blur[0], self.alias_blur[1]),
+ }
+
+ def get_transform_init_args_names(self) -> Tuple[str, str]:
+ return ("radius", "alias_blur")
+
+
+class ZoomBlur(ImageOnlyTransform):
+ """
+ Apply zoom blur transform. See https://arxiv.org/abs/1903.12261.
+
+ Args:
+ max_factor ((float, float) or float): range for max factor for blurring.
+ If max_factor is a single float, the range will be (1, limit). Default: (1, 1.31).
+ All max_factor values should be larger than 1.
+ step_factor ((float, float) or float): If single float will be used as step parameter for np.arange.
+ If tuple of float step_factor will be in range `[step_factor[0], step_factor[1])`. Default: (0.01, 0.03).
+ All step_factor values should be positive.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ Any
+ """
+
+ def __init__(
+ self,
+ max_factor: ScaleFloatType = 1.31,
+ step_factor: ScaleFloatType = (0.01, 0.03),
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply, p)
+ self.max_factor = to_tuple(max_factor, low=1.0)
+ self.step_factor = to_tuple(step_factor, step_factor)
+
+ if self.max_factor[0] < 1:
+ raise ValueError("Max factor must be larger or equal 1")
+ if self.step_factor[0] <= 0:
+ raise ValueError("Step factor must be positive")
+
+ def apply(self, img: np.ndarray, zoom_factors: np.ndarray = np.array(None), **params) -> np.ndarray:
+ assert zoom_factors is not None
+ return F.zoom_blur(img, zoom_factors)
+
+ def get_params(self) -> Dict[str, Any]:
+ max_factor = random.uniform(self.max_factor[0], self.max_factor[1])
+ step_factor = random.uniform(self.step_factor[0], self.step_factor[1])
+ return {"zoom_factors": np.arange(1.0, max_factor, step_factor)}
+
+ def get_transform_init_args_names(self) -> Tuple[str, str]:
+ return ("max_factor", "step_factor")
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1913670ea85ddef4691eb595383aec9607efb827
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/__init__.py
@@ -0,0 +1,2 @@
+from .functional import *
+from .transforms import *
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/functional.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cb34664f8a67fd7c6c162db8c398780067e9de1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/functional.py
@@ -0,0 +1,317 @@
+from typing import Optional, Sequence, Tuple
+
+import cv2
+import numpy as np
+
+from custom_albumentations.augmentations.utils import (
+ _maybe_process_in_chunks,
+ preserve_channel_dim,
+)
+
+from ...core.bbox_utils import denormalize_bbox, normalize_bbox
+from ...core.transforms_interface import BoxInternalType, KeypointInternalType
+from ..geometric import functional as FGeometric
+
+__all__ = [
+ "get_random_crop_coords",
+ "random_crop",
+ "crop_bbox_by_coords",
+ "bbox_random_crop",
+ "crop_keypoint_by_coords",
+ "keypoint_random_crop",
+ "get_center_crop_coords",
+ "center_crop",
+ "bbox_center_crop",
+ "keypoint_center_crop",
+ "crop",
+ "bbox_crop",
+ "clamping_crop",
+ "crop_and_pad",
+ "crop_and_pad_bbox",
+ "crop_and_pad_keypoint",
+]
+
+
+def get_random_crop_coords(height: int, width: int, crop_height: int, crop_width: int, h_start: float, w_start: float):
+ # h_start is [0, 1) and should map to [0, (height - crop_height)] (note inclusive)
+ # This is conceptually equivalent to mapping onto `range(0, (height - crop_height + 1))`
+ # See: https://github.com/albumentations-team/albumentations/pull/1080
+ y1 = int((height - crop_height + 1) * h_start)
+ y2 = y1 + crop_height
+ x1 = int((width - crop_width + 1) * w_start)
+ x2 = x1 + crop_width
+ return x1, y1, x2, y2
+
+
+def random_crop(img: np.ndarray, crop_height: int, crop_width: int, h_start: float, w_start: float):
+ height, width = img.shape[:2]
+ if height < crop_height or width < crop_width:
+ raise ValueError(
+ "Requested crop size ({crop_height}, {crop_width}) is "
+ "larger than the image size ({height}, {width})".format(
+ crop_height=crop_height, crop_width=crop_width, height=height, width=width
+ )
+ )
+ x1, y1, x2, y2 = get_random_crop_coords(height, width, crop_height, crop_width, h_start, w_start)
+ img = img[y1:y2, x1:x2]
+ return img
+
+
+def crop_bbox_by_coords(
+ bbox: BoxInternalType,
+ crop_coords: Tuple[int, int, int, int],
+ crop_height: int,
+ crop_width: int,
+ rows: int,
+ cols: int,
+):
+ """Crop a bounding box using the provided coordinates of bottom-left and top-right corners in pixels and the
+ required height and width of the crop.
+
+ Args:
+ bbox (tuple): A cropped box `(x_min, y_min, x_max, y_max)`.
+ crop_coords (tuple): Crop coordinates `(x1, y1, x2, y2)`.
+ crop_height (int):
+ crop_width (int):
+ rows (int): Image rows.
+ cols (int): Image cols.
+
+ Returns:
+ tuple: A cropped bounding box `(x_min, y_min, x_max, y_max)`.
+
+ """
+ bbox = denormalize_bbox(bbox, rows, cols)
+ x_min, y_min, x_max, y_max = bbox[:4]
+ x1, y1, _, _ = crop_coords
+ cropped_bbox = x_min - x1, y_min - y1, x_max - x1, y_max - y1
+ return normalize_bbox(cropped_bbox, crop_height, crop_width)
+
+
+def bbox_random_crop(
+ bbox: BoxInternalType, crop_height: int, crop_width: int, h_start: float, w_start: float, rows: int, cols: int
+):
+ crop_coords = get_random_crop_coords(rows, cols, crop_height, crop_width, h_start, w_start)
+ return crop_bbox_by_coords(bbox, crop_coords, crop_height, crop_width, rows, cols)
+
+
+def crop_keypoint_by_coords(
+ keypoint: KeypointInternalType, crop_coords: Tuple[int, int, int, int]
+): # skipcq: PYL-W0613
+ """Crop a keypoint using the provided coordinates of bottom-left and top-right corners in pixels and the
+ required height and width of the crop.
+
+ Args:
+ keypoint (tuple): A keypoint `(x, y, angle, scale)`.
+        crop_coords (tuple): Crop box coords `(x1, y1, x2, y2)`.
+
+ Returns:
+ A keypoint `(x, y, angle, scale)`.
+
+ """
+ x, y, angle, scale = keypoint[:4]
+ x1, y1, _, _ = crop_coords
+ return x - x1, y - y1, angle, scale
+
+
+def keypoint_random_crop(
+ keypoint: KeypointInternalType,
+ crop_height: int,
+ crop_width: int,
+ h_start: float,
+ w_start: float,
+ rows: int,
+ cols: int,
+):
+ """Keypoint random crop.
+
+ Args:
+ keypoint: (tuple): A keypoint `(x, y, angle, scale)`.
+ crop_height (int): Crop height.
+ crop_width (int): Crop width.
+        h_start (float): Crop height start, in range [0, 1).
+        w_start (float): Crop width start, in range [0, 1).
+ rows (int): Image height.
+ cols (int): Image width.
+
+ Returns:
+ A keypoint `(x, y, angle, scale)`.
+
+ """
+ crop_coords = get_random_crop_coords(rows, cols, crop_height, crop_width, h_start, w_start)
+ return crop_keypoint_by_coords(keypoint, crop_coords)
+
+
+def get_center_crop_coords(height: int, width: int, crop_height: int, crop_width: int):
+ y1 = (height - crop_height) // 2
+ y2 = y1 + crop_height
+ x1 = (width - crop_width) // 2
+ x2 = x1 + crop_width
+ return x1, y1, x2, y2
+
+
+def center_crop(img: np.ndarray, crop_height: int, crop_width: int):
+ height, width = img.shape[:2]
+ if height < crop_height or width < crop_width:
+ raise ValueError(
+ "Requested crop size ({crop_height}, {crop_width}) is "
+ "larger than the image size ({height}, {width})".format(
+ crop_height=crop_height, crop_width=crop_width, height=height, width=width
+ )
+ )
+ x1, y1, x2, y2 = get_center_crop_coords(height, width, crop_height, crop_width)
+ img = img[y1:y2, x1:x2]
+ return img
+
+
+def bbox_center_crop(bbox: BoxInternalType, crop_height: int, crop_width: int, rows: int, cols: int):
+ crop_coords = get_center_crop_coords(rows, cols, crop_height, crop_width)
+ return crop_bbox_by_coords(bbox, crop_coords, crop_height, crop_width, rows, cols)
+
+
+def keypoint_center_crop(keypoint: KeypointInternalType, crop_height: int, crop_width: int, rows: int, cols: int):
+ """Keypoint center crop.
+
+ Args:
+ keypoint (tuple): A keypoint `(x, y, angle, scale)`.
+ crop_height (int): Crop height.
+ crop_width (int): Crop width.
+ rows (int): Image height.
+ cols (int): Image width.
+
+ Returns:
+ tuple: A keypoint `(x, y, angle, scale)`.
+
+ """
+ crop_coords = get_center_crop_coords(rows, cols, crop_height, crop_width)
+ return crop_keypoint_by_coords(keypoint, crop_coords)
+
+
+def crop(img: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int):
+ height, width = img.shape[:2]
+ if x_max <= x_min or y_max <= y_min:
+ raise ValueError(
+ "We should have x_min < x_max and y_min < y_max. But we got"
+ " (x_min = {x_min}, y_min = {y_min}, x_max = {x_max}, y_max = {y_max})".format(
+ x_min=x_min, x_max=x_max, y_min=y_min, y_max=y_max
+ )
+ )
+
+ if x_min < 0 or x_max > width or y_min < 0 or y_max > height:
+ raise ValueError(
+ "Values for crop should be non negative and equal or smaller than image sizes"
+ "(x_min = {x_min}, y_min = {y_min}, x_max = {x_max}, y_max = {y_max}, "
+ "height = {height}, width = {width})".format(
+ x_min=x_min, x_max=x_max, y_min=y_min, y_max=y_max, height=height, width=width
+ )
+ )
+
+ return img[y_min:y_max, x_min:x_max]
+
+
+def bbox_crop(bbox: BoxInternalType, x_min: int, y_min: int, x_max: int, y_max: int, rows: int, cols: int):
+ """Crop a bounding box.
+
+ Args:
+ bbox (tuple): A bounding box `(x_min, y_min, x_max, y_max)`.
+ x_min (int):
+ y_min (int):
+ x_max (int):
+ y_max (int):
+ rows (int): Image rows.
+ cols (int): Image cols.
+
+ Returns:
+ tuple: A cropped bounding box `(x_min, y_min, x_max, y_max)`.
+
+ """
+ crop_coords = x_min, y_min, x_max, y_max
+ crop_height = y_max - y_min
+ crop_width = x_max - x_min
+ return crop_bbox_by_coords(bbox, crop_coords, crop_height, crop_width, rows, cols)
+
+
+def clamping_crop(img: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int):
+ h, w = img.shape[:2]
+ if x_min < 0:
+ x_min = 0
+ if y_min < 0:
+ y_min = 0
+ if y_max >= h:
+ y_max = h - 1
+ if x_max >= w:
+ x_max = w - 1
+ return img[int(y_min) : int(y_max), int(x_min) : int(x_max)]
+
+
+@preserve_channel_dim
+def crop_and_pad(
+ img: np.ndarray,
+ crop_params: Optional[Sequence[int]],
+ pad_params: Optional[Sequence[int]],
+ pad_value: Optional[float],
+ rows: int,
+ cols: int,
+ interpolation: int,
+ pad_mode: int,
+ keep_size: bool,
+) -> np.ndarray:
+ if crop_params is not None and any(i != 0 for i in crop_params):
+ img = crop(img, *crop_params)
+ if pad_params is not None and any(i != 0 for i in pad_params):
+ img = FGeometric.pad_with_params(
+ img, pad_params[0], pad_params[1], pad_params[2], pad_params[3], border_mode=pad_mode, value=pad_value
+ )
+
+ if keep_size:
+ resize_fn = _maybe_process_in_chunks(cv2.resize, dsize=(cols, rows), interpolation=interpolation)
+ img = resize_fn(img)
+
+ return img
+
+
+def crop_and_pad_bbox(
+ bbox: BoxInternalType,
+ crop_params: Optional[Sequence[int]],
+ pad_params: Optional[Sequence[int]],
+ rows,
+ cols,
+ result_rows,
+ result_cols,
+) -> BoxInternalType:
+ x1, y1, x2, y2 = denormalize_bbox(bbox, rows, cols)[:4]
+
+ if crop_params is not None:
+ crop_x, crop_y = crop_params[:2]
+ x1, y1, x2, y2 = x1 - crop_x, y1 - crop_y, x2 - crop_x, y2 - crop_y
+ if pad_params is not None:
+ top, bottom, left, right = pad_params
+ x1, y1, x2, y2 = x1 + left, y1 + top, x2 + left, y2 + top
+
+ return normalize_bbox((x1, y1, x2, y2), result_rows, result_cols)
+
+
+def crop_and_pad_keypoint(
+ keypoint: KeypointInternalType,
+ crop_params: Optional[Sequence[int]],
+ pad_params: Optional[Sequence[int]],
+ rows: int,
+ cols: int,
+ result_rows: int,
+ result_cols: int,
+ keep_size: bool,
+) -> KeypointInternalType:
+ x, y, angle, scale = keypoint[:4]
+
+ if crop_params is not None:
+ crop_x1, crop_y1, crop_x2, crop_y2 = crop_params
+ x, y = x - crop_x1, y - crop_y1
+ if pad_params is not None:
+ top, bottom, left, right = pad_params
+ x, y = x + left, y + top
+
+ if keep_size and (result_cols != cols or result_rows != rows):
+ scale_x = cols / result_cols
+ scale_y = rows / result_rows
+ return FGeometric.keypoint_scale((x, y, angle, scale), scale_x, scale_y)
+
+ return x, y, angle, scale
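
The + 1 in get_random_crop_coords is what makes the largest offset reachable while h_start/w_start stay in [0, 1); a worked example with height = width = 100 and a 40-pixel crop:

    from custom_albumentations.augmentations.crops.functional import get_random_crop_coords

    # int((100 - 40 + 1) * 0.999) = int(60.939) = 60, the maximum valid offset,
    # while h_start = 0.0 maps to offset 0, so every offset in [0, 60] can be drawn.
    x1, y1, x2, y2 = get_random_crop_coords(100, 100, 40, 40, h_start=0.999, w_start=0.0)
    assert (x1, y1, x2, y2) == (0, 60, 40, 100)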
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/transforms.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..c417c0e4c617bd59e3a015685786849b0c2fbbcf
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/crops/transforms.py
@@ -0,0 +1,943 @@
+import math
+import random
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+import cv2
+import numpy as np
+
+from custom_albumentations.core.bbox_utils import union_of_bboxes
+
+from ...core.transforms_interface import (
+ BoxInternalType,
+ DualTransform,
+ KeypointInternalType,
+ to_tuple,
+)
+from ..geometric import functional as FGeometric
+from . import functional as F
+
+__all__ = [
+ "RandomCrop",
+ "CenterCrop",
+ "Crop",
+ "CropNonEmptyMaskIfExists",
+ "RandomSizedCrop",
+ "RandomResizedCrop",
+ "RandomCropNearBBox",
+ "RandomSizedBBoxSafeCrop",
+ "CropAndPad",
+ "RandomCropFromBorders",
+ "BBoxSafeRandomCrop",
+]
+
+
+class RandomCrop(DualTransform):
+ """Crop a random part of the input.
+
+ Args:
+ height (int): height of the crop.
+ width (int): width of the crop.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, height, width, always_apply=False, p=1.0):
+ super().__init__(always_apply, p)
+ self.height = height
+ self.width = width
+
+ def apply(self, img, h_start=0, w_start=0, **params):
+ return F.random_crop(img, self.height, self.width, h_start, w_start)
+
+ def get_params(self):
+ return {"h_start": random.random(), "w_start": random.random()}
+
+ def apply_to_bbox(self, bbox, **params):
+ return F.bbox_random_crop(bbox, self.height, self.width, **params)
+
+ def apply_to_keypoint(self, keypoint, **params):
+ return F.keypoint_random_crop(keypoint, self.height, self.width, **params)
+
+ def get_transform_init_args_names(self):
+ return ("height", "width")
+
+
+class CenterCrop(DualTransform):
+ """Crop the central part of the input.
+
+ Args:
+ height (int): height of the crop.
+ width (int): width of the crop.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+
+ Note:
+ It is recommended to use uint8 images as input.
+ Otherwise the operation will require internal conversion
+ float32 -> uint8 -> float32 that causes worse performance.
+ """
+
+ def __init__(self, height, width, always_apply=False, p=1.0):
+ super(CenterCrop, self).__init__(always_apply, p)
+ self.height = height
+ self.width = width
+
+ def apply(self, img, **params):
+ return F.center_crop(img, self.height, self.width)
+
+ def apply_to_bbox(self, bbox, **params):
+ return F.bbox_center_crop(bbox, self.height, self.width, **params)
+
+ def apply_to_keypoint(self, keypoint, **params):
+ return F.keypoint_center_crop(keypoint, self.height, self.width, **params)
+
+ def get_transform_init_args_names(self):
+ return ("height", "width")
+
+
+class Crop(DualTransform):
+ """Crop region from image.
+
+ Args:
+ x_min (int): Minimum upper left x coordinate.
+ y_min (int): Minimum upper left y coordinate.
+ x_max (int): Maximum lower right x coordinate.
+ y_max (int): Maximum lower right y coordinate.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, x_min=0, y_min=0, x_max=1024, y_max=1024, always_apply=False, p=1.0):
+ super(Crop, self).__init__(always_apply, p)
+ self.x_min = x_min
+ self.y_min = y_min
+ self.x_max = x_max
+ self.y_max = y_max
+
+ def apply(self, img, **params):
+ return F.crop(img, x_min=self.x_min, y_min=self.y_min, x_max=self.x_max, y_max=self.y_max)
+
+ def apply_to_bbox(self, bbox, **params):
+ return F.bbox_crop(bbox, x_min=self.x_min, y_min=self.y_min, x_max=self.x_max, y_max=self.y_max, **params)
+
+ def apply_to_keypoint(self, keypoint, **params):
+ return F.crop_keypoint_by_coords(keypoint, crop_coords=(self.x_min, self.y_min, self.x_max, self.y_max))
+
+ def get_transform_init_args_names(self):
+ return ("x_min", "y_min", "x_max", "y_max")
+
+
+class CropNonEmptyMaskIfExists(DualTransform):
+ """Crop area with mask if mask is non-empty, else make random crop.
+
+ Args:
+ height (int): vertical size of crop in pixels
+ width (int): horizontal size of crop in pixels
+ ignore_values (list of int): values to ignore in mask, `0` values are always ignored
+ (e.g. if background value is 5 set `ignore_values=[5]` to ignore)
+ ignore_channels (list of int): channels to ignore in mask
+ (e.g. if background is a first channel set `ignore_channels=[0]` to ignore)
+ p (float): probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, height, width, ignore_values=None, ignore_channels=None, always_apply=False, p=1.0):
+ super(CropNonEmptyMaskIfExists, self).__init__(always_apply, p)
+
+ if ignore_values is not None and not isinstance(ignore_values, list):
+ raise ValueError("Expected `ignore_values` of type `list`, got `{}`".format(type(ignore_values)))
+ if ignore_channels is not None and not isinstance(ignore_channels, list):
+ raise ValueError("Expected `ignore_channels` of type `list`, got `{}`".format(type(ignore_channels)))
+
+ self.height = height
+ self.width = width
+ self.ignore_values = ignore_values
+ self.ignore_channels = ignore_channels
+
+ def apply(self, img, x_min=0, x_max=0, y_min=0, y_max=0, **params):
+ return F.crop(img, x_min, y_min, x_max, y_max)
+
+ def apply_to_bbox(self, bbox, x_min=0, x_max=0, y_min=0, y_max=0, **params):
+ return F.bbox_crop(
+ bbox, x_min=x_min, x_max=x_max, y_min=y_min, y_max=y_max, rows=params["rows"], cols=params["cols"]
+ )
+
+ def apply_to_keypoint(self, keypoint, x_min=0, x_max=0, y_min=0, y_max=0, **params):
+ return F.crop_keypoint_by_coords(keypoint, crop_coords=(x_min, y_min, x_max, y_max))
+
+ def _preprocess_mask(self, mask):
+ mask_height, mask_width = mask.shape[:2]
+
+ if self.ignore_values is not None:
+ ignore_values_np = np.array(self.ignore_values)
+ mask = np.where(np.isin(mask, ignore_values_np), 0, mask)
+
+ if mask.ndim == 3 and self.ignore_channels is not None:
+ target_channels = np.array([ch for ch in range(mask.shape[-1]) if ch not in self.ignore_channels])
+ mask = np.take(mask, target_channels, axis=-1)
+
+ if self.height > mask_height or self.width > mask_width:
+ raise ValueError(
+ "Crop size ({},{}) is larger than image ({},{})".format(
+ self.height, self.width, mask_height, mask_width
+ )
+ )
+
+ return mask
+
+ def update_params(self, params, **kwargs):
+ super().update_params(params, **kwargs)
+ if "mask" in kwargs:
+ mask = self._preprocess_mask(kwargs["mask"])
+ elif "masks" in kwargs and len(kwargs["masks"]):
+ masks = kwargs["masks"]
+ mask = self._preprocess_mask(np.copy(masks[0])) # need copy as we perform in-place mod afterwards
+ for m in masks[1:]:
+ mask |= self._preprocess_mask(m)
+ else:
+ raise RuntimeError("Can not find mask for CropNonEmptyMaskIfExists")
+
+ mask_height, mask_width = mask.shape[:2]
+
+ if mask.any():
+ mask = mask.sum(axis=-1) if mask.ndim == 3 else mask
+ non_zero_yx = np.argwhere(mask)
+ y, x = random.choice(non_zero_yx)
+ x_min = x - random.randint(0, self.width - 1)
+ y_min = y - random.randint(0, self.height - 1)
+ x_min = np.clip(x_min, 0, mask_width - self.width)
+ y_min = np.clip(y_min, 0, mask_height - self.height)
+ else:
+ x_min = random.randint(0, mask_width - self.width)
+ y_min = random.randint(0, mask_height - self.height)
+
+ x_max = x_min + self.width
+ y_max = y_min + self.height
+
+ params.update({"x_min": x_min, "x_max": x_max, "y_min": y_min, "y_max": y_max})
+ return params
+
+ def get_transform_init_args_names(self):
+ return ("height", "width", "ignore_values", "ignore_channels")
+
+
+class _BaseRandomSizedCrop(DualTransform):
+ # Base class for RandomSizedCrop and RandomResizedCrop
+
+ def __init__(self, height, width, interpolation=cv2.INTER_LINEAR, always_apply=False, p=1.0):
+ super(_BaseRandomSizedCrop, self).__init__(always_apply, p)
+ self.height = height
+ self.width = width
+ self.interpolation = interpolation
+
+ def apply(self, img, crop_height=0, crop_width=0, h_start=0, w_start=0, interpolation=cv2.INTER_LINEAR, **params):
+ crop = F.random_crop(img, crop_height, crop_width, h_start, w_start)
+ return FGeometric.resize(crop, self.height, self.width, interpolation)
+
+ def apply_to_bbox(self, bbox, crop_height=0, crop_width=0, h_start=0, w_start=0, rows=0, cols=0, **params):
+ return F.bbox_random_crop(bbox, crop_height, crop_width, h_start, w_start, rows, cols)
+
+ def apply_to_keypoint(self, keypoint, crop_height=0, crop_width=0, h_start=0, w_start=0, rows=0, cols=0, **params):
+ keypoint = F.keypoint_random_crop(keypoint, crop_height, crop_width, h_start, w_start, rows, cols)
+ scale_x = self.width / crop_width
+ scale_y = self.height / crop_height
+ keypoint = FGeometric.keypoint_scale(keypoint, scale_x, scale_y)
+ return keypoint
+
+
+class RandomSizedCrop(_BaseRandomSizedCrop):
+ """Crop a random part of the input and rescale it to some size.
+
+ Args:
+ min_max_height ((int, int)): crop size limits.
+ height (int): height after crop and resize.
+ width (int): width after crop and resize.
+ w2h_ratio (float): aspect ratio of crop.
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
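+
+    Example (illustrative sketch; assumes the vendored package is importable as ``custom_albumentations``
+        and exposes this transform at the top level; parameter values are arbitrary demo values):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.RandomSizedCrop(min_max_height=(80, 100), height=64, width=64, p=1)])
+        >>> cropped = aug(image=image)["image"]  # resized to (64, 64, 3)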
+ """
+
+ def __init__(
+ self, min_max_height, height, width, w2h_ratio=1.0, interpolation=cv2.INTER_LINEAR, always_apply=False, p=1.0
+ ):
+ super(RandomSizedCrop, self).__init__(
+ height=height, width=width, interpolation=interpolation, always_apply=always_apply, p=p
+ )
+ self.min_max_height = min_max_height
+ self.w2h_ratio = w2h_ratio
+
+ def get_params(self):
+ crop_height = random.randint(self.min_max_height[0], self.min_max_height[1])
+ return {
+ "h_start": random.random(),
+ "w_start": random.random(),
+ "crop_height": crop_height,
+ "crop_width": int(crop_height * self.w2h_ratio),
+ }
+
+ def get_transform_init_args_names(self):
+ return "min_max_height", "height", "width", "w2h_ratio", "interpolation"
+
+
+class RandomResizedCrop(_BaseRandomSizedCrop):
+ """Torchvision's variant of crop a random part of the input and rescale it to some size.
+
+ Args:
+ height (int): height after crop and resize.
+ width (int): width after crop and resize.
+        scale ((float, float)): range of the crop area, expressed as a fraction of the original image area.
+        ratio ((float, float)): range of the aspect ratio of the crop.
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
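+
+    Example (illustrative sketch; the top-level ``custom_albumentations`` import and the demo values are assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.RandomResizedCrop(height=64, width=64, scale=(0.5, 1.0), ratio=(0.75, 1.333), p=1)])
+        >>> result = aug(image=image)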
+ """
+
+ def __init__(
+ self,
+ height,
+ width,
+ scale=(0.08, 1.0),
+ ratio=(0.75, 1.3333333333333333),
+ interpolation=cv2.INTER_LINEAR,
+ always_apply=False,
+ p=1.0,
+ ):
+ super(RandomResizedCrop, self).__init__(
+ height=height, width=width, interpolation=interpolation, always_apply=always_apply, p=p
+ )
+ self.scale = scale
+ self.ratio = ratio
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ area = img.shape[0] * img.shape[1]
+
+ for _attempt in range(10):
+ target_area = random.uniform(*self.scale) * area
+ log_ratio = (math.log(self.ratio[0]), math.log(self.ratio[1]))
+ aspect_ratio = math.exp(random.uniform(*log_ratio))
+
+ w = int(round(math.sqrt(target_area * aspect_ratio))) # skipcq: PTC-W0028
+ h = int(round(math.sqrt(target_area / aspect_ratio))) # skipcq: PTC-W0028
+
+ if 0 < w <= img.shape[1] and 0 < h <= img.shape[0]:
+ i = random.randint(0, img.shape[0] - h)
+ j = random.randint(0, img.shape[1] - w)
+ return {
+ "crop_height": h,
+ "crop_width": w,
+ "h_start": i * 1.0 / (img.shape[0] - h + 1e-10),
+ "w_start": j * 1.0 / (img.shape[1] - w + 1e-10),
+ }
+
+ # Fallback to central crop
+ in_ratio = img.shape[1] / img.shape[0]
+ if in_ratio < min(self.ratio):
+ w = img.shape[1]
+ h = int(round(w / min(self.ratio)))
+ elif in_ratio > max(self.ratio):
+ h = img.shape[0]
+ w = int(round(h * max(self.ratio)))
+ else: # whole image
+ w = img.shape[1]
+ h = img.shape[0]
+ i = (img.shape[0] - h) // 2
+ j = (img.shape[1] - w) // 2
+ return {
+ "crop_height": h,
+ "crop_width": w,
+ "h_start": i * 1.0 / (img.shape[0] - h + 1e-10),
+ "w_start": j * 1.0 / (img.shape[1] - w + 1e-10),
+ }
+
+ def get_params(self):
+ return {}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self):
+ return "height", "width", "scale", "ratio", "interpolation"
+
+
+class RandomCropNearBBox(DualTransform):
+ """Crop bbox from image with random shift by x,y coordinates
+
+ Args:
+ max_part_shift (float, (float, float)): Max shift in `height` and `width` dimensions relative
+ to `cropping_bbox` dimension.
+ If max_part_shift is a single float, the range will be (max_part_shift, max_part_shift).
+ Default (0.3, 0.3).
+ cropping_box_key (str): Additional target key for cropping box. Default `cropping_bbox`
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+
+ Examples:
+ >>> aug = Compose([RandomCropNearBBox(max_part_shift=(0.1, 0.5), cropping_box_key='test_box')],
+ >>> bbox_params=BboxParams("pascal_voc"))
+ >>> result = aug(image=image, bboxes=bboxes, test_box=[0, 5, 10, 20])
+
+ """
+
+ def __init__(
+ self,
+ max_part_shift: Union[float, Tuple[float, float]] = (0.3, 0.3),
+ cropping_box_key: str = "cropping_bbox",
+ always_apply: bool = False,
+ p: float = 1.0,
+ ):
+ super(RandomCropNearBBox, self).__init__(always_apply, p)
+ self.max_part_shift = to_tuple(max_part_shift, low=max_part_shift)
+ self.cropping_bbox_key = cropping_box_key
+
+ if min(self.max_part_shift) < 0 or max(self.max_part_shift) > 1:
+ raise ValueError("Invalid max_part_shift. Got: {}".format(max_part_shift))
+
+ def apply(
+ self, img: np.ndarray, x_min: int = 0, x_max: int = 0, y_min: int = 0, y_max: int = 0, **params
+ ) -> np.ndarray:
+ return F.clamping_crop(img, x_min, y_min, x_max, y_max)
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, int]:
+ bbox = params[self.cropping_bbox_key]
+ h_max_shift = round((bbox[3] - bbox[1]) * self.max_part_shift[0])
+ w_max_shift = round((bbox[2] - bbox[0]) * self.max_part_shift[1])
+
+ x_min = bbox[0] - random.randint(-w_max_shift, w_max_shift)
+ x_max = bbox[2] + random.randint(-w_max_shift, w_max_shift)
+
+ y_min = bbox[1] - random.randint(-h_max_shift, h_max_shift)
+ y_max = bbox[3] + random.randint(-h_max_shift, h_max_shift)
+
+ x_min = max(0, x_min)
+ y_min = max(0, y_min)
+
+ return {"x_min": x_min, "x_max": x_max, "y_min": y_min, "y_max": y_max}
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ return F.bbox_crop(bbox, **params)
+
+ def apply_to_keypoint(
+ self,
+ keypoint: Tuple[float, float, float, float],
+ x_min: int = 0,
+ x_max: int = 0,
+ y_min: int = 0,
+ y_max: int = 0,
+ **params
+ ) -> Tuple[float, float, float, float]:
+ return F.crop_keypoint_by_coords(keypoint, crop_coords=(x_min, y_min, x_max, y_max))
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return [self.cropping_bbox_key]
+
+ def get_transform_init_args_names(self) -> Tuple[str]:
+ return ("max_part_shift",)
+
+
+class BBoxSafeRandomCrop(DualTransform):
+ """Crop a random part of the input without loss of bboxes.
+ Args:
+ erosion_rate (float): erosion rate applied on input image height before crop.
+ p (float): probability of applying the transform. Default: 1.
+ Targets:
+ image, mask, bboxes
+ Image types:
+ uint8, float32
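+
+    Example (illustrative sketch; the top-level ``custom_albumentations`` import and the pascal_voc demo boxes
+        are assumptions, not requirements):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.BBoxSafeRandomCrop(erosion_rate=0.2, p=1)],
+        >>>                 bbox_params=A.BboxParams(format="pascal_voc", label_fields=["labels"]))
+        >>> result = aug(image=image, bboxes=[(10, 10, 60, 60)], labels=[1])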
+ """
+
+ def __init__(self, erosion_rate=0.0, always_apply=False, p=1.0):
+ super(BBoxSafeRandomCrop, self).__init__(always_apply, p)
+ self.erosion_rate = erosion_rate
+
+ def apply(self, img, crop_height=0, crop_width=0, h_start=0, w_start=0, **params):
+ return F.random_crop(img, crop_height, crop_width, h_start, w_start)
+
+ def get_params_dependent_on_targets(self, params):
+ img_h, img_w = params["image"].shape[:2]
+ if len(params["bboxes"]) == 0: # less likely, this class is for use with bboxes.
+ erosive_h = int(img_h * (1.0 - self.erosion_rate))
+ crop_height = img_h if erosive_h >= img_h else random.randint(erosive_h, img_h)
+ return {
+ "h_start": random.random(),
+ "w_start": random.random(),
+ "crop_height": crop_height,
+ "crop_width": int(crop_height * img_w / img_h),
+ }
+ # get union of all bboxes
+ x, y, x2, y2 = union_of_bboxes(
+ width=img_w, height=img_h, bboxes=params["bboxes"], erosion_rate=self.erosion_rate
+ )
+ # find bigger region
+ bx, by = x * random.random(), y * random.random()
+ bx2, by2 = x2 + (1 - x2) * random.random(), y2 + (1 - y2) * random.random()
+ bw, bh = bx2 - bx, by2 - by
+ crop_height = img_h if bh >= 1.0 else int(img_h * bh)
+ crop_width = img_w if bw >= 1.0 else int(img_w * bw)
+ h_start = np.clip(0.0 if bh >= 1.0 else by / (1.0 - bh), 0.0, 1.0)
+ w_start = np.clip(0.0 if bw >= 1.0 else bx / (1.0 - bw), 0.0, 1.0)
+ return {"h_start": h_start, "w_start": w_start, "crop_height": crop_height, "crop_width": crop_width}
+
+ def apply_to_bbox(self, bbox, crop_height=0, crop_width=0, h_start=0, w_start=0, rows=0, cols=0, **params):
+ return F.bbox_random_crop(bbox, crop_height, crop_width, h_start, w_start, rows, cols)
+
+ @property
+ def targets_as_params(self):
+ return ["image", "bboxes"]
+
+ def get_transform_init_args_names(self):
+ return ("erosion_rate",)
+
+
+class RandomSizedBBoxSafeCrop(BBoxSafeRandomCrop):
+ """Crop a random part of the input and rescale it to some size without loss of bboxes.
+ Args:
+ height (int): height after crop and resize.
+ width (int): width after crop and resize.
+ erosion_rate (float): erosion rate applied on input image height before crop.
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 1.
+ Targets:
+ image, mask, bboxes
+ Image types:
+ uint8, float32
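+
+    Example (illustrative sketch; import path, box format, and values are demo assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.RandomSizedBBoxSafeCrop(height=64, width=64, erosion_rate=0.2, p=1)],
+        >>>                 bbox_params=A.BboxParams(format="pascal_voc", label_fields=["labels"]))
+        >>> result = aug(image=image, bboxes=[(10, 10, 60, 60)], labels=[1])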
+ """
+
+ def __init__(self, height, width, erosion_rate=0.0, interpolation=cv2.INTER_LINEAR, always_apply=False, p=1.0):
+ super(RandomSizedBBoxSafeCrop, self).__init__(erosion_rate, always_apply, p)
+ self.height = height
+ self.width = width
+ self.interpolation = interpolation
+
+ def apply(self, img, crop_height=0, crop_width=0, h_start=0, w_start=0, interpolation=cv2.INTER_LINEAR, **params):
+ crop = F.random_crop(img, crop_height, crop_width, h_start, w_start)
+ return FGeometric.resize(crop, self.height, self.width, interpolation)
+
+ def get_transform_init_args_names(self):
+ return super().get_transform_init_args_names() + ("height", "width", "interpolation")
+
+
+class CropAndPad(DualTransform):
+ """Crop and pad images by pixel amounts or fractions of image sizes.
+ Cropping removes pixels at the sides (i.e. extracts a subimage from a given full image).
+ Padding adds pixels to the sides (e.g. black pixels).
+ This transformation will never crop images below a height or width of ``1``.
+
+ Note:
+ This transformation automatically resizes images back to their original size. To deactivate this, add the
+ parameter ``keep_size=False``.
+
+ Args:
+ px (int or tuple):
+ The number of pixels to crop (negative values) or pad (positive values)
+ on each side of the image. Either this or the parameter `percent` may
+ be set, not both at the same time.
+ * If ``None``, then pixel-based cropping/padding will not be used.
+ * If ``int``, then that exact number of pixels will always be cropped/padded.
+ * If a ``tuple`` of two ``int`` s with values ``a`` and ``b``,
+ then each side will be cropped/padded by a random amount sampled
+ uniformly per image and side from the interval ``[a, b]``. If
+ however `sample_independently` is set to ``False``, only one
+ value will be sampled per image and used for all sides.
+ * If a ``tuple`` of four entries, then the entries represent top,
+ right, bottom, left. Each entry may be a single ``int`` (always
+ crop/pad by exactly that value), a ``tuple`` of two ``int`` s
+ ``a`` and ``b`` (crop/pad by an amount within ``[a, b]``), a
+ ``list`` of ``int`` s (crop/pad by a random value that is
+ contained in the ``list``).
+ percent (float or tuple):
+ The number of pixels to crop (negative values) or pad (positive values)
+ on each side of the image given as a *fraction* of the image
+ height/width. E.g. if this is set to ``-0.1``, the transformation will
+ always crop away ``10%`` of the image's height at both the top and the
+ bottom (both ``10%`` each), as well as ``10%`` of the width at the
+ right and left.
+ Expected value range is ``(-1.0, inf)``.
+ Either this or the parameter `px` may be set, not both
+ at the same time.
+ * If ``None``, then fraction-based cropping/padding will not be
+ used.
+ * If ``float``, then that fraction will always be cropped/padded.
+ * If a ``tuple`` of two ``float`` s with values ``a`` and ``b``,
+ then each side will be cropped/padded by a random fraction
+ sampled uniformly per image and side from the interval
+ ``[a, b]``. If however `sample_independently` is set to
+ ``False``, only one value will be sampled per image and used for
+ all sides.
+ * If a ``tuple`` of four entries, then the entries represent top,
+ right, bottom, left. Each entry may be a single ``float``
+ (always crop/pad by exactly that percent value), a ``tuple`` of
+ two ``float`` s ``a`` and ``b`` (crop/pad by a fraction from
+ ``[a, b]``), a ``list`` of ``float`` s (crop/pad by a random
+ value that is contained in the list).
+ pad_mode (int): OpenCV border mode.
+ pad_cval (number, Sequence[number]):
+ The constant value to use if the pad mode is ``BORDER_CONSTANT``.
+ * If ``number``, then that value will be used.
+ * If a ``tuple`` of two ``number`` s and at least one of them is
+ a ``float``, then a random number will be uniformly sampled per
+ image from the continuous interval ``[a, b]`` and used as the
+ value. If both ``number`` s are ``int`` s, the interval is
+ discrete.
+ * If a ``list`` of ``number``, then a random value will be chosen
+ from the elements of the ``list`` and used as the value.
+ pad_cval_mask (number, Sequence[number]): Same as pad_cval but only for masks.
+ keep_size (bool):
+ After cropping and padding, the result image will usually have a
+ different height/width compared to the original input image. If this
+ parameter is set to ``True``, then the cropped/padded image will be
+ resized to the input image's size, i.e. the output shape is always identical to the input shape.
+ sample_independently (bool):
+ If ``False`` *and* the values for `px`/`percent` result in exactly
+ *one* probability distribution for all image sides, only one single
+ value will be sampled from that probability distribution and used for
+ all sides. I.e. the crop/pad amount then is the same for all sides.
+ If ``True``, four values will be sampled independently, one per side.
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ any
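+
+    Example (illustrative sketch; the top-level ``custom_albumentations`` import and the demo values are assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> # crop or pad each side by up to 10% of the image size, then resize back (keep_size=True by default)
+        >>> aug = A.Compose([A.CropAndPad(percent=(-0.1, 0.1), p=1)])
+        >>> result = aug(image=image)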
+ """
+
+ def __init__(
+ self,
+ px: Optional[Union[int, Sequence[float], Sequence[Tuple]]] = None,
+ percent: Optional[Union[float, Sequence[float], Sequence[Tuple]]] = None,
+ pad_mode: int = cv2.BORDER_CONSTANT,
+ pad_cval: Union[float, Sequence[float]] = 0,
+ pad_cval_mask: Union[float, Sequence[float]] = 0,
+ keep_size: bool = True,
+ sample_independently: bool = True,
+ interpolation: int = cv2.INTER_LINEAR,
+ always_apply: bool = False,
+ p: float = 1.0,
+ ):
+ super().__init__(always_apply, p)
+
+ if px is None and percent is None:
+ raise ValueError("px and percent are empty!")
+ if px is not None and percent is not None:
+ raise ValueError("Only px or percent may be set!")
+
+ self.px = px
+ self.percent = percent
+
+ self.pad_mode = pad_mode
+ self.pad_cval = pad_cval
+ self.pad_cval_mask = pad_cval_mask
+
+ self.keep_size = keep_size
+ self.sample_independently = sample_independently
+
+ self.interpolation = interpolation
+
+ def apply(
+ self,
+ img: np.ndarray,
+ crop_params: Sequence[int] = (),
+ pad_params: Sequence[int] = (),
+ pad_value: Union[int, float] = 0,
+ rows: int = 0,
+ cols: int = 0,
+ interpolation: int = cv2.INTER_LINEAR,
+ **params
+ ) -> np.ndarray:
+ return F.crop_and_pad(
+ img, crop_params, pad_params, pad_value, rows, cols, interpolation, self.pad_mode, self.keep_size
+ )
+
+ def apply_to_mask(
+ self,
+ img: np.ndarray,
+ crop_params: Optional[Sequence[int]] = None,
+ pad_params: Optional[Sequence[int]] = None,
+ pad_value_mask: Optional[float] = None,
+ rows: int = 0,
+ cols: int = 0,
+ interpolation: int = cv2.INTER_NEAREST,
+ **params
+ ) -> np.ndarray:
+ return F.crop_and_pad(
+ img, crop_params, pad_params, pad_value_mask, rows, cols, interpolation, self.pad_mode, self.keep_size
+ )
+
+ def apply_to_bbox(
+ self,
+ bbox: BoxInternalType,
+ crop_params: Optional[Sequence[int]] = None,
+ pad_params: Optional[Sequence[int]] = None,
+ rows: int = 0,
+ cols: int = 0,
+ result_rows: int = 0,
+ result_cols: int = 0,
+ **params
+ ) -> BoxInternalType:
+ return F.crop_and_pad_bbox(bbox, crop_params, pad_params, rows, cols, result_rows, result_cols)
+
+ def apply_to_keypoint(
+ self,
+ keypoint: KeypointInternalType,
+ crop_params: Optional[Sequence[int]] = None,
+ pad_params: Optional[Sequence[int]] = None,
+ rows: int = 0,
+ cols: int = 0,
+ result_rows: int = 0,
+ result_cols: int = 0,
+ **params
+ ) -> KeypointInternalType:
+ return F.crop_and_pad_keypoint(
+ keypoint, crop_params, pad_params, rows, cols, result_rows, result_cols, self.keep_size
+ )
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return ["image"]
+
+ @staticmethod
+ def __prevent_zero(val1: int, val2: int, max_val: int) -> Tuple[int, int]:
+ regain = abs(max_val) + 1
+ regain1 = regain // 2
+ regain2 = regain // 2
+ if regain1 + regain2 < regain:
+ regain1 += 1
+
+ if regain1 > val1:
+ diff = regain1 - val1
+ regain1 = val1
+ regain2 += diff
+ elif regain2 > val2:
+ diff = regain2 - val2
+ regain2 = val2
+ regain1 += diff
+
+ val1 = val1 - regain1
+ val2 = val2 - regain2
+
+ return val1, val2
+
+ @staticmethod
+ def _prevent_zero(crop_params: List[int], height: int, width: int) -> Sequence[int]:
+ top, right, bottom, left = crop_params
+
+ remaining_height = height - (top + bottom)
+ remaining_width = width - (left + right)
+
+ if remaining_height < 1:
+ top, bottom = CropAndPad.__prevent_zero(top, bottom, height)
+ if remaining_width < 1:
+ left, right = CropAndPad.__prevent_zero(left, right, width)
+
+ return [max(top, 0), max(right, 0), max(bottom, 0), max(left, 0)]
+
+ def get_params_dependent_on_targets(self, params) -> dict:
+ height, width = params["image"].shape[:2]
+
+ if self.px is not None:
+ params = self._get_px_params()
+ else:
+ params = self._get_percent_params()
+ params[0] = int(params[0] * height)
+ params[1] = int(params[1] * width)
+ params[2] = int(params[2] * height)
+ params[3] = int(params[3] * width)
+
+ pad_params = [max(i, 0) for i in params]
+
+ crop_params = self._prevent_zero([-min(i, 0) for i in params], height, width)
+
+ top, right, bottom, left = crop_params
+ crop_params = [left, top, width - right, height - bottom]
+ result_rows = crop_params[3] - crop_params[1]
+ result_cols = crop_params[2] - crop_params[0]
+ if result_cols == width and result_rows == height:
+ crop_params = []
+
+ top, right, bottom, left = pad_params
+ pad_params = [top, bottom, left, right]
+ if any(pad_params):
+ result_rows += top + bottom
+ result_cols += left + right
+ else:
+ pad_params = []
+
+ return {
+ "crop_params": crop_params or None,
+ "pad_params": pad_params or None,
+ "pad_value": None if pad_params is None else self._get_pad_value(self.pad_cval),
+ "pad_value_mask": None if pad_params is None else self._get_pad_value(self.pad_cval_mask),
+ "result_rows": result_rows,
+ "result_cols": result_cols,
+ }
+
+ def _get_px_params(self) -> List[int]:
+ if self.px is None:
+ raise ValueError("px is not set")
+
+ if isinstance(self.px, int):
+ params = [self.px] * 4
+ elif len(self.px) == 2:
+ if self.sample_independently:
+ params = [random.randrange(*self.px) for _ in range(4)]
+ else:
+ px = random.randrange(*self.px)
+ params = [px] * 4
+ else:
+ params = [i if isinstance(i, int) else random.randrange(*i) for i in self.px] # type: ignore
+
+ return params # [top, right, bottom, left]
+
+ def _get_percent_params(self) -> List[float]:
+ if self.percent is None:
+ raise ValueError("percent is not set")
+
+ if isinstance(self.percent, float):
+ params = [self.percent] * 4
+ elif len(self.percent) == 2:
+ if self.sample_independently:
+ params = [random.uniform(*self.percent) for _ in range(4)]
+ else:
+ px = random.uniform(*self.percent)
+ params = [px] * 4
+ else:
+ params = [i if isinstance(i, (int, float)) else random.uniform(*i) for i in self.percent]
+
+ return params # params = [top, right, bottom, left]
+
+ @staticmethod
+ def _get_pad_value(pad_value: Union[float, Sequence[float]]) -> Union[int, float]:
+ if isinstance(pad_value, (int, float)):
+ return pad_value
+
+ if len(pad_value) == 2:
+ a, b = pad_value
+ if isinstance(a, int) and isinstance(b, int):
+ return random.randint(a, b)
+
+ return random.uniform(a, b)
+
+ return random.choice(pad_value)
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return (
+ "px",
+ "percent",
+ "pad_mode",
+ "pad_cval",
+ "pad_cval_mask",
+ "keep_size",
+ "sample_independently",
+ "interpolation",
+ )
+
+
+class RandomCropFromBorders(DualTransform):
+ """Crop bbox from image randomly cut parts from borders without resize at the end
+
+ Args:
+ crop_left (float): single float value in (0.0, 1.0) range. Default 0.1. Image will be randomly cut
+ from left side in range [0, crop_left * width)
+ crop_right (float): single float value in (0.0, 1.0) range. Default 0.1. Image will be randomly cut
+ from right side in range [(1 - crop_right) * width, width)
+        crop_top (float): single float value in (0.0, 1.0) range. Default 0.1. Image will be randomly cut
+ from top side in range [0, crop_top * height)
+ crop_bottom (float): single float value in (0.0, 1.0) range. Default 0.1. Image will be randomly cut
+ from bottom side in range [(1 - crop_bottom) * height, height)
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
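+
+    Example (illustrative sketch; import path and values are demo assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.RandomCropFromBorders(crop_left=0.2, crop_right=0.2, crop_top=0.2, crop_bottom=0.2, p=1)])
+        >>> result = aug(image=image)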
+ """
+
+ def __init__(
+ self,
+ crop_left=0.1,
+ crop_right=0.1,
+ crop_top=0.1,
+ crop_bottom=0.1,
+ always_apply=False,
+ p=1.0,
+ ):
+ super(RandomCropFromBorders, self).__init__(always_apply, p)
+ self.crop_left = crop_left
+ self.crop_right = crop_right
+ self.crop_top = crop_top
+ self.crop_bottom = crop_bottom
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ x_min = random.randint(0, int(self.crop_left * img.shape[1]))
+ x_max = random.randint(max(x_min + 1, int((1 - self.crop_right) * img.shape[1])), img.shape[1])
+ y_min = random.randint(0, int(self.crop_top * img.shape[0]))
+ y_max = random.randint(max(y_min + 1, int((1 - self.crop_bottom) * img.shape[0])), img.shape[0])
+ return {"x_min": x_min, "x_max": x_max, "y_min": y_min, "y_max": y_max}
+
+ def apply(self, img, x_min=0, x_max=0, y_min=0, y_max=0, **params):
+ return F.clamping_crop(img, x_min, y_min, x_max, y_max)
+
+ def apply_to_mask(self, mask, x_min=0, x_max=0, y_min=0, y_max=0, **params):
+ return F.clamping_crop(mask, x_min, y_min, x_max, y_max)
+
+ def apply_to_bbox(self, bbox, x_min=0, x_max=0, y_min=0, y_max=0, **params):
+ rows, cols = params["rows"], params["cols"]
+ return F.bbox_crop(bbox, x_min, y_min, x_max, y_max, rows, cols)
+
+ def apply_to_keypoint(self, keypoint, x_min=0, x_max=0, y_min=0, y_max=0, **params):
+ return F.crop_keypoint_by_coords(keypoint, crop_coords=(x_min, y_min, x_max, y_max))
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self):
+ return "crop_left", "crop_right", "crop_top", "crop_bottom"
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/domain_adaptation.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/domain_adaptation.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdde7d3bcec83830cc1135a8a6e9babe4e08b8ab
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/domain_adaptation.py
@@ -0,0 +1,337 @@
+import random
+from typing import Any, Callable, Literal, Sequence, Tuple
+
+import cv2
+import numpy as np
+from custom_qudida import DomainAdapter
+from skimage.exposure import match_histograms
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+
+from custom_albumentations.augmentations.utils import (
+ clipped,
+ get_opencv_dtype_from_numpy,
+ is_grayscale_image,
+ is_multispectral_image,
+ preserve_shape,
+ read_rgb_image,
+)
+
+from ..core.transforms_interface import ImageOnlyTransform, ScaleFloatType, to_tuple
+
+__all__ = [
+ "HistogramMatching",
+ "FDA",
+ "PixelDistributionAdaptation",
+ "fourier_domain_adaptation",
+ "apply_histogram",
+ "adapt_pixel_distribution",
+]
+
+
+@clipped
+@preserve_shape
+def fourier_domain_adaptation(img: np.ndarray, target_img: np.ndarray, beta: float) -> np.ndarray:
+ """
+ Fourier Domain Adaptation from https://github.com/YanchaoYang/FDA
+
+ Args:
+ img: source image
+ target_img: target image for domain adaptation
+ beta: coefficient from source paper
+
+ Returns:
+ transformed image
+
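+    Example (illustrative sketch; the module path below mirrors this file's location and is an assumption):
+        >>> import numpy as np
+        >>> from custom_albumentations.augmentations.domain_adaptation import fourier_domain_adaptation
+        >>> src = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
+        >>> trg = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
+        >>> adapted = fourier_domain_adaptation(src, trg, beta=0.05)
+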
+ """
+
+ img = np.squeeze(img)
+ target_img = np.squeeze(target_img)
+
+ if target_img.shape != img.shape:
+ raise ValueError(
+ "The source and target images must have the same shape,"
+ " but got {} and {} respectively.".format(img.shape, target_img.shape)
+ )
+
+ # get fft of both source and target
+ fft_src = np.fft.fft2(img.astype(np.float32), axes=(0, 1))
+ fft_trg = np.fft.fft2(target_img.astype(np.float32), axes=(0, 1))
+
+ # extract amplitude and phase of both fft-s
+ amplitude_src, phase_src = np.abs(fft_src), np.angle(fft_src)
+ amplitude_trg = np.abs(fft_trg)
+
+ # mutate the amplitude part of source with target
+ amplitude_src = np.fft.fftshift(amplitude_src, axes=(0, 1))
+ amplitude_trg = np.fft.fftshift(amplitude_trg, axes=(0, 1))
+ height, width = amplitude_src.shape[:2]
+ border = np.floor(min(height, width) * beta).astype(int)
+ center_y, center_x = np.floor([height / 2.0, width / 2.0]).astype(int)
+
+ y1, y2 = center_y - border, center_y + border + 1
+ x1, x2 = center_x - border, center_x + border + 1
+
+ amplitude_src[y1:y2, x1:x2] = amplitude_trg[y1:y2, x1:x2]
+ amplitude_src = np.fft.ifftshift(amplitude_src, axes=(0, 1))
+
+ # get mutated image
+ src_image_transformed = np.fft.ifft2(amplitude_src * np.exp(1j * phase_src), axes=(0, 1))
+ src_image_transformed = np.real(src_image_transformed)
+
+ return src_image_transformed
+
+
+@preserve_shape
+def apply_histogram(img: np.ndarray, reference_image: np.ndarray, blend_ratio: float) -> np.ndarray:
+ if img.dtype != reference_image.dtype:
+ raise RuntimeError(
+ f"Dtype of image and reference image must be the same. Got {img.dtype} and {reference_image.dtype}"
+ )
+ if img.shape[:2] != reference_image.shape[:2]:
+ reference_image = cv2.resize(reference_image, dsize=(img.shape[1], img.shape[0]))
+
+ img, reference_image = np.squeeze(img), np.squeeze(reference_image)
+
+ try:
+ matched = match_histograms(img, reference_image, channel_axis=2 if len(img.shape) == 3 else None)
+ except TypeError:
+ matched = match_histograms(img, reference_image, multichannel=True) # case for scikit-image<0.19.1
+ img = cv2.addWeighted(
+ matched,
+ blend_ratio,
+ img,
+ 1 - blend_ratio,
+ 0,
+ dtype=get_opencv_dtype_from_numpy(img.dtype),
+ )
+ return img
+
+
+@preserve_shape
+def adapt_pixel_distribution(
+ img: np.ndarray, ref: np.ndarray, transform_type: str = "pca", weight: float = 0.5
+) -> np.ndarray:
+ initial_type = img.dtype
+ transformer = {"pca": PCA, "standard": StandardScaler, "minmax": MinMaxScaler}[transform_type]()
+ adapter = DomainAdapter(transformer=transformer, ref_img=ref)
+ result = adapter(img).astype("float32")
+ blended = (img.astype("float32") * (1 - weight) + result * weight).astype(initial_type)
+ return blended
+
+
+class HistogramMatching(ImageOnlyTransform):
+ """
+ Apply histogram matching. It manipulates the pixels of an input image so that its histogram matches
+ the histogram of the reference image. If the images have multiple channels, the matching is done independently
+ for each channel, as long as the number of channels is equal in the input image and the reference.
+
+ Histogram matching can be used as a lightweight normalisation for image processing,
+ such as feature matching, especially in circumstances where the images have been taken from different
+ sources or in different conditions (i.e. lighting).
+
+ See:
+ https://scikit-image.org/docs/dev/auto_examples/color_exposure/plot_histogram_matching.html
+
+ Args:
+ reference_images (Sequence[Any]): Sequence of objects that will be converted to images by `read_fn`. By default,
+ it expects a sequence of paths to images.
+ blend_ratio (float, float): Tuple of min and max blend ratio. Matched image will be blended with original
+ with random blend factor for increased diversity of generated images.
+        read_fn (Callable): User-defined function to read an image. Function should get an element of `reference_images`
+ and return numpy array of image pixels. Default: takes as input a path to an image and returns a numpy array.
+ p (float): probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, uint16, float32
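+
+    Example (illustrative sketch; the in-memory reference image and ``read_fn=lambda x: x`` are demo assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> reference = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.HistogramMatching([reference], blend_ratio=(0.5, 1.0), read_fn=lambda x: x, p=1)])
+        >>> result = aug(image=image)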
+ """
+
+ def __init__(
+ self,
+ reference_images: Sequence[Any],
+ blend_ratio: Tuple[float, float] = (0.5, 1.0),
+ read_fn: Callable[[Any], np.ndarray] = read_rgb_image,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply=always_apply, p=p)
+ self.reference_images = reference_images
+ self.read_fn = read_fn
+ self.blend_ratio = blend_ratio
+
+ def apply(self, img, reference_image=None, blend_ratio=0.5, **params):
+ return apply_histogram(img, reference_image, blend_ratio)
+
+ def get_params(self):
+ return {
+ "reference_image": self.read_fn(random.choice(self.reference_images)),
+ "blend_ratio": random.uniform(self.blend_ratio[0], self.blend_ratio[1]),
+ }
+
+ def get_transform_init_args_names(self):
+ return ("reference_images", "blend_ratio", "read_fn")
+
+ def _to_dict(self):
+ raise NotImplementedError("HistogramMatching can not be serialized.")
+
+
+class FDA(ImageOnlyTransform):
+ """
+ Fourier Domain Adaptation from https://github.com/YanchaoYang/FDA
+ Simple "style transfer".
+
+ Args:
+ reference_images (Sequence[Any]): Sequence of objects that will be converted to images by `read_fn`. By default,
+ it expects a sequence of paths to images.
+        beta_limit (float or tuple of float): coefficient beta from the paper. Recommended values are below 0.3.
+        read_fn (Callable): User-defined function to read an image. Function should get an element of `reference_images`
+ and return numpy array of image pixels. Default: takes as input a path to an image and returns a numpy array.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+
+ Reference:
+ https://github.com/YanchaoYang/FDA
+ https://openaccess.thecvf.com/content_CVPR_2020/papers/Yang_FDA_Fourier_Domain_Adaptation_for_Semantic_Segmentation_CVPR_2020_paper.pdf
+
+ Example:
+ >>> import numpy as np
+        >>> import custom_albumentations as A
+ >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+ >>> target_image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+ >>> aug = A.Compose([A.FDA([target_image], p=1, read_fn=lambda x: x)])
+ >>> result = aug(image=image)
+
+ """
+
+ def __init__(
+ self,
+ reference_images: Sequence[Any],
+ beta_limit: ScaleFloatType = 0.1,
+ read_fn: Callable[[Any], np.ndarray] = read_rgb_image,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(FDA, self).__init__(always_apply=always_apply, p=p)
+ self.reference_images = reference_images
+ self.read_fn = read_fn
+ self.beta_limit = to_tuple(beta_limit, low=0)
+
+ def apply(self, img, target_image=None, beta=0.1, **params):
+ return fourier_domain_adaptation(img=img, target_img=target_image, beta=beta)
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ target_img = self.read_fn(random.choice(self.reference_images))
+ target_img = cv2.resize(target_img, dsize=(img.shape[1], img.shape[0]))
+
+ return {"target_image": target_img}
+
+ def get_params(self):
+ return {"beta": random.uniform(self.beta_limit[0], self.beta_limit[1])}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self):
+ return ("reference_images", "beta_limit", "read_fn")
+
+ def _to_dict(self):
+ raise NotImplementedError("FDA can not be serialized.")
+
+
+class PixelDistributionAdaptation(ImageOnlyTransform):
+ """
+    Another naive and quick pixel-level domain adaptation. It fits a simple transform (such as PCA, StandardScaler,
+    or MinMaxScaler) on both the original and the reference image, transforms the original image with the transform
+    fitted on it, and then applies the inverse transformation using the transform fitted on the reference image.
+
+ Args:
+ reference_images (Sequence[Any]): Sequence of objects that will be converted to images by `read_fn`. By default,
+ it expects a sequence of paths to images.
+ blend_ratio (float, float): Tuple of min and max blend ratio. Matched image will be blended with original
+ with random blend factor for increased diversity of generated images.
+        read_fn (Callable): User-defined function to read an image. Function should get an element of `reference_images`
+ and return numpy array of image pixels. Default: takes as input a path to an image and returns a numpy array.
+ transform_type (str): type of transform; "pca", "standard", "minmax" are allowed.
+ p (float): probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+
+ See also: https://github.com/arsenyinfo/qudida
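+
+    Example (illustrative sketch; the in-memory reference image and ``read_fn=lambda x: x`` are demo assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> reference = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.PixelDistributionAdaptation([reference], read_fn=lambda x: x, transform_type="pca", p=1)])
+        >>> result = aug(image=image)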
+ """
+
+ def __init__(
+ self,
+ reference_images: Sequence[Any],
+ blend_ratio: Tuple[float, float] = (0.25, 1.0),
+ read_fn: Callable[[Any], np.ndarray] = read_rgb_image,
+ transform_type: Literal["pca", "standard", "minmax"] = "pca",
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply=always_apply, p=p)
+ self.reference_images = reference_images
+ self.read_fn = read_fn
+ self.blend_ratio = blend_ratio
+ expected_transformers = ("pca", "standard", "minmax")
+ if transform_type not in expected_transformers:
+ raise ValueError(f"Got unexpected transform_type {transform_type}. Expected one of {expected_transformers}")
+ self.transform_type = transform_type
+
+ @staticmethod
+ def _validate_shape(img: np.ndarray):
+ if is_grayscale_image(img) or is_multispectral_image(img):
+ raise ValueError(
+ f"Unexpected image shape: expected 3 dimensions, got {len(img.shape)}."
+ f"Is it a grayscale or multispectral image? It's not supported for now."
+ )
+
+ def ensure_uint8(self, img: np.ndarray) -> Tuple[np.ndarray, bool]:
+ if img.dtype == np.float32:
+ if img.min() < 0 or img.max() > 1:
+ message = (
+ "PixelDistributionAdaptation uses uint8 under the hood, so float32 should be converted,"
+ "Can not do it automatically when the image is out of [0..1] range."
+ )
+ raise TypeError(message)
+ return (img * 255).astype("uint8"), True
+ return img, False
+
+ def apply(self, img, reference_image, blend_ratio, **params):
+ self._validate_shape(img)
+ reference_image, _ = self.ensure_uint8(reference_image)
+ img, needs_reconvert = self.ensure_uint8(img)
+
+ adapted = adapt_pixel_distribution(
+ img=img,
+ ref=reference_image,
+ weight=blend_ratio,
+ transform_type=self.transform_type,
+ )
+ if needs_reconvert:
+ adapted = adapted.astype("float32") * (1 / 255)
+ return adapted
+
+ def get_params(self):
+ return {
+ "reference_image": self.read_fn(random.choice(self.reference_images)),
+ "blend_ratio": random.uniform(self.blend_ratio[0], self.blend_ratio[1]),
+ }
+
+ def get_transform_init_args_names(self):
+ return ("reference_images", "blend_ratio", "read_fn", "transform_type")
+
+ def _to_dict(self):
+ raise NotImplementedError("PixelDistributionAdaptation can not be serialized.")
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3a9725f2ca36fb3cc9dd6a9e6d8a53d5c3759d8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/__init__.py
@@ -0,0 +1,5 @@
+from .channel_dropout import *
+from .coarse_dropout import *
+from .cutout import *
+from .grid_dropout import *
+from .mask_dropout import *
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/channel_dropout.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/channel_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..98e439747f8437409dfe2dcf21510f936fd2c5b9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/channel_dropout.py
@@ -0,0 +1,72 @@
+import random
+from typing import Any, Mapping, Tuple, Union
+
+import numpy as np
+
+from custom_albumentations.core.transforms_interface import ImageOnlyTransform
+
+from .functional import channel_dropout
+
+__all__ = ["ChannelDropout"]
+
+
+class ChannelDropout(ImageOnlyTransform):
+ """Randomly Drop Channels in the input Image.
+
+ Args:
+ channel_drop_range (int, int): range from which we choose the number of channels to drop.
+ fill_value (int, float): pixel value for the dropped channel.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+        uint8, uint16, uint32, float32
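+
+    Example (illustrative sketch; import path and values are demo assumptions; the image must have more
+        channels than the maximum number of channels dropped):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.ChannelDropout(channel_drop_range=(1, 2), fill_value=0, p=1)])
+        >>> result = aug(image=image)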
+ """
+
+ def __init__(
+ self,
+ channel_drop_range: Tuple[int, int] = (1, 1),
+ fill_value: Union[int, float] = 0,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(ChannelDropout, self).__init__(always_apply, p)
+
+ self.channel_drop_range = channel_drop_range
+
+ self.min_channels = channel_drop_range[0]
+ self.max_channels = channel_drop_range[1]
+
+ if not 1 <= self.min_channels <= self.max_channels:
+ raise ValueError("Invalid channel_drop_range. Got: {}".format(channel_drop_range))
+
+ self.fill_value = fill_value
+
+ def apply(self, img: np.ndarray, channels_to_drop: Tuple[int, ...] = (0,), **params) -> np.ndarray:
+ return channel_dropout(img, channels_to_drop, self.fill_value)
+
+ def get_params_dependent_on_targets(self, params: Mapping[str, Any]):
+ img = params["image"]
+
+ num_channels = img.shape[-1]
+
+ if len(img.shape) == 2 or num_channels == 1:
+ raise NotImplementedError("Images has one channel. ChannelDropout is not defined.")
+
+ if self.max_channels >= num_channels:
+ raise ValueError("Can not drop all channels in ChannelDropout.")
+
+ num_drop_channels = random.randint(self.min_channels, self.max_channels)
+
+ channels_to_drop = random.sample(range(num_channels), k=num_drop_channels)
+
+ return {"channels_to_drop": channels_to_drop}
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return "channel_drop_range", "fill_value"
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/coarse_dropout.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/coarse_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..a01a95e0aad8682400e9e7963d760c10de19382e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/coarse_dropout.py
@@ -0,0 +1,187 @@
+import random
+from typing import Iterable, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+
+from ...core.transforms_interface import DualTransform, KeypointType
+from .functional import cutout
+
+__all__ = ["CoarseDropout"]
+
+
+class CoarseDropout(DualTransform):
+ """CoarseDropout of the rectangular regions in the image.
+
+ Args:
+ max_holes (int): Maximum number of regions to zero out.
+ max_height (int, float): Maximum height of the hole.
+ If float, it is calculated as a fraction of the image height.
+ max_width (int, float): Maximum width of the hole.
+ If float, it is calculated as a fraction of the image width.
+        min_holes (int): Minimum number of regions to zero out. If `None`,
+            `min_holes` is set to `max_holes`. Default: `None`.
+        min_height (int, float): Minimum height of the hole. If `None`,
+            `min_height` is set to `max_height`. Default: `None`.
+            If float, it is calculated as a fraction of the image height.
+        min_width (int, float): Minimum width of the hole. If `None`,
+            `min_width` is set to `max_width`. Default: `None`.
+            If float, it is calculated as a fraction of the image width.
+
+ fill_value (int, float, list of int, list of float): value for dropped pixels.
+ mask_fill_value (int, float, list of int, list of float): fill value for dropped pixels
+ in mask. If `None` - mask is not affected. Default: `None`.
+
+ Targets:
+ image, mask, keypoints
+
+ Image types:
+ uint8, float32
+
+ Reference:
+ | https://arxiv.org/abs/1708.04552
+ | https://github.com/uoguelph-mlrg/Cutout/blob/master/util/cutout.py
+ | https://github.com/aleju/imgaug/blob/master/imgaug/augmenters/arithmetic.py
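+
+    Example (illustrative sketch; import path and hole sizes are demo assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.CoarseDropout(max_holes=8, max_height=8, max_width=8, fill_value=0, p=1)])
+        >>> result = aug(image=image)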
+ """
+
+ def __init__(
+ self,
+ max_holes: int = 8,
+ max_height: int = 8,
+ max_width: int = 8,
+ min_holes: Optional[int] = None,
+ min_height: Optional[int] = None,
+ min_width: Optional[int] = None,
+ fill_value: int = 0,
+ mask_fill_value: Optional[int] = None,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(CoarseDropout, self).__init__(always_apply, p)
+ self.max_holes = max_holes
+ self.max_height = max_height
+ self.max_width = max_width
+ self.min_holes = min_holes if min_holes is not None else max_holes
+ self.min_height = min_height if min_height is not None else max_height
+ self.min_width = min_width if min_width is not None else max_width
+ self.fill_value = fill_value
+ self.mask_fill_value = mask_fill_value
+ if not 0 < self.min_holes <= self.max_holes:
+ raise ValueError("Invalid combination of min_holes and max_holes. Got: {}".format([min_holes, max_holes]))
+
+ self.check_range(self.max_height)
+ self.check_range(self.min_height)
+ self.check_range(self.max_width)
+ self.check_range(self.min_width)
+
+ if not 0 < self.min_height <= self.max_height:
+ raise ValueError(
+ "Invalid combination of min_height and max_height. Got: {}".format([min_height, max_height])
+ )
+ if not 0 < self.min_width <= self.max_width:
+ raise ValueError("Invalid combination of min_width and max_width. Got: {}".format([min_width, max_width]))
+
+ def check_range(self, dimension):
+ if isinstance(dimension, float) and not 0 <= dimension < 1.0:
+ raise ValueError(
+ "Invalid value {}. If using floats, the value should be in the range [0.0, 1.0)".format(dimension)
+ )
+
+ def apply(
+ self,
+ img: np.ndarray,
+ fill_value: Union[int, float] = 0,
+ holes: Iterable[Tuple[int, int, int, int]] = (),
+ **params
+ ) -> np.ndarray:
+ return cutout(img, holes, fill_value)
+
+ def apply_to_mask(
+ self,
+ img: np.ndarray,
+ mask_fill_value: Union[int, float] = 0,
+ holes: Iterable[Tuple[int, int, int, int]] = (),
+ **params
+ ) -> np.ndarray:
+ if mask_fill_value is None:
+ return img
+ return cutout(img, holes, mask_fill_value)
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ height, width = img.shape[:2]
+
+ holes = []
+ for _n in range(random.randint(self.min_holes, self.max_holes)):
+ if all(
+ [
+ isinstance(self.min_height, int),
+ isinstance(self.min_width, int),
+ isinstance(self.max_height, int),
+ isinstance(self.max_width, int),
+ ]
+ ):
+ hole_height = random.randint(self.min_height, self.max_height)
+ hole_width = random.randint(self.min_width, self.max_width)
+ elif all(
+ [
+ isinstance(self.min_height, float),
+ isinstance(self.min_width, float),
+ isinstance(self.max_height, float),
+ isinstance(self.max_width, float),
+ ]
+ ):
+ hole_height = int(height * random.uniform(self.min_height, self.max_height))
+ hole_width = int(width * random.uniform(self.min_width, self.max_width))
+ else:
+ raise ValueError(
+ "Min width, max width, \
+ min height and max height \
+ should all either be ints or floats. \
+ Got: {} respectively".format(
+ [
+ type(self.min_width),
+ type(self.max_width),
+ type(self.min_height),
+ type(self.max_height),
+ ]
+ )
+ )
+
+ y1 = random.randint(0, height - hole_height)
+ x1 = random.randint(0, width - hole_width)
+ y2 = y1 + hole_height
+ x2 = x1 + hole_width
+ holes.append((x1, y1, x2, y2))
+
+ return {"holes": holes}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def _keypoint_in_hole(self, keypoint: KeypointType, hole: Tuple[int, int, int, int]) -> bool:
+ x1, y1, x2, y2 = hole
+ x, y = keypoint[:2]
+ return x1 <= x < x2 and y1 <= y < y2
+
+ def apply_to_keypoints(
+ self, keypoints: Sequence[KeypointType], holes: Iterable[Tuple[int, int, int, int]] = (), **params
+ ) -> List[KeypointType]:
+ result = set(keypoints)
+ for hole in holes:
+ for kp in keypoints:
+ if self._keypoint_in_hole(kp, hole):
+ result.discard(kp)
+ return list(result)
+
+ def get_transform_init_args_names(self):
+ return (
+ "max_holes",
+ "max_height",
+ "max_width",
+ "min_holes",
+ "min_height",
+ "min_width",
+ "fill_value",
+ "mask_fill_value",
+ )
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/cutout.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/cutout.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea4ae0854ee5cf5c573f4dc5d7f5f9158a073c66
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/cutout.py
@@ -0,0 +1,79 @@
+import random
+import warnings
+from typing import Any, Dict, Tuple, Union
+
+import numpy as np
+
+from custom_albumentations.core.transforms_interface import ImageOnlyTransform
+
+from .functional import cutout
+
+__all__ = ["Cutout"]
+
+
+class Cutout(ImageOnlyTransform):
+ """CoarseDropout of the square regions in the image.
+
+ Args:
+ num_holes (int): number of regions to zero out
+ max_h_size (int): maximum height of the hole
+ max_w_size (int): maximum width of the hole
+ fill_value (int, float, list of int, list of float): value for dropped pixels.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+
+ Reference:
+ | https://arxiv.org/abs/1708.04552
+ | https://github.com/uoguelph-mlrg/Cutout/blob/master/util/cutout.py
+ | https://github.com/aleju/imgaug/blob/master/imgaug/augmenters/arithmetic.py
+ """
+
+ def __init__(
+ self,
+ num_holes: int = 8,
+ max_h_size: int = 8,
+ max_w_size: int = 8,
+ fill_value: Union[int, float] = 0,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(Cutout, self).__init__(always_apply, p)
+ self.num_holes = num_holes
+ self.max_h_size = max_h_size
+ self.max_w_size = max_w_size
+ self.fill_value = fill_value
+ warnings.warn(
+ f"{self.__class__.__name__} has been deprecated. Please use CoarseDropout",
+ FutureWarning,
+ )
+
+ def apply(self, img: np.ndarray, fill_value: Union[int, float] = 0, holes=(), **params):
+ return cutout(img, holes, fill_value)
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, Any]:
+ img = params["image"]
+ height, width = img.shape[:2]
+
+ holes = []
+ for _n in range(self.num_holes):
+ y = random.randint(0, height)
+ x = random.randint(0, width)
+
+ y1 = np.clip(y - self.max_h_size // 2, 0, height)
+ y2 = np.clip(y1 + self.max_h_size, 0, height)
+ x1 = np.clip(x - self.max_w_size // 2, 0, width)
+ x2 = np.clip(x1 + self.max_w_size, 0, width)
+ holes.append((x1, y1, x2, y2))
+
+ return {"holes": holes}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return ("num_holes", "max_h_size", "max_w_size")
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/functional.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..e485fa3d9d478dc6e39b5b66915b30e5b3682304
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/functional.py
@@ -0,0 +1,29 @@
+from typing import Iterable, List, Tuple, Union
+
+import numpy as np
+
+from custom_albumentations.augmentations.utils import preserve_shape
+
+__all__ = ["cutout", "channel_dropout"]
+
+
+@preserve_shape
+def channel_dropout(
+ img: np.ndarray, channels_to_drop: Union[int, Tuple[int, ...], np.ndarray], fill_value: Union[int, float] = 0
+) -> np.ndarray:
+ if len(img.shape) == 2 or img.shape[2] == 1:
+ raise NotImplementedError("Only one channel. ChannelDropout is not defined.")
+
+ img = img.copy()
+ img[..., channels_to_drop] = fill_value
+ return img
+
+
+def cutout(
+ img: np.ndarray, holes: Iterable[Tuple[int, int, int, int]], fill_value: Union[int, float] = 0
+) -> np.ndarray:
+ # Make a copy of the input image since we don't want to modify it directly
+ img = img.copy()
+ for x1, y1, x2, y2 in holes:
+ img[y1:y2, x1:x2] = fill_value
+ return img
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/grid_dropout.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/grid_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab21038d73d313729e6028c8d0374c4616d7a271
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/grid_dropout.py
@@ -0,0 +1,155 @@
+import random
+from typing import Iterable, Optional, Tuple
+
+import numpy as np
+
+from ...core.transforms_interface import DualTransform
+from . import functional as F
+
+__all__ = ["GridDropout"]
+
+
+class GridDropout(DualTransform):
+ """GridDropout, drops out rectangular regions of an image and the corresponding mask in a grid fashion.
+
+ Args:
+ ratio (float): the ratio of the mask holes to the unit_size (same for horizontal and vertical directions).
+ Must be between 0 and 1. Default: 0.5.
+        unit_size_min (int): minimum size of the grid unit. Must be between 2 and the image's shorter edge.
+            If 'None', holes_number_x and holes_number_y are used to set up the grid. Default: `None`.
+        unit_size_max (int): maximum size of the grid unit. Must be between 2 and the image's shorter edge.
+            If 'None', holes_number_x and holes_number_y are used to set up the grid. Default: `None`.
+ holes_number_x (int): the number of grid units in x direction. Must be between 1 and image width//2.
+ If 'None', grid unit width is set as image_width//10. Default: `None`.
+ holes_number_y (int): the number of grid units in y direction. Must be between 1 and image height//2.
+ If `None`, grid unit height is set equal to the grid unit width or image height, whatever is smaller.
+        shift_x (int): offset of the grid start in the x direction from the (0,0) coordinate.
+            Clipped between 0 and grid unit_width - hole_width. Default: 0.
+        shift_y (int): offset of the grid start in the y direction from the (0,0) coordinate.
+            Clipped between 0 and grid unit height - hole_height. Default: 0.
+        random_offset (boolean): whether to offset the grid randomly between 0 and grid unit size - hole size.
+            If 'True', the entered shift_x and shift_y are ignored and set randomly. Default: `False`.
+ fill_value (int): value for the dropped pixels. Default = 0
+ mask_fill_value (int): value for the dropped pixels in mask.
+ If `None`, transformation is not applied to the mask. Default: `None`.
+
+ Targets:
+ image, mask
+
+ Image types:
+ uint8, float32
+
+ References:
+ https://arxiv.org/abs/2001.04086
+
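+    Example (illustrative sketch; import path and grid sizes are demo assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> aug = A.Compose([A.GridDropout(ratio=0.5, unit_size_min=10, unit_size_max=20, random_offset=True, p=1)])
+        >>> result = aug(image=image)
+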
+ """
+
+ def __init__(
+ self,
+ ratio: float = 0.5,
+ unit_size_min: Optional[int] = None,
+ unit_size_max: Optional[int] = None,
+ holes_number_x: Optional[int] = None,
+ holes_number_y: Optional[int] = None,
+ shift_x: int = 0,
+ shift_y: int = 0,
+ random_offset: bool = False,
+ fill_value: int = 0,
+ mask_fill_value: Optional[int] = None,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(GridDropout, self).__init__(always_apply, p)
+ self.ratio = ratio
+ self.unit_size_min = unit_size_min
+ self.unit_size_max = unit_size_max
+ self.holes_number_x = holes_number_x
+ self.holes_number_y = holes_number_y
+ self.shift_x = shift_x
+ self.shift_y = shift_y
+ self.random_offset = random_offset
+ self.fill_value = fill_value
+ self.mask_fill_value = mask_fill_value
+ if not 0 < self.ratio <= 1:
+ raise ValueError("ratio must be between 0 and 1.")
+
+ def apply(self, img: np.ndarray, holes: Iterable[Tuple[int, int, int, int]] = (), **params) -> np.ndarray:
+ return F.cutout(img, holes, self.fill_value)
+
+ def apply_to_mask(self, img: np.ndarray, holes: Iterable[Tuple[int, int, int, int]] = (), **params) -> np.ndarray:
+ if self.mask_fill_value is None:
+ return img
+
+ return F.cutout(img, holes, self.mask_fill_value)
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ height, width = img.shape[:2]
+ # set grid using unit size limits
+ if self.unit_size_min and self.unit_size_max:
+ if not 2 <= self.unit_size_min <= self.unit_size_max:
+ raise ValueError("Max unit size should be >= min size, both at least 2 pixels.")
+ if self.unit_size_max > min(height, width):
+ raise ValueError("Grid size limits must be within the shortest image edge.")
+ unit_width = random.randint(self.unit_size_min, self.unit_size_max + 1)
+ unit_height = unit_width
+ else:
+ # set grid using holes numbers
+ if self.holes_number_x is None:
+ unit_width = max(2, width // 10)
+ else:
+ if not 1 <= self.holes_number_x <= width // 2:
+ raise ValueError("The hole_number_x must be between 1 and image width//2.")
+ unit_width = width // self.holes_number_x
+ if self.holes_number_y is None:
+ unit_height = max(min(unit_width, height), 2)
+ else:
+ if not 1 <= self.holes_number_y <= height // 2:
+ raise ValueError("The hole_number_y must be between 1 and image height//2.")
+ unit_height = height // self.holes_number_y
+
+ hole_width = int(unit_width * self.ratio)
+ hole_height = int(unit_height * self.ratio)
+ # min 1 pixel and max unit length - 1
+ hole_width = min(max(hole_width, 1), unit_width - 1)
+ hole_height = min(max(hole_height, 1), unit_height - 1)
+ # set offset of the grid
+ if self.shift_x is None:
+ shift_x = 0
+ else:
+ shift_x = min(max(0, self.shift_x), unit_width - hole_width)
+ if self.shift_y is None:
+ shift_y = 0
+ else:
+ shift_y = min(max(0, self.shift_y), unit_height - hole_height)
+ if self.random_offset:
+ shift_x = random.randint(0, unit_width - hole_width)
+ shift_y = random.randint(0, unit_height - hole_height)
+ holes = []
+ for i in range(width // unit_width + 1):
+ for j in range(height // unit_height + 1):
+ x1 = min(shift_x + unit_width * i, width)
+ y1 = min(shift_y + unit_height * j, height)
+ x2 = min(x1 + hole_width, width)
+ y2 = min(y1 + hole_height, height)
+ holes.append((x1, y1, x2, y2))
+
+ return {"holes": holes}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self):
+ return (
+ "ratio",
+ "unit_size_min",
+ "unit_size_max",
+ "holes_number_x",
+ "holes_number_y",
+ "shift_x",
+ "shift_y",
+ "random_offset",
+ "fill_value",
+ "mask_fill_value",
+ )
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/mask_dropout.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/mask_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e9ec23de10434823217c30a296ba4170872bd58
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/dropout/mask_dropout.py
@@ -0,0 +1,99 @@
+import random
+from typing import Any, Dict, Optional, Tuple, Union
+
+import cv2
+import numpy as np
+from skimage.measure import label
+
+from ...core.transforms_interface import DualTransform, to_tuple
+
+__all__ = ["MaskDropout"]
+
+
+class MaskDropout(DualTransform):
+ """
+    Image & mask augmentation that zeroes out the mask and image regions corresponding
+    to a randomly chosen object instance from the mask.
+
+    The mask must be a single-channel image; zero values are treated as background.
+    The image can have any number of channels.
+
+ Inspired by https://www.kaggle.com/c/severstal-steel-defect-detection/discussion/114254
+
+ Args:
+        max_objects: Maximum number of labels that can be zeroed out. Can be a tuple, in which case it is [min, max].
+        image_fill_value: Fill value to use when filling the image.
+            Can be 'inpaint' to apply inpainting (works only for 3-channel images).
+        mask_fill_value: Fill value to use when filling the mask.
+
+ Targets:
+ image, mask
+
+ Image types:
+ uint8, float32
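+
+    Example (illustrative sketch; import path, mask contents, and fill values are demo assumptions):
+        >>> import numpy as np
+        >>> import custom_albumentations as A
+        >>> image = np.random.randint(0, 256, [100, 100, 3], dtype=np.uint8)
+        >>> mask = np.zeros((100, 100), dtype=np.uint8)
+        >>> mask[20:40, 20:40] = 1  # one object instance
+        >>> aug = A.Compose([A.MaskDropout(max_objects=1, image_fill_value=0, mask_fill_value=0, p=1)])
+        >>> result = aug(image=image, mask=mask)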
+ """
+
+ def __init__(
+ self,
+ max_objects: int = 1,
+ image_fill_value: Union[int, float, str] = 0,
+ mask_fill_value: Union[int, float] = 0,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(MaskDropout, self).__init__(always_apply, p)
+ self.max_objects = to_tuple(max_objects, 1)
+ self.image_fill_value = image_fill_value
+ self.mask_fill_value = mask_fill_value
+
+ @property
+ def targets_as_params(self):
+ return ["mask"]
+
+ def get_params_dependent_on_targets(self, params) -> Dict[str, Any]:
+ mask = params["mask"]
+
+ label_image, num_labels = label(mask, return_num=True)
+
+ if num_labels == 0:
+ dropout_mask = None
+ else:
+ objects_to_drop = random.randint(int(self.max_objects[0]), int(self.max_objects[1]))
+ objects_to_drop = min(num_labels, objects_to_drop)
+
+ if objects_to_drop == num_labels:
+ dropout_mask = mask > 0
+ else:
+ labels_index = random.sample(range(1, num_labels + 1), objects_to_drop)
+ dropout_mask = np.zeros((mask.shape[0], mask.shape[1]), dtype=bool)
+ for label_index in labels_index:
+ dropout_mask |= label_image == label_index
+
+ params.update({"dropout_mask": dropout_mask})
+ return params
+
+ def apply(self, img: np.ndarray, dropout_mask: Optional[np.ndarray] = None, **params) -> np.ndarray:
+ if dropout_mask is None:
+ return img
+
+ if self.image_fill_value == "inpaint":
+ dropout_mask = dropout_mask.astype(np.uint8)
+ _, _, w, h = cv2.boundingRect(dropout_mask)
+ radius = min(3, max(w, h) // 2)
+ img = cv2.inpaint(img, dropout_mask, radius, cv2.INPAINT_NS)
+ else:
+ img = img.copy()
+ img[dropout_mask] = self.image_fill_value
+
+ return img
+
+ def apply_to_mask(self, img: np.ndarray, dropout_mask: Optional[np.ndarray] = None, **params) -> np.ndarray:
+ if dropout_mask is None:
+ return img
+
+ img = img.copy()
+ img[dropout_mask] = self.mask_fill_value
+ return img
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return "max_objects", "image_fill_value", "mask_fill_value"
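+
+# Illustrative usage sketch, kept as a comment so importing this vendored module
+# stays side-effect free. Assumes a uint8 image and a single-channel instance mask:
+#
+#   aug = MaskDropout(max_objects=2, image_fill_value=0, mask_fill_value=0, p=1.0)
+#   out = aug(image=image, mask=mask)
+#   dropped_image, dropped_mask = out["image"], out["mask"]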
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/functional.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..1056b3c24a1581b3cdda4c7e6fb07d53da7b66e3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/functional.py
@@ -0,0 +1,1380 @@
+from __future__ import division
+
+from typing import Optional, Sequence, Union
+from warnings import warn
+
+import cv2
+import numpy as np
+import skimage
+
+from custom_albumentations import random_utils
+from custom_albumentations.augmentations.utils import (
+ MAX_VALUES_BY_DTYPE,
+ _maybe_process_in_chunks,
+ clip,
+ clipped,
+ ensure_contiguous,
+ is_grayscale_image,
+ is_rgb_image,
+ non_rgb_warning,
+ preserve_channel_dim,
+ preserve_shape,
+)
+
+__all__ = [
+ "add_fog",
+ "add_rain",
+ "add_shadow",
+ "add_gravel",
+ "add_snow",
+ "add_sun_flare",
+ "add_weighted",
+ "adjust_brightness_torchvision",
+ "adjust_contrast_torchvision",
+ "adjust_hue_torchvision",
+ "adjust_saturation_torchvision",
+ "brightness_contrast_adjust",
+ "channel_shuffle",
+ "clahe",
+ "convolve",
+ "downscale",
+ "equalize",
+ "fancy_pca",
+ "from_float",
+ "gamma_transform",
+ "gauss_noise",
+ "image_compression",
+ "invert",
+ "iso_noise",
+ "linear_transformation_rgb",
+ "move_tone_curve",
+ "multiply",
+ "noop",
+ "normalize",
+ "posterize",
+ "shift_hsv",
+ "shift_rgb",
+ "solarize",
+ "superpixels",
+ "swap_tiles_on_image",
+ "to_float",
+ "to_gray",
+ "gray_to_rgb",
+ "unsharp_mask",
+]
+
+
+def normalize_cv2(img, mean, denominator):
+ if mean.shape and len(mean) != 4 and mean.shape != img.shape:
+ mean = np.array(mean.tolist() + [0] * (4 - len(mean)), dtype=np.float64)
+ if not denominator.shape:
+ denominator = np.array([denominator.tolist()] * 4, dtype=np.float64)
+ elif len(denominator) != 4 and denominator.shape != img.shape:
+ denominator = np.array(denominator.tolist() + [1] * (4 - len(denominator)), dtype=np.float64)
+
+ img = np.ascontiguousarray(img.astype("float32"))
+ cv2.subtract(img, mean.astype(np.float64), img)
+ cv2.multiply(img, denominator.astype(np.float64), img)
+ return img
+
+
+def normalize_numpy(img, mean, denominator):
+ img = img.astype(np.float32)
+ img -= mean
+ img *= denominator
+ return img
+
+
+def normalize(img, mean, std, max_pixel_value=255.0):
+ mean = np.array(mean, dtype=np.float32)
+ mean *= max_pixel_value
+
+ std = np.array(std, dtype=np.float32)
+ std *= max_pixel_value
+
+ denominator = np.reciprocal(std, dtype=np.float32)
+
+ if img.ndim == 3 and img.shape[-1] == 3:
+ return normalize_cv2(img, mean, denominator)
+ return normalize_numpy(img, mean, denominator)
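+
+# Usage sketch (comment only): mean and std are given in the [0, 1] range and are
+# scaled by max_pixel_value internally, so for a uint8 RGB image the usual ImageNet
+# statistics would be passed as-is:
+#
+#   normalized = normalize(img, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))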
+
+
+def _shift_hsv_uint8(img, hue_shift, sat_shift, val_shift):
+ dtype = img.dtype
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
+ hue, sat, val = cv2.split(img)
+
+ if hue_shift != 0:
+ lut_hue = np.arange(0, 256, dtype=np.int16)
+ lut_hue = np.mod(lut_hue + hue_shift, 180).astype(dtype)
+ hue = cv2.LUT(hue, lut_hue)
+
+ if sat_shift != 0:
+ lut_sat = np.arange(0, 256, dtype=np.int16)
+ lut_sat = np.clip(lut_sat + sat_shift, 0, 255).astype(dtype)
+ sat = cv2.LUT(sat, lut_sat)
+
+ if val_shift != 0:
+ lut_val = np.arange(0, 256, dtype=np.int16)
+ lut_val = np.clip(lut_val + val_shift, 0, 255).astype(dtype)
+ val = cv2.LUT(val, lut_val)
+
+ img = cv2.merge((hue, sat, val)).astype(dtype)
+ img = cv2.cvtColor(img, cv2.COLOR_HSV2RGB)
+ return img
+
+
+def _shift_hsv_non_uint8(img, hue_shift, sat_shift, val_shift):
+ dtype = img.dtype
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
+ hue, sat, val = cv2.split(img)
+
+ if hue_shift != 0:
+ hue = cv2.add(hue, hue_shift)
+ hue = np.mod(hue, 360) # OpenCV fails with negative values
+
+ if sat_shift != 0:
+ sat = clip(cv2.add(sat, sat_shift), dtype, 1.0)
+
+ if val_shift != 0:
+ val = clip(cv2.add(val, val_shift), dtype, 1.0)
+
+ img = cv2.merge((hue, sat, val))
+ img = cv2.cvtColor(img, cv2.COLOR_HSV2RGB)
+ return img
+
+
+@preserve_shape
+def shift_hsv(img, hue_shift, sat_shift, val_shift):
+ if hue_shift == 0 and sat_shift == 0 and val_shift == 0:
+ return img
+
+ is_gray = is_grayscale_image(img)
+ if is_gray:
+ if hue_shift != 0 or sat_shift != 0:
+ hue_shift = 0
+ sat_shift = 0
+ warn(
+ "HueSaturationValue: hue_shift and sat_shift are not applicable to grayscale image. "
+ "Set them to 0 or use RGB image"
+ )
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+
+ if img.dtype == np.uint8:
+ img = _shift_hsv_uint8(img, hue_shift, sat_shift, val_shift)
+ else:
+ img = _shift_hsv_non_uint8(img, hue_shift, sat_shift, val_shift)
+
+ if is_gray:
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+
+ return img
+
+
+def solarize(img, threshold=128):
+    """Invert all pixel values at or above a threshold.
+
+    Args:
+        img (numpy.ndarray): The image to solarize.
+        threshold (int): All pixels at or above this grayscale level are inverted.
+
+ Returns:
+ numpy.ndarray: Solarized image.
+
+ """
+ dtype = img.dtype
+ max_val = MAX_VALUES_BY_DTYPE[dtype]
+
+ if dtype == np.dtype("uint8"):
+ lut = [(i if i < threshold else max_val - i) for i in range(max_val + 1)]
+
+ prev_shape = img.shape
+ img = cv2.LUT(img, np.array(lut, dtype=dtype))
+
+ if len(prev_shape) != len(img.shape):
+ img = np.expand_dims(img, -1)
+ return img
+
+ result_img = img.copy()
+ cond = img >= threshold
+ result_img[cond] = max_val - result_img[cond]
+ return result_img
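+
+# Usage sketch (comment only, assumes a uint8 image): values at or above the
+# threshold are inverted, the rest are left untouched.
+#
+#   img = np.full((2, 2), 200, dtype=np.uint8)
+#   solarize(img, threshold=128)  # every value becomes 255 - 200 = 55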
+
+
+@preserve_shape
+def posterize(img, bits):
+ """Reduce the number of bits for each color channel.
+
+ Args:
+ img (numpy.ndarray): image to posterize.
+ bits (int): number of high bits. Must be in range [0, 8]
+
+ Returns:
+ numpy.ndarray: Image with reduced color channels.
+
+ """
+ bits = np.uint8(bits)
+
+ if img.dtype != np.uint8:
+ raise TypeError("Image must have uint8 channel type")
+ if np.any((bits < 0) | (bits > 8)):
+ raise ValueError("bits must be in range [0, 8]")
+
+ if not bits.shape or len(bits) == 1:
+ if bits == 0:
+ return np.zeros_like(img)
+ if bits == 8:
+ return img.copy()
+
+ lut = np.arange(0, 256, dtype=np.uint8)
+ mask = ~np.uint8(2 ** (8 - bits) - 1)
+ lut &= mask
+
+ return cv2.LUT(img, lut)
+
+ if not is_rgb_image(img):
+ raise TypeError("If bits is iterable image must be RGB")
+
+ result_img = np.empty_like(img)
+ for i, channel_bits in enumerate(bits):
+ if channel_bits == 0:
+ result_img[..., i] = np.zeros_like(img[..., i])
+ elif channel_bits == 8:
+ result_img[..., i] = img[..., i].copy()
+ else:
+ lut = np.arange(0, 256, dtype=np.uint8)
+ mask = ~np.uint8(2 ** (8 - channel_bits) - 1)
+ lut &= mask
+
+ result_img[..., i] = cv2.LUT(img[..., i], lut)
+
+ return result_img
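+
+# Usage sketch (comment only): keeping only the 4 high bits of a uint8 image
+# quantizes values to multiples of 16.
+#
+#   img = np.array([[17, 31, 200]], dtype=np.uint8)
+#   posterize(img, bits=4)  # -> [[16, 16, 192]]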
+
+
+def _equalize_pil(img, mask=None):
+ histogram = cv2.calcHist([img], [0], mask, [256], (0, 256)).ravel()
+ h = [_f for _f in histogram if _f]
+
+ if len(h) <= 1:
+ return img.copy()
+
+ step = np.sum(h[:-1]) // 255
+ if not step:
+ return img.copy()
+
+ lut = np.empty(256, dtype=np.uint8)
+ n = step // 2
+ for i in range(256):
+ lut[i] = min(n // step, 255)
+ n += histogram[i]
+
+ return cv2.LUT(img, np.array(lut))
+
+
+def _equalize_cv(img, mask=None):
+ if mask is None:
+ return cv2.equalizeHist(img)
+
+ histogram = cv2.calcHist([img], [0], mask, [256], (0, 256)).ravel()
+ i = 0
+ for val in histogram:
+ if val > 0:
+ break
+ i += 1
+ i = min(i, 255)
+
+ total = np.sum(histogram)
+ if histogram[i] == total:
+ return np.full_like(img, i)
+
+ scale = 255.0 / (total - histogram[i])
+ _sum = 0
+
+ lut = np.zeros(256, dtype=np.uint8)
+ i += 1
+ for i in range(i, len(histogram)):
+ _sum += histogram[i]
+ lut[i] = clip(round(_sum * scale), np.dtype("uint8"), 255)
+
+ return cv2.LUT(img, lut)
+
+
+@preserve_channel_dim
+def equalize(img, mask=None, mode="cv", by_channels=True):
+ """Equalize the image histogram.
+
+ Args:
+ img (numpy.ndarray): RGB or grayscale image.
+        mask (numpy.ndarray): An optional mask. If given, only the pixels selected by
+            the mask are included in the analysis. May be a 1-channel or 3-channel array.
+ mode (str): {'cv', 'pil'}. Use OpenCV or Pillow equalization method.
+ by_channels (bool): If True, use equalization by channels separately,
+ else convert image to YCbCr representation and use equalization by `Y` channel.
+
+ Returns:
+ numpy.ndarray: Equalized image.
+
+ """
+ if img.dtype != np.uint8:
+ raise TypeError("Image must have uint8 channel type")
+
+ modes = ["cv", "pil"]
+
+ if mode not in modes:
+ raise ValueError("Unsupported equalization mode. Supports: {}. " "Got: {}".format(modes, mode))
+ if mask is not None:
+ if is_rgb_image(mask) and is_grayscale_image(img):
+ raise ValueError("Wrong mask shape. Image shape: {}. " "Mask shape: {}".format(img.shape, mask.shape))
+ if not by_channels and not is_grayscale_image(mask):
+ raise ValueError(
+                "When by_channels=False, only a 1-channel mask is supported. " "Mask shape: {}".format(mask.shape)
+ )
+
+ if mode == "pil":
+ function = _equalize_pil
+ else:
+ function = _equalize_cv
+
+ if mask is not None:
+ mask = mask.astype(np.uint8)
+
+ if is_grayscale_image(img):
+ return function(img, mask)
+
+ if not by_channels:
+ result_img = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
+ result_img[..., 0] = function(result_img[..., 0], mask)
+ return cv2.cvtColor(result_img, cv2.COLOR_YCrCb2RGB)
+
+ result_img = np.empty_like(img)
+ for i in range(3):
+ if mask is None:
+ _mask = None
+ elif is_grayscale_image(mask):
+ _mask = mask
+ else:
+ _mask = mask[..., i]
+
+ result_img[..., i] = function(img[..., i], _mask)
+
+ return result_img
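+
+# Usage sketch (comment only, assumes a uint8 RGB image): per-channel equalization
+# versus equalizing only the luma (Y) channel after a YCrCb conversion.
+#
+#   eq_per_channel = equalize(img, mode="cv", by_channels=True)
+#   eq_luma_only = equalize(img, mode="cv", by_channels=False)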
+
+
+@preserve_shape
+def move_tone_curve(img, low_y, high_y):
+ """Rescales the relationship between bright and dark areas of the image by manipulating its tone curve.
+
+ Args:
+ img (numpy.ndarray): RGB or grayscale image.
+ low_y (float): y-position of a Bezier control point used
+ to adjust the tone curve, must be in range [0, 1]
+ high_y (float): y-position of a Bezier control point used
+ to adjust image tone curve, must be in range [0, 1]
+ """
+ input_dtype = img.dtype
+
+    if low_y < 0 or low_y > 1:
+        raise ValueError("low_y must be in range [0, 1]")
+    if high_y < 0 or high_y > 1:
+        raise ValueError("high_y must be in range [0, 1]")
+
+ if input_dtype != np.uint8:
+ raise ValueError("Unsupported image type {}".format(input_dtype))
+
+ t = np.linspace(0.0, 1.0, 256)
+
+    # Defines the response of a four-point Bezier curve
+ def evaluate_bez(t):
+ return 3 * (1 - t) ** 2 * t * low_y + 3 * (1 - t) * t**2 * high_y + t**3
+
+ evaluate_bez = np.vectorize(evaluate_bez)
+ remapping = np.rint(evaluate_bez(t) * 255).astype(np.uint8)
+
+ lut_fn = _maybe_process_in_chunks(cv2.LUT, lut=remapping)
+ img = lut_fn(img)
+ return img
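+
+# Usage sketch (comment only, assumes a uint8 image): low_y and high_y are the
+# y-positions of the two inner Bezier control points, so low_y < 1/3 and high_y > 2/3
+# gives an S-shaped curve that darkens shadows and brightens highlights.
+#
+#   contrasty = move_tone_curve(img, low_y=0.1, high_y=0.9)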
+
+
+@clipped
+def _shift_rgb_non_uint8(img, r_shift, g_shift, b_shift):
+ if r_shift == g_shift == b_shift:
+ return img + r_shift
+
+ result_img = np.empty_like(img)
+ shifts = [r_shift, g_shift, b_shift]
+ for i, shift in enumerate(shifts):
+ result_img[..., i] = img[..., i] + shift
+
+ return result_img
+
+
+def _shift_image_uint8(img, value):
+ max_value = MAX_VALUES_BY_DTYPE[img.dtype]
+
+ lut = np.arange(0, max_value + 1).astype("float32")
+ lut += value
+
+ lut = np.clip(lut, 0, max_value).astype(img.dtype)
+ return cv2.LUT(img, lut)
+
+
+@preserve_shape
+def _shift_rgb_uint8(img, r_shift, g_shift, b_shift):
+ if r_shift == g_shift == b_shift:
+ h, w, c = img.shape
+ img = img.reshape([h, w * c])
+
+ return _shift_image_uint8(img, r_shift)
+
+ result_img = np.empty_like(img)
+ shifts = [r_shift, g_shift, b_shift]
+ for i, shift in enumerate(shifts):
+ result_img[..., i] = _shift_image_uint8(img[..., i], shift)
+
+ return result_img
+
+
+def shift_rgb(img, r_shift, g_shift, b_shift):
+ if img.dtype == np.uint8:
+ return _shift_rgb_uint8(img, r_shift, g_shift, b_shift)
+
+ return _shift_rgb_non_uint8(img, r_shift, g_shift, b_shift)
+
+
+@clipped
+def linear_transformation_rgb(img, transformation_matrix):
+ result_img = cv2.transform(img, transformation_matrix)
+
+ return result_img
+
+
+@preserve_channel_dim
+def clahe(img, clip_limit=2.0, tile_grid_size=(8, 8)):
+ if img.dtype != np.uint8:
+ raise TypeError("clahe supports only uint8 inputs")
+
+ clahe_mat = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
+
+ if len(img.shape) == 2 or img.shape[2] == 1:
+ img = clahe_mat.apply(img)
+ else:
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
+ img[:, :, 0] = clahe_mat.apply(img[:, :, 0])
+ img = cv2.cvtColor(img, cv2.COLOR_LAB2RGB)
+
+ return img
+
+
+@preserve_shape
+def convolve(img, kernel):
+ conv_fn = _maybe_process_in_chunks(cv2.filter2D, ddepth=-1, kernel=kernel)
+ return conv_fn(img)
+
+
+@preserve_shape
+def image_compression(img, quality, image_type):
+ if image_type in [".jpeg", ".jpg"]:
+ quality_flag = cv2.IMWRITE_JPEG_QUALITY
+ elif image_type == ".webp":
+ quality_flag = cv2.IMWRITE_WEBP_QUALITY
+ else:
+        raise NotImplementedError("Only '.jpg' and '.webp' compression transforms are implemented.")
+
+ input_dtype = img.dtype
+ needs_float = False
+
+ if input_dtype == np.float32:
+ warn(
+ "Image compression augmentation "
+ "is most effective with uint8 inputs, "
+ "{} is used as input.".format(input_dtype),
+ UserWarning,
+ )
+ img = from_float(img, dtype=np.dtype("uint8"))
+ needs_float = True
+ elif input_dtype not in (np.uint8, np.float32):
+ raise ValueError("Unexpected dtype {} for image augmentation".format(input_dtype))
+
+ _, encoded_img = cv2.imencode(image_type, img, (int(quality_flag), quality))
+ img = cv2.imdecode(encoded_img, cv2.IMREAD_UNCHANGED)
+
+ if needs_float:
+ img = to_float(img, max_value=255)
+ return img
+
+
+@preserve_shape
+def add_snow(img, snow_point, brightness_coeff):
+    """Bleaches out pixels, imitating snow.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ img (numpy.ndarray): Image.
+        snow_point: Number of snow points.
+ brightness_coeff: Brightness coefficient.
+
+ Returns:
+ numpy.ndarray: Image.
+
+ """
+ non_rgb_warning(img)
+
+ input_dtype = img.dtype
+ needs_float = False
+
+ snow_point *= 127.5 # = 255 / 2
+ snow_point += 85 # = 255 / 3
+
+ if input_dtype == np.float32:
+ img = from_float(img, dtype=np.dtype("uint8"))
+ needs_float = True
+ elif input_dtype not in (np.uint8, np.float32):
+ raise ValueError("Unexpected dtype {} for RandomSnow augmentation".format(input_dtype))
+
+ image_HLS = cv2.cvtColor(img, cv2.COLOR_RGB2HLS)
+ image_HLS = np.array(image_HLS, dtype=np.float32)
+
+ image_HLS[:, :, 1][image_HLS[:, :, 1] < snow_point] *= brightness_coeff
+
+ image_HLS[:, :, 1] = clip(image_HLS[:, :, 1], np.uint8, 255)
+
+ image_HLS = np.array(image_HLS, dtype=np.uint8)
+
+ image_RGB = cv2.cvtColor(image_HLS, cv2.COLOR_HLS2RGB)
+
+ if needs_float:
+ image_RGB = to_float(image_RGB, max_value=255)
+
+ return image_RGB
+
+
+@preserve_shape
+def add_rain(
+ img,
+ slant,
+ drop_length,
+ drop_width,
+ drop_color,
+ blur_value,
+ brightness_coefficient,
+ rain_drops,
+):
+ """
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ img (numpy.ndarray): Image.
+ slant (int):
+ drop_length:
+ drop_width:
+ drop_color:
+        blur_value (int): Rainy views are blurry.
+ brightness_coefficient (float): Rainy days are usually shady.
+ rain_drops:
+
+ Returns:
+ numpy.ndarray: Image.
+
+ """
+ non_rgb_warning(img)
+
+ input_dtype = img.dtype
+ needs_float = False
+
+ if input_dtype == np.float32:
+ img = from_float(img, dtype=np.dtype("uint8"))
+ needs_float = True
+ elif input_dtype not in (np.uint8, np.float32):
+ raise ValueError("Unexpected dtype {} for RandomRain augmentation".format(input_dtype))
+
+ image = img.copy()
+
+ for rain_drop_x0, rain_drop_y0 in rain_drops:
+ rain_drop_x1 = rain_drop_x0 + slant
+ rain_drop_y1 = rain_drop_y0 + drop_length
+
+ cv2.line(
+ image,
+ (rain_drop_x0, rain_drop_y0),
+ (rain_drop_x1, rain_drop_y1),
+ drop_color,
+ drop_width,
+ )
+
+    image = cv2.blur(image, (blur_value, blur_value))  # rainy views are blurry
+ image_hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV).astype(np.float32)
+ image_hsv[:, :, 2] *= brightness_coefficient
+
+ image_rgb = cv2.cvtColor(image_hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)
+
+ if needs_float:
+ image_rgb = to_float(image_rgb, max_value=255)
+
+ return image_rgb
+
+
+@preserve_shape
+def add_fog(img, fog_coef, alpha_coef, haze_list):
+ """Add fog to the image.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ img (numpy.ndarray): Image.
+ fog_coef (float): Fog coefficient.
+ alpha_coef (float): Alpha coefficient.
+ haze_list (list):
+
+ Returns:
+ numpy.ndarray: Image.
+
+ """
+ non_rgb_warning(img)
+
+ input_dtype = img.dtype
+ needs_float = False
+
+ if input_dtype == np.float32:
+ img = from_float(img, dtype=np.dtype("uint8"))
+ needs_float = True
+ elif input_dtype not in (np.uint8, np.float32):
+ raise ValueError("Unexpected dtype {} for RandomFog augmentation".format(input_dtype))
+
+ width = img.shape[1]
+
+ hw = max(int(width // 3 * fog_coef), 10)
+
+ for haze_points in haze_list:
+ x, y = haze_points
+ overlay = img.copy()
+ output = img.copy()
+ alpha = alpha_coef * fog_coef
+ rad = hw // 2
+ point = (x + hw // 2, y + hw // 2)
+ cv2.circle(overlay, point, int(rad), (255, 255, 255), -1)
+ cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
+
+ img = output.copy()
+
+ image_rgb = cv2.blur(img, (hw // 10, hw // 10))
+
+ if needs_float:
+ image_rgb = to_float(image_rgb, max_value=255)
+
+ return image_rgb
+
+
+@preserve_shape
+def add_sun_flare(img, flare_center_x, flare_center_y, src_radius, src_color, circles):
+ """Add sun flare.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ img (numpy.ndarray):
+ flare_center_x (float):
+ flare_center_y (float):
+ src_radius:
+ src_color (int, int, int):
+ circles (list):
+
+ Returns:
+ numpy.ndarray:
+
+ """
+ non_rgb_warning(img)
+
+ input_dtype = img.dtype
+ needs_float = False
+
+ if input_dtype == np.float32:
+ img = from_float(img, dtype=np.dtype("uint8"))
+ needs_float = True
+ elif input_dtype not in (np.uint8, np.float32):
+        raise ValueError("Unexpected dtype {} for RandomSunFlare augmentation".format(input_dtype))
+
+ overlay = img.copy()
+ output = img.copy()
+
+ for alpha, (x, y), rad3, (r_color, g_color, b_color) in circles:
+ cv2.circle(overlay, (x, y), rad3, (r_color, g_color, b_color), -1)
+
+ cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
+
+ point = (int(flare_center_x), int(flare_center_y))
+
+ overlay = output.copy()
+ num_times = src_radius // 10
+ alpha = np.linspace(0.0, 1, num=num_times)
+ rad = np.linspace(1, src_radius, num=num_times)
+ for i in range(num_times):
+ cv2.circle(overlay, point, int(rad[i]), src_color, -1)
+ alp = alpha[num_times - i - 1] * alpha[num_times - i - 1] * alpha[num_times - i - 1]
+ cv2.addWeighted(overlay, alp, output, 1 - alp, 0, output)
+
+ image_rgb = output
+
+ if needs_float:
+ image_rgb = to_float(image_rgb, max_value=255)
+
+ return image_rgb
+
+
+@ensure_contiguous
+@preserve_shape
+def add_shadow(img, vertices_list):
+ """Add shadows to the image.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ img (numpy.ndarray):
+ vertices_list (list):
+
+ Returns:
+ numpy.ndarray:
+
+ """
+ non_rgb_warning(img)
+ input_dtype = img.dtype
+ needs_float = False
+
+ if input_dtype == np.float32:
+ img = from_float(img, dtype=np.dtype("uint8"))
+ needs_float = True
+ elif input_dtype not in (np.uint8, np.float32):
+ raise ValueError("Unexpected dtype {} for RandomShadow augmentation".format(input_dtype))
+
+ image_hls = cv2.cvtColor(img, cv2.COLOR_RGB2HLS)
+ mask = np.zeros_like(img)
+
+ # adding all shadow polygons on empty mask, single 255 denotes only red channel
+ for vertices in vertices_list:
+ cv2.fillPoly(mask, vertices, 255)
+
+ # if red channel is hot, image's "Lightness" channel's brightness is lowered
+ red_max_value_ind = mask[:, :, 0] == 255
+ image_hls[:, :, 1][red_max_value_ind] = image_hls[:, :, 1][red_max_value_ind] * 0.5
+
+ image_rgb = cv2.cvtColor(image_hls, cv2.COLOR_HLS2RGB)
+
+ if needs_float:
+ image_rgb = to_float(image_rgb, max_value=255)
+
+ return image_rgb
+
+
+@ensure_contiguous
+@preserve_shape
+def add_gravel(img: np.ndarray, gravels: list):
+ """Add gravel to the image.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ img (numpy.ndarray): image to add gravel to
+ gravels (list): list of gravel parameters. (float, float, float, float):
+ (top-left x, top-left y, bottom-right x, bottom right y)
+
+ Returns:
+ numpy.ndarray:
+ """
+ non_rgb_warning(img)
+ input_dtype = img.dtype
+ needs_float = False
+
+ if input_dtype == np.float32:
+ img = from_float(img, dtype=np.dtype("uint8"))
+ needs_float = True
+ elif input_dtype not in (np.uint8, np.float32):
+ raise ValueError("Unexpected dtype {} for AddGravel augmentation".format(input_dtype))
+
+ image_hls = cv2.cvtColor(img, cv2.COLOR_RGB2HLS)
+
+ for gravel in gravels:
+ y1, y2, x1, x2, sat = gravel
+ image_hls[x1:x2, y1:y2, 1] = sat
+
+ image_rgb = cv2.cvtColor(image_hls, cv2.COLOR_HLS2RGB)
+
+ if needs_float:
+ image_rgb = to_float(image_rgb, max_value=255)
+
+ return image_rgb
+
+
+def invert(img: np.ndarray) -> np.ndarray:
+ # Supports all the valid dtypes
+ # clips the img to avoid unexpected behaviour.
+ return MAX_VALUES_BY_DTYPE[img.dtype] - img
+
+
+def channel_shuffle(img, channels_shuffled):
+ img = img[..., channels_shuffled]
+ return img
+
+
+@preserve_shape
+def gamma_transform(img, gamma):
+ if img.dtype == np.uint8:
+ table = (np.arange(0, 256.0 / 255, 1.0 / 255) ** gamma) * 255
+ img = cv2.LUT(img, table.astype(np.uint8))
+ else:
+ img = np.power(img, gamma)
+
+ return img
+
+
+@clipped
+def gauss_noise(image, gauss):
+ image = image.astype("float32")
+ return image + gauss
+
+
+@clipped
+def _brightness_contrast_adjust_non_uint(img, alpha=1, beta=0, beta_by_max=False):
+ dtype = img.dtype
+ img = img.astype("float32")
+
+ if alpha != 1:
+ img *= alpha
+ if beta != 0:
+ if beta_by_max:
+ max_value = MAX_VALUES_BY_DTYPE[dtype]
+ img += beta * max_value
+ else:
+ img += beta * np.mean(img)
+ return img
+
+
+@preserve_shape
+def _brightness_contrast_adjust_uint(img, alpha=1, beta=0, beta_by_max=False):
+ dtype = np.dtype("uint8")
+
+ max_value = MAX_VALUES_BY_DTYPE[dtype]
+
+ lut = np.arange(0, max_value + 1).astype("float32")
+
+ if alpha != 1:
+ lut *= alpha
+ if beta != 0:
+ if beta_by_max:
+ lut += beta * max_value
+ else:
+ lut += (alpha * beta) * np.mean(img)
+
+ lut = np.clip(lut, 0, max_value).astype(dtype)
+ img = cv2.LUT(img, lut)
+ return img
+
+
+def brightness_contrast_adjust(img, alpha=1, beta=0, beta_by_max=False):
+ if img.dtype == np.uint8:
+ return _brightness_contrast_adjust_uint(img, alpha, beta, beta_by_max)
+
+ return _brightness_contrast_adjust_non_uint(img, alpha, beta, beta_by_max)
+
+
+@clipped
+def iso_noise(image, color_shift=0.05, intensity=0.5, random_state=None, **kwargs):
+ """
+    Apply Poisson noise to the image to simulate camera sensor noise.
+
+    Args:
+        image (numpy.ndarray): Input image. Currently, only RGB uint8 images are supported.
+        color_shift (float):
+        intensity (float): Multiplication factor for noise values. Values of ~0.5 produce a noticeable,
+            yet acceptable level of noise.
+ random_state:
+ **kwargs:
+
+ Returns:
+ numpy.ndarray: Noised image
+
+ """
+ if image.dtype != np.uint8:
+ raise TypeError("Image must have uint8 channel type")
+ if not is_rgb_image(image):
+ raise TypeError("Image must be RGB")
+
+ one_over_255 = float(1.0 / 255.0)
+ image = np.multiply(image, one_over_255, dtype=np.float32)
+ hls = cv2.cvtColor(image, cv2.COLOR_RGB2HLS)
+ _, stddev = cv2.meanStdDev(hls)
+
+ luminance_noise = random_utils.poisson(stddev[1] * intensity * 255, size=hls.shape[:2], random_state=random_state)
+ color_noise = random_utils.normal(0, color_shift * 360 * intensity, size=hls.shape[:2], random_state=random_state)
+
+ hue = hls[..., 0]
+ hue += color_noise
+ hue[hue < 0] += 360
+ hue[hue > 360] -= 360
+
+ luminance = hls[..., 1]
+ luminance += (luminance_noise / 255) * (1.0 - luminance)
+
+ image = cv2.cvtColor(hls, cv2.COLOR_HLS2RGB) * 255
+ return image.astype(np.uint8)
+
+
+def to_gray(img):
+ gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+ return cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
+
+
+def gray_to_rgb(img):
+ return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+
+
+@preserve_shape
+def downscale(img, scale, down_interpolation=cv2.INTER_AREA, up_interpolation=cv2.INTER_LINEAR):
+ h, w = img.shape[:2]
+
+ need_cast = (
+ up_interpolation != cv2.INTER_NEAREST or down_interpolation != cv2.INTER_NEAREST
+ ) and img.dtype == np.uint8
+ if need_cast:
+ img = to_float(img)
+ downscaled = cv2.resize(img, None, fx=scale, fy=scale, interpolation=down_interpolation)
+ upscaled = cv2.resize(downscaled, (w, h), interpolation=up_interpolation)
+ if need_cast:
+ upscaled = from_float(np.clip(upscaled, 0, 1), dtype=np.dtype("uint8"))
+ return upscaled
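+
+# Usage sketch (comment only): scale < 1 downsamples and then resizes back, so the
+# output keeps the input shape while fine detail is lost.
+#
+#   low_detail = downscale(img, scale=0.25)  # same height/width, quarter-resolution content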
+
+
+def to_float(img, max_value=None):
+ if max_value is None:
+ try:
+ max_value = MAX_VALUES_BY_DTYPE[img.dtype]
+ except KeyError:
+ raise RuntimeError(
+ "Can't infer the maximum value for dtype {}. You need to specify the maximum value manually by "
+ "passing the max_value argument".format(img.dtype)
+ )
+ return img.astype("float32") / max_value
+
+
+def from_float(img, dtype, max_value=None):
+ if max_value is None:
+ try:
+ max_value = MAX_VALUES_BY_DTYPE[dtype]
+ except KeyError:
+ raise RuntimeError(
+ "Can't infer the maximum value for dtype {}. You need to specify the maximum value manually by "
+ "passing the max_value argument".format(dtype)
+ )
+ return (img * max_value).astype(dtype)
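+
+# Round-trip sketch (comment only): to_float and from_float are approximately
+# inverse. Note that from_float truncates rather than rounds, so an exact
+# round-trip is not guaranteed for every value.
+#
+#   f = to_float(img_uint8)  # float32 in [0, 1]
+#   restored = from_float(f, dtype=np.dtype("uint8"))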
+
+
+def noop(input_obj, **params): # skipcq: PYL-W0613
+ return input_obj
+
+
+def swap_tiles_on_image(image, tiles):
+ """
+ Swap tiles on image.
+
+ Args:
+ image (np.ndarray): Input image.
+ tiles (np.ndarray): array of tuples(
+ current_left_up_corner_row, current_left_up_corner_col,
+ old_left_up_corner_row, old_left_up_corner_col,
+ height_tile, width_tile)
+
+ Returns:
+ np.ndarray: Output image.
+
+ """
+ new_image = image.copy()
+
+ for tile in tiles:
+ new_image[tile[0] : tile[0] + tile[4], tile[1] : tile[1] + tile[5]] = image[
+ tile[2] : tile[2] + tile[4], tile[3] : tile[3] + tile[5]
+ ]
+
+ return new_image
+
+
+@clipped
+def _multiply_uint8(img, multiplier):
+ img = img.astype(np.float32)
+ return np.multiply(img, multiplier)
+
+
+@preserve_shape
+def _multiply_uint8_optimized(img, multiplier):
+ if is_grayscale_image(img) or len(multiplier) == 1:
+ multiplier = multiplier[0]
+ lut = np.arange(0, 256, dtype=np.float32)
+ lut *= multiplier
+ lut = clip(lut, np.uint8, MAX_VALUES_BY_DTYPE[img.dtype])
+ func = _maybe_process_in_chunks(cv2.LUT, lut=lut)
+ return func(img)
+
+ channels = img.shape[-1]
+ lut = [np.arange(0, 256, dtype=np.float32)] * channels
+ lut = np.stack(lut, axis=-1)
+
+ lut *= multiplier
+ lut = clip(lut, np.uint8, MAX_VALUES_BY_DTYPE[img.dtype])
+
+ images = []
+ for i in range(channels):
+ func = _maybe_process_in_chunks(cv2.LUT, lut=lut[:, i])
+ images.append(func(img[:, :, i]))
+ return np.stack(images, axis=-1)
+
+
+@clipped
+def _multiply_non_uint8(img, multiplier):
+ return img * multiplier
+
+
+def multiply(img, multiplier):
+ """
+ Args:
+ img (numpy.ndarray): Image.
+ multiplier (numpy.ndarray): Multiplier coefficient.
+
+ Returns:
+ numpy.ndarray: Image multiplied by `multiplier` coefficient.
+
+ """
+ if img.dtype == np.uint8:
+ if len(multiplier.shape) == 1:
+ return _multiply_uint8_optimized(img, multiplier)
+
+ return _multiply_uint8(img, multiplier)
+
+ return _multiply_non_uint8(img, multiplier)
+
+
+def bbox_from_mask(mask):
+ """Create bounding box from binary mask (fast version)
+
+ Args:
+ mask (numpy.ndarray): binary mask.
+
+ Returns:
+ tuple: A bounding box tuple `(x_min, y_min, x_max, y_max)`.
+
+ """
+ rows = np.any(mask, axis=1)
+ if not rows.any():
+ return -1, -1, -1, -1
+ cols = np.any(mask, axis=0)
+ y_min, y_max = np.where(rows)[0][[0, -1]]
+ x_min, x_max = np.where(cols)[0][[0, -1]]
+ return x_min, y_min, x_max + 1, y_max + 1
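+
+# Usage sketch (comment only): for a non-empty mask the returned box is half-open on
+# the max side, so it can be used directly for slicing.
+#
+#   x_min, y_min, x_max, y_max = bbox_from_mask(mask)
+#   crop = image[y_min:y_max, x_min:x_max]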
+
+
+def mask_from_bbox(img, bbox):
+ """Create binary mask from bounding box
+
+ Args:
+ img (numpy.ndarray): input image
+ bbox: A bounding box tuple `(x_min, y_min, x_max, y_max)`
+
+ Returns:
+ mask (numpy.ndarray): binary mask
+
+ """
+
+ mask = np.zeros(img.shape[:2], dtype=np.uint8)
+ x_min, y_min, x_max, y_max = bbox
+ mask[y_min:y_max, x_min:x_max] = 1
+ return mask
+
+
+def fancy_pca(img, alpha=0.1):
+ """Perform 'Fancy PCA' augmentation from:
+ http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
+
+ Args:
+        img (numpy.ndarray): numpy array with (h, w, rgb) shape, as ints between 0-255
+        alpha (float): how much to perturb/scale the eigenvectors and eigenvalues;
+            the paper used std=0.1
+
+    Returns:
+        numpy.ndarray: numpy image-like array as uint8 in range [0, 255]
+
+ """
+ if not is_rgb_image(img) or img.dtype != np.uint8:
+ raise TypeError("Image must be RGB image in uint8 format.")
+
+ orig_img = img.astype(float).copy()
+
+ img = img / 255.0 # rescale to 0 to 1 range
+
+ # flatten image to columns of RGB
+ img_rs = img.reshape(-1, 3)
+ # img_rs shape (640000, 3)
+
+ # center mean
+ img_centered = img_rs - np.mean(img_rs, axis=0)
+
+ # paper says 3x3 covariance matrix
+ img_cov = np.cov(img_centered, rowvar=False)
+
+ # eigen values and eigen vectors
+ eig_vals, eig_vecs = np.linalg.eigh(img_cov)
+
+ # sort values and vector
+ sort_perm = eig_vals[::-1].argsort()
+ eig_vals[::-1].sort()
+ eig_vecs = eig_vecs[:, sort_perm]
+
+ # get [p1, p2, p3]
+ m1 = np.column_stack((eig_vecs))
+
+ # get 3x1 matrix of eigen values multiplied by random variable draw from normal
+ # distribution with mean of 0 and standard deviation of 0.1
+ m2 = np.zeros((3, 1))
+    # according to the paper alpha should only be drawn once per augmentation (not once per channel)
+ # alpha = np.random.normal(0, alpha_std)
+
+    # broadcast to speed things up
+ m2[:, 0] = alpha * eig_vals[:]
+
+ # this is the vector that we're going to add to each pixel in a moment
+ add_vect = np.matrix(m1) * np.matrix(m2)
+
+ for idx in range(3): # RGB
+ orig_img[..., idx] += add_vect[idx] * 255
+
+ # for image processing it was found that working with float 0.0 to 1.0
+ # was easier than integers between 0-255
+ # orig_img /= 255.0
+ orig_img = np.clip(orig_img, 0.0, 255.0)
+
+ # orig_img *= 255
+ orig_img = orig_img.astype(np.uint8)
+
+ return orig_img
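+
+# Usage sketch (comment only, assumes a uint8 RGB image): alpha plays the role of the
+# single per-augmentation draw from N(0, 0.1) in the AlexNet paper, so callers sample
+# it once and pass the sampled value in.
+#
+#   alpha = np.random.normal(0, 0.1)
+#   jittered = fancy_pca(img, alpha=alpha)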
+
+
+def _adjust_brightness_torchvision_uint8(img, factor):
+ lut = np.arange(0, 256) * factor
+ lut = np.clip(lut, 0, 255).astype(np.uint8)
+ return cv2.LUT(img, lut)
+
+
+@preserve_shape
+def adjust_brightness_torchvision(img, factor):
+ if factor == 0:
+ return np.zeros_like(img)
+ elif factor == 1:
+ return img
+
+ if img.dtype == np.uint8:
+ return _adjust_brightness_torchvision_uint8(img, factor)
+
+ return clip(img * factor, img.dtype, MAX_VALUES_BY_DTYPE[img.dtype])
+
+
+def _adjust_contrast_torchvision_uint8(img, factor, mean):
+ lut = np.arange(0, 256) * factor
+ lut = lut + mean * (1 - factor)
+ lut = clip(lut, img.dtype, 255)
+
+ return cv2.LUT(img, lut)
+
+
+@preserve_shape
+def adjust_contrast_torchvision(img, factor):
+ if factor == 1:
+ return img
+
+ if is_grayscale_image(img):
+ mean = img.mean()
+ else:
+ mean = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY).mean()
+
+ if factor == 0:
+ if img.dtype != np.float32:
+ mean = int(mean + 0.5)
+ return np.full_like(img, mean, dtype=img.dtype)
+
+ if img.dtype == np.uint8:
+ return _adjust_contrast_torchvision_uint8(img, factor, mean)
+
+ return clip(
+ img.astype(np.float32) * factor + mean * (1 - factor),
+ img.dtype,
+ MAX_VALUES_BY_DTYPE[img.dtype],
+ )
+
+
+@preserve_shape
+def adjust_saturation_torchvision(img, factor, gamma=0):
+ if factor == 1:
+ return img
+
+ if is_grayscale_image(img):
+ gray = img
+ return gray
+ else:
+ gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+ gray = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
+
+ if factor == 0:
+ return gray
+
+ result = cv2.addWeighted(img, factor, gray, 1 - factor, gamma=gamma)
+ if img.dtype == np.uint8:
+ return result
+
+ # OpenCV does not clip values for float dtype
+ return clip(result, img.dtype, MAX_VALUES_BY_DTYPE[img.dtype])
+
+
+def _adjust_hue_torchvision_uint8(img, factor):
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
+
+ lut = np.arange(0, 256, dtype=np.int16)
+ lut = np.mod(lut + 180 * factor, 180).astype(np.uint8)
+ img[..., 0] = cv2.LUT(img[..., 0], lut)
+
+ return cv2.cvtColor(img, cv2.COLOR_HSV2RGB)
+
+
+def adjust_hue_torchvision(img, factor):
+ if is_grayscale_image(img):
+ return img
+
+ if factor == 0:
+ return img
+
+ if img.dtype == np.uint8:
+ return _adjust_hue_torchvision_uint8(img, factor)
+
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
+ img[..., 0] = np.mod(img[..., 0] + factor * 360, 360)
+ return cv2.cvtColor(img, cv2.COLOR_HSV2RGB)
+
+
+@preserve_shape
+def superpixels(
+ image: np.ndarray, n_segments: int, replace_samples: Sequence[bool], max_size: Optional[int], interpolation: int
+) -> np.ndarray:
+ if not np.any(replace_samples):
+ return image
+
+ orig_shape = image.shape
+ if max_size is not None:
+ size = max(image.shape[:2])
+ if size > max_size:
+ scale = max_size / size
+ height, width = image.shape[:2]
+ new_height, new_width = int(height * scale), int(width * scale)
+ resize_fn = _maybe_process_in_chunks(cv2.resize, dsize=(new_width, new_height), interpolation=interpolation)
+ image = resize_fn(image)
+
+ segments = skimage.segmentation.slic(
+ image, n_segments=n_segments, compactness=10, channel_axis=-1 if image.ndim > 2 else None
+ )
+
+ min_value = 0
+ max_value = MAX_VALUES_BY_DTYPE[image.dtype]
+ image = np.copy(image)
+ if image.ndim == 2:
+ image = image.reshape(*image.shape, 1)
+ nb_channels = image.shape[2]
+ for c in range(nb_channels):
+ # segments+1 here because otherwise regionprops always misses the last label
+ regions = skimage.measure.regionprops(segments + 1, intensity_image=image[..., c])
+ for ridx, region in enumerate(regions):
+            # with mod here, because slic can sometimes create more superpixels than requested.
+ # replace_samples then does not have enough values, so we just start over with the first one again.
+ if replace_samples[ridx % len(replace_samples)]:
+ mean_intensity = region.mean_intensity
+ image_sp_c = image[..., c]
+
+ if image_sp_c.dtype.kind in ["i", "u", "b"]:
+ # After rounding the value can end up slightly outside of the value_range. Hence, we need to clip.
+ # We do clip via min(max(...)) instead of np.clip because
+ # the latter one does not seem to keep dtypes for dtypes with large itemsizes (e.g. uint64).
+ value: Union[int, float]
+ value = int(np.round(mean_intensity))
+ value = min(max(value, min_value), max_value)
+ else:
+ value = mean_intensity
+
+ image_sp_c[segments == ridx] = value
+
+ if orig_shape != image.shape:
+ resize_fn = _maybe_process_in_chunks(
+ cv2.resize, dsize=(orig_shape[1], orig_shape[0]), interpolation=interpolation
+ )
+ image = resize_fn(image)
+
+ return image
+
+
+@clipped
+def add_weighted(img1, alpha, img2, beta):
+ return img1.astype(float) * alpha + img2.astype(float) * beta
+
+
+@clipped
+@preserve_shape
+def unsharp_mask(image: np.ndarray, ksize: int, sigma: float = 0.0, alpha: float = 0.2, threshold: int = 10):
+ blur_fn = _maybe_process_in_chunks(cv2.GaussianBlur, ksize=(ksize, ksize), sigmaX=sigma)
+
+ input_dtype = image.dtype
+ if input_dtype == np.uint8:
+ image = to_float(image)
+ elif input_dtype not in (np.uint8, np.float32):
+ raise ValueError("Unexpected dtype {} for UnsharpMask augmentation".format(input_dtype))
+
+ blur = blur_fn(image)
+ residual = image - blur
+
+ # Do not sharpen noise
+ mask = np.abs(residual) * 255 > threshold
+ mask = mask.astype("float32")
+
+ sharp = image + alpha * residual
+ # Avoid color noise artefacts.
+ sharp = np.clip(sharp, 0, 1)
+
+ soft_mask = blur_fn(mask)
+ output = soft_mask * sharp + (1 - soft_mask) * image
+ return from_float(output, dtype=input_dtype)
+
+
+@preserve_shape
+def pixel_dropout(image: np.ndarray, drop_mask: np.ndarray, drop_value: Union[float, Sequence[float]]) -> np.ndarray:
+ if isinstance(drop_value, (int, float)) and drop_value == 0:
+ drop_values = np.zeros_like(image)
+ else:
+ drop_values = np.full_like(image, drop_value) # type: ignore
+ return np.where(drop_mask, drop_values, image)
+
+
+@clipped
+@preserve_shape
+def spatter(
+ img: np.ndarray,
+ non_mud: Optional[np.ndarray],
+ mud: Optional[np.ndarray],
+ rain: Optional[np.ndarray],
+ mode: str,
+) -> np.ndarray:
+ non_rgb_warning(img)
+
+ coef = MAX_VALUES_BY_DTYPE[img.dtype]
+ img = img.astype(np.float32) * (1 / coef)
+
+ if mode == "rain":
+ assert rain is not None
+ img = img + rain
+ elif mode == "mud":
+ assert non_mud is not None and mud is not None
+ img = img * non_mud + mud
+ else:
+ raise ValueError("Unsupported spatter mode: " + str(mode))
+
+ return img * 255
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3109eb0d6c3d28d4dddbce4a7aa583a51b450808
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/__init__.py
@@ -0,0 +1,4 @@
+from .functional import *
+from .resize import *
+from .rotate import *
+from .transforms import *
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/functional.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..23d04f5e8b7d50823a7de566f66aa281655a0b8d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/functional.py
@@ -0,0 +1,1300 @@
+import math
+from typing import List, Optional, Sequence, Tuple, Union
+
+import cv2
+import numpy as np
+import skimage.transform
+from scipy.ndimage import gaussian_filter
+
+from custom_albumentations.augmentations.utils import (
+ _maybe_process_in_chunks,
+ angle_2pi_range,
+ clipped,
+ preserve_channel_dim,
+ preserve_shape,
+)
+
+from ... import random_utils
+from ...core.bbox_utils import denormalize_bbox, normalize_bbox
+from ...core.transforms_interface import (
+ BoxInternalType,
+ FillValueType,
+ ImageColorType,
+ KeypointInternalType,
+)
+
+__all__ = [
+ "optical_distortion",
+ "elastic_transform_approx",
+ "grid_distortion",
+ "pad",
+ "pad_with_params",
+ "bbox_rot90",
+ "keypoint_rot90",
+ "rotate",
+ "bbox_rotate",
+ "keypoint_rotate",
+ "shift_scale_rotate",
+ "keypoint_shift_scale_rotate",
+ "bbox_shift_scale_rotate",
+ "elastic_transform",
+ "resize",
+ "scale",
+ "keypoint_scale",
+ "py3round",
+ "_func_max_size",
+ "longest_max_size",
+ "smallest_max_size",
+ "perspective",
+ "perspective_bbox",
+ "rotation2DMatrixToEulerAngles",
+ "perspective_keypoint",
+ "_is_identity_matrix",
+ "warp_affine",
+ "keypoint_affine",
+ "bbox_affine",
+ "safe_rotate",
+ "bbox_safe_rotate",
+ "keypoint_safe_rotate",
+ "piecewise_affine",
+ "to_distance_maps",
+ "from_distance_maps",
+ "keypoint_piecewise_affine",
+ "bbox_piecewise_affine",
+ "bbox_flip",
+ "bbox_hflip",
+ "bbox_transpose",
+ "bbox_vflip",
+ "hflip",
+ "hflip_cv2",
+ "transpose",
+ "keypoint_flip",
+ "keypoint_hflip",
+ "keypoint_transpose",
+ "keypoint_vflip",
+]
+
+
+def bbox_rot90(bbox: BoxInternalType, factor: int, rows: int, cols: int) -> BoxInternalType: # skipcq: PYL-W0613
+ """Rotates a bounding box by 90 degrees CCW (see np.rot90)
+
+ Args:
+ bbox: A bounding box tuple (x_min, y_min, x_max, y_max).
+ factor: Number of CCW rotations. Must be in set {0, 1, 2, 3} See np.rot90.
+ rows: Image rows.
+ cols: Image cols.
+
+ Returns:
+ tuple: A bounding box tuple (x_min, y_min, x_max, y_max).
+
+ """
+ if factor not in {0, 1, 2, 3}:
+        raise ValueError("Parameter factor must be in set {0, 1, 2, 3}")
+ x_min, y_min, x_max, y_max = bbox[:4]
+ if factor == 1:
+ bbox = y_min, 1 - x_max, y_max, 1 - x_min
+ elif factor == 2:
+ bbox = 1 - x_max, 1 - y_max, 1 - x_min, 1 - y_min
+ elif factor == 3:
+ bbox = 1 - y_max, x_min, 1 - y_min, x_max
+ return bbox
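+
+# Worked example (comment only): boxes are in normalized [0, 1] coordinates, so one
+# CCW quarter-turn maps (x_min, y_min, x_max, y_max) to (y_min, 1 - x_max, y_max, 1 - x_min).
+#
+#   bbox_rot90((0.1, 0.2, 0.3, 0.4), factor=1, rows=100, cols=100)  # -> (0.2, 0.7, 0.4, 0.9)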
+
+
+@angle_2pi_range
+def keypoint_rot90(keypoint: KeypointInternalType, factor: int, rows: int, cols: int, **params) -> KeypointInternalType:
+ """Rotates a keypoint by 90 degrees CCW (see np.rot90)
+
+ Args:
+ keypoint: A keypoint `(x, y, angle, scale)`.
+        factor: Number of CCW rotations. Must be in set {0, 1, 2, 3}. See np.rot90.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ tuple: A keypoint `(x, y, angle, scale)`.
+
+ Raises:
+ ValueError: if factor not in set {0, 1, 2, 3}
+
+ """
+ x, y, angle, scale = keypoint[:4]
+
+ if factor not in {0, 1, 2, 3}:
+        raise ValueError("Parameter factor must be in set {0, 1, 2, 3}")
+
+ if factor == 1:
+ x, y, angle = y, (cols - 1) - x, angle - math.pi / 2
+ elif factor == 2:
+ x, y, angle = (cols - 1) - x, (rows - 1) - y, angle - math.pi
+ elif factor == 3:
+ x, y, angle = (rows - 1) - y, x, angle + math.pi / 2
+
+ return x, y, angle, scale
+
+
+@preserve_channel_dim
+def rotate(
+ img: np.ndarray,
+ angle: float,
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+):
+ height, width = img.shape[:2]
+ # for images we use additional shifts of (0.5, 0.5) as otherwise
+ # we get an ugly black border for 90deg rotations
+ matrix = cv2.getRotationMatrix2D((width / 2 - 0.5, height / 2 - 0.5), angle, 1.0)
+
+ warp_fn = _maybe_process_in_chunks(
+ cv2.warpAffine, M=matrix, dsize=(width, height), flags=interpolation, borderMode=border_mode, borderValue=value
+ )
+ return warp_fn(img)
+
+
+def bbox_rotate(bbox: BoxInternalType, angle: float, method: str, rows: int, cols: int) -> BoxInternalType:
+ """Rotates a bounding box by angle degrees.
+
+ Args:
+ bbox: A bounding box `(x_min, y_min, x_max, y_max)`.
+ angle: Angle of rotation in degrees.
+ method: Rotation method used. Should be one of: "largest_box", "ellipse". Default: "largest_box".
+ rows: Image rows.
+ cols: Image cols.
+
+ Returns:
+ A bounding box `(x_min, y_min, x_max, y_max)`.
+
+ References:
+ https://arxiv.org/abs/2109.13488
+
+ """
+ x_min, y_min, x_max, y_max = bbox[:4]
+ scale = cols / float(rows)
+ if method == "largest_box":
+ x = np.array([x_min, x_max, x_max, x_min]) - 0.5
+ y = np.array([y_min, y_min, y_max, y_max]) - 0.5
+ elif method == "ellipse":
+ w = (x_max - x_min) / 2
+ h = (y_max - y_min) / 2
+ data = np.arange(0, 360, dtype=np.float32)
+ x = w * np.sin(np.radians(data)) + (w + x_min - 0.5)
+ y = h * np.cos(np.radians(data)) + (h + y_min - 0.5)
+ else:
+ raise ValueError(f"Method {method} is not a valid rotation method.")
+ angle = np.deg2rad(angle)
+ x_t = (np.cos(angle) * x * scale + np.sin(angle) * y) / scale
+ y_t = -np.sin(angle) * x * scale + np.cos(angle) * y
+ x_t = x_t + 0.5
+ y_t = y_t + 0.5
+
+ x_min, x_max = min(x_t), max(x_t)
+ y_min, y_max = min(y_t), max(y_t)
+
+ return x_min, y_min, x_max, y_max
+
+
+@angle_2pi_range
+def keypoint_rotate(keypoint, angle, rows, cols, **params):
+ """Rotate a keypoint by angle.
+
+ Args:
+ keypoint (tuple): A keypoint `(x, y, angle, scale)`.
+ angle (float): Rotation angle.
+ rows (int): Image height.
+ cols (int): Image width.
+
+ Returns:
+ tuple: A keypoint `(x, y, angle, scale)`.
+
+ """
+ center = (cols - 1) * 0.5, (rows - 1) * 0.5
+ matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
+ x, y, a, s = keypoint[:4]
+ x, y = cv2.transform(np.array([[[x, y]]]), matrix).squeeze()
+ return x, y, a + math.radians(angle), s
+
+
+@preserve_channel_dim
+def shift_scale_rotate(
+ img, angle, scale, dx, dy, interpolation=cv2.INTER_LINEAR, border_mode=cv2.BORDER_REFLECT_101, value=None
+):
+ height, width = img.shape[:2]
+ # for images we use additional shifts of (0.5, 0.5) as otherwise
+ # we get an ugly black border for 90deg rotations
+ center = (width / 2 - 0.5, height / 2 - 0.5)
+ matrix = cv2.getRotationMatrix2D(center, angle, scale)
+ matrix[0, 2] += dx * width
+ matrix[1, 2] += dy * height
+
+ warp_affine_fn = _maybe_process_in_chunks(
+ cv2.warpAffine, M=matrix, dsize=(width, height), flags=interpolation, borderMode=border_mode, borderValue=value
+ )
+ return warp_affine_fn(img)
+
+
+@angle_2pi_range
+def keypoint_shift_scale_rotate(keypoint, angle, scale, dx, dy, rows, cols, **params):
+ (
+ x,
+ y,
+ a,
+ s,
+ ) = keypoint[:4]
+ height, width = rows, cols
+ center = (cols - 1) * 0.5, (rows - 1) * 0.5
+ matrix = cv2.getRotationMatrix2D(center, angle, scale)
+ matrix[0, 2] += dx * width
+ matrix[1, 2] += dy * height
+
+ x, y = cv2.transform(np.array([[[x, y]]]), matrix).squeeze()
+ angle = a + math.radians(angle)
+ scale = s * scale
+
+ return x, y, angle, scale
+
+
+def bbox_shift_scale_rotate(bbox, angle, scale, dx, dy, rotate_method, rows, cols, **kwargs): # skipcq: PYL-W0613
+ """Rotates, shifts and scales a bounding box. Rotation is made by angle degrees,
+ scaling is made by scale factor and shifting is made by dx and dy.
+
+
+ Args:
+ bbox (tuple): A bounding box `(x_min, y_min, x_max, y_max)`.
+ angle (int): Angle of rotation in degrees.
+ scale (int): Scale factor.
+        dx (float): Shift along x-axis as a fraction of the image width.
+        dy (float): Shift along y-axis as a fraction of the image height.
+ rotate_method(str): Rotation method used. Should be one of: "largest_box", "ellipse".
+ Default: "largest_box".
+ rows (int): Image rows.
+ cols (int): Image cols.
+
+ Returns:
+ A bounding box `(x_min, y_min, x_max, y_max)`.
+
+ """
+ height, width = rows, cols
+ center = (width / 2, height / 2)
+ if rotate_method == "ellipse":
+ x_min, y_min, x_max, y_max = bbox_rotate(bbox, angle, rotate_method, rows, cols)
+ matrix = cv2.getRotationMatrix2D(center, 0, scale)
+ else:
+ x_min, y_min, x_max, y_max = bbox[:4]
+ matrix = cv2.getRotationMatrix2D(center, angle, scale)
+ matrix[0, 2] += dx * width
+ matrix[1, 2] += dy * height
+ x = np.array([x_min, x_max, x_max, x_min])
+ y = np.array([y_min, y_min, y_max, y_max])
+ ones = np.ones(shape=(len(x)))
+ points_ones = np.vstack([x, y, ones]).transpose()
+ points_ones[:, 0] *= width
+ points_ones[:, 1] *= height
+ tr_points = matrix.dot(points_ones.T).T
+ tr_points[:, 0] /= width
+ tr_points[:, 1] /= height
+
+ x_min, x_max = min(tr_points[:, 0]), max(tr_points[:, 0])
+ y_min, y_max = min(tr_points[:, 1]), max(tr_points[:, 1])
+
+ return x_min, y_min, x_max, y_max
+
+
+@preserve_shape
+def elastic_transform(
+ img: np.ndarray,
+ alpha: float,
+ sigma: float,
+ alpha_affine: float,
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+ random_state: Optional[np.random.RandomState] = None,
+ approximate: bool = False,
+ same_dxdy: bool = False,
+):
+ """Elastic deformation of images as described in [Simard2003]_ (with modifications).
+ Based on https://gist.github.com/ernestum/601cdf56d2b424757de5
+
+ .. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
+ Convolutional Neural Networks applied to Visual Document Analysis", in
+ Proc. of the International Conference on Document Analysis and
+ Recognition, 2003.
+ """
+ height, width = img.shape[:2]
+
+ # Random affine
+ center_square = np.array((height, width), dtype=np.float32) // 2
+ square_size = min((height, width)) // 3
+ alpha = float(alpha)
+ sigma = float(sigma)
+ alpha_affine = float(alpha_affine)
+
+ pts1 = np.array(
+ [
+ center_square + square_size,
+ [center_square[0] + square_size, center_square[1] - square_size],
+ center_square - square_size,
+ ],
+ dtype=np.float32,
+ )
+ pts2 = pts1 + random_utils.uniform(-alpha_affine, alpha_affine, size=pts1.shape, random_state=random_state).astype(
+ np.float32
+ )
+ matrix = cv2.getAffineTransform(pts1, pts2)
+
+ warp_fn = _maybe_process_in_chunks(
+ cv2.warpAffine, M=matrix, dsize=(width, height), flags=interpolation, borderMode=border_mode, borderValue=value
+ )
+ img = warp_fn(img)
+
+ if approximate:
+        # Approximate computation: smooth the displacement map with a large enough kernel.
+        # On large images (512+) this is approximately 2X faster.
+ dx = random_utils.rand(height, width, random_state=random_state).astype(np.float32) * 2 - 1
+ cv2.GaussianBlur(dx, (17, 17), sigma, dst=dx)
+ dx *= alpha
+ if same_dxdy:
+ # Speed up even more
+ dy = dx
+ else:
+ dy = random_utils.rand(height, width, random_state=random_state).astype(np.float32) * 2 - 1
+ cv2.GaussianBlur(dy, (17, 17), sigma, dst=dy)
+ dy *= alpha
+ else:
+ dx = np.float32(
+ gaussian_filter((random_utils.rand(height, width, random_state=random_state) * 2 - 1), sigma) * alpha
+ )
+ if same_dxdy:
+ # Speed up
+ dy = dx
+ else:
+ dy = np.float32(
+ gaussian_filter((random_utils.rand(height, width, random_state=random_state) * 2 - 1), sigma) * alpha
+ )
+
+ x, y = np.meshgrid(np.arange(width), np.arange(height))
+
+ map_x = np.float32(x + dx)
+ map_y = np.float32(y + dy)
+
+ remap_fn = _maybe_process_in_chunks(
+ cv2.remap, map1=map_x, map2=map_y, interpolation=interpolation, borderMode=border_mode, borderValue=value
+ )
+ return remap_fn(img)
+
+
+@preserve_channel_dim
+def resize(img, height, width, interpolation=cv2.INTER_LINEAR):
+ img_height, img_width = img.shape[:2]
+ if height == img_height and width == img_width:
+ return img
+ resize_fn = _maybe_process_in_chunks(cv2.resize, dsize=(width, height), interpolation=interpolation)
+ return resize_fn(img)
+
+
+@preserve_channel_dim
+def scale(img: np.ndarray, scale: float, interpolation: int = cv2.INTER_LINEAR) -> np.ndarray:
+ height, width = img.shape[:2]
+ new_height, new_width = int(height * scale), int(width * scale)
+ return resize(img, new_height, new_width, interpolation)
+
+
+def keypoint_scale(keypoint: KeypointInternalType, scale_x: float, scale_y: float) -> KeypointInternalType:
+ """Scales a keypoint by scale_x and scale_y.
+
+ Args:
+ keypoint: A keypoint `(x, y, angle, scale)`.
+ scale_x: Scale coefficient x-axis.
+ scale_y: Scale coefficient y-axis.
+
+ Returns:
+ A keypoint `(x, y, angle, scale)`.
+
+ """
+ x, y, angle, scale = keypoint[:4]
+ return x * scale_x, y * scale_y, angle, scale * max(scale_x, scale_y)
+
+
+def py3round(number):
+ """Unified rounding in all python versions."""
+ if abs(round(number) - number) == 0.5:
+ return int(2.0 * round(number / 2.0))
+
+ return int(round(number))
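+
+# Worked example (comment only): exact .5 values use banker's rounding, matching
+# Python 3's built-in round().
+#
+#   py3round(2.5)  # -> 2
+#   py3round(3.5)  # -> 4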
+
+
+def _func_max_size(img, max_size, interpolation, func):
+ height, width = img.shape[:2]
+
+ scale = max_size / float(func(width, height))
+
+ if scale != 1.0:
+ new_height, new_width = tuple(py3round(dim * scale) for dim in (height, width))
+ img = resize(img, height=new_height, width=new_width, interpolation=interpolation)
+ return img
+
+
+@preserve_channel_dim
+def longest_max_size(img: np.ndarray, max_size: int, interpolation: int) -> np.ndarray:
+ return _func_max_size(img, max_size, interpolation, max)
+
+
+@preserve_channel_dim
+def smallest_max_size(img: np.ndarray, max_size: int, interpolation: int) -> np.ndarray:
+ return _func_max_size(img, max_size, interpolation, min)
+
+
+@preserve_channel_dim
+def perspective(
+ img: np.ndarray,
+ matrix: np.ndarray,
+ max_width: int,
+ max_height: int,
+ border_val: Union[int, float, List[int], List[float], np.ndarray],
+ border_mode: int,
+ keep_size: bool,
+ interpolation: int,
+):
+ h, w = img.shape[:2]
+ perspective_func = _maybe_process_in_chunks(
+ cv2.warpPerspective,
+ M=matrix,
+ dsize=(max_width, max_height),
+ borderMode=border_mode,
+ borderValue=border_val,
+ flags=interpolation,
+ )
+ warped = perspective_func(img)
+
+ if keep_size:
+ return resize(warped, h, w, interpolation=interpolation)
+
+ return warped
+
+
+def perspective_bbox(
+ bbox: BoxInternalType,
+ height: int,
+ width: int,
+ matrix: np.ndarray,
+ max_width: int,
+ max_height: int,
+ keep_size: bool,
+) -> BoxInternalType:
+ x1, y1, x2, y2 = denormalize_bbox(bbox, height, width)[:4]
+
+ points = np.array([[x1, y1], [x2, y1], [x2, y2], [x1, y2]], dtype=np.float32)
+
+ x1, y1, x2, y2 = float("inf"), float("inf"), 0, 0
+ for pt in points:
+ pt = perspective_keypoint(pt.tolist() + [0, 0], height, width, matrix, max_width, max_height, keep_size)
+ x, y = pt[:2]
+ x1 = min(x1, x)
+ x2 = max(x2, x)
+ y1 = min(y1, y)
+ y2 = max(y2, y)
+
+ return normalize_bbox((x1, y1, x2, y2), height if keep_size else max_height, width if keep_size else max_width)
+
+
+def rotation2DMatrixToEulerAngles(matrix: np.ndarray, y_up: bool = False) -> float:
+ """
+ Args:
+ matrix (np.ndarray): Rotation matrix
+        y_up (bool): whether the Y axis points up (True) or down (False)
+ """
+ if y_up:
+ return np.arctan2(matrix[1, 0], matrix[0, 0])
+ return np.arctan2(-matrix[1, 0], matrix[0, 0])
+
+
+@angle_2pi_range
+def perspective_keypoint(
+ keypoint: KeypointInternalType,
+ height: int,
+ width: int,
+ matrix: np.ndarray,
+ max_width: int,
+ max_height: int,
+ keep_size: bool,
+) -> KeypointInternalType:
+ x, y, angle, scale = keypoint
+
+ keypoint_vector = np.array([x, y], dtype=np.float32).reshape([1, 1, 2])
+
+ x, y = cv2.perspectiveTransform(keypoint_vector, matrix)[0, 0]
+ angle += rotation2DMatrixToEulerAngles(matrix[:2, :2], y_up=True)
+
+ scale_x = np.sign(matrix[0, 0]) * np.sqrt(matrix[0, 0] ** 2 + matrix[0, 1] ** 2)
+ scale_y = np.sign(matrix[1, 1]) * np.sqrt(matrix[1, 0] ** 2 + matrix[1, 1] ** 2)
+ scale *= max(scale_x, scale_y)
+
+ if keep_size:
+ scale_x = width / max_width
+ scale_y = height / max_height
+ return keypoint_scale((x, y, angle, scale), scale_x, scale_y)
+
+ return x, y, angle, scale
+
+
+def _is_identity_matrix(matrix: skimage.transform.ProjectiveTransform) -> bool:
+ return np.allclose(matrix.params, np.eye(3, dtype=np.float32))
+
+
+@preserve_channel_dim
+def warp_affine(
+ image: np.ndarray,
+ matrix: skimage.transform.ProjectiveTransform,
+ interpolation: int,
+ cval: Union[int, float, Sequence[int], Sequence[float]],
+ mode: int,
+ output_shape: Sequence[int],
+) -> np.ndarray:
+ if _is_identity_matrix(matrix):
+ return image
+
+ dsize = int(np.round(output_shape[1])), int(np.round(output_shape[0]))
+ warp_fn = _maybe_process_in_chunks(
+ cv2.warpAffine, M=matrix.params[:2], dsize=dsize, flags=interpolation, borderMode=mode, borderValue=cval
+ )
+ tmp = warp_fn(image)
+ return tmp
+
+
+@angle_2pi_range
+def keypoint_affine(
+ keypoint: KeypointInternalType,
+ matrix: skimage.transform.ProjectiveTransform,
+ scale: dict,
+) -> KeypointInternalType:
+ if _is_identity_matrix(matrix):
+ return keypoint
+
+ x, y, a, s = keypoint[:4]
+ x, y = cv2.transform(np.array([[[x, y]]]), matrix.params[:2]).squeeze()
+ a += rotation2DMatrixToEulerAngles(matrix.params[:2])
+ s *= np.max([scale["x"], scale["y"]])
+ return x, y, a, s
+
+
+def bbox_affine(
+ bbox: BoxInternalType,
+ matrix: skimage.transform.ProjectiveTransform,
+ rotate_method: str,
+ rows: int,
+ cols: int,
+ output_shape: Sequence[int],
+) -> BoxInternalType:
+ if _is_identity_matrix(matrix):
+ return bbox
+ x_min, y_min, x_max, y_max = denormalize_bbox(bbox, rows, cols)[:4]
+ if rotate_method == "largest_box":
+ points = np.array(
+ [
+ [x_min, y_min],
+ [x_max, y_min],
+ [x_max, y_max],
+ [x_min, y_max],
+ ]
+ )
+ elif rotate_method == "ellipse":
+ w = (x_max - x_min) / 2
+ h = (y_max - y_min) / 2
+ data = np.arange(0, 360, dtype=np.float32)
+ x = w * np.sin(np.radians(data)) + (w + x_min - 0.5)
+ y = h * np.cos(np.radians(data)) + (h + y_min - 0.5)
+ points = np.hstack([x.reshape(-1, 1), y.reshape(-1, 1)])
+ else:
+ raise ValueError(f"Method {rotate_method} is not a valid rotation method.")
+ points = skimage.transform.matrix_transform(points, matrix.params)
+ x_min = np.min(points[:, 0])
+ x_max = np.max(points[:, 0])
+ y_min = np.min(points[:, 1])
+ y_max = np.max(points[:, 1])
+
+ return normalize_bbox((x_min, y_min, x_max, y_max), output_shape[0], output_shape[1])
+
+
+@preserve_channel_dim
+def safe_rotate(
+ img: np.ndarray,
+ matrix: np.ndarray,
+ interpolation: int,
+ value: FillValueType = None,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+) -> np.ndarray:
+ h, w = img.shape[:2]
+ warp_fn = _maybe_process_in_chunks(
+ cv2.warpAffine,
+ M=matrix,
+ dsize=(w, h),
+ flags=interpolation,
+ borderMode=border_mode,
+ borderValue=value,
+ )
+ return warp_fn(img)
+
+
+def bbox_safe_rotate(bbox: BoxInternalType, matrix: np.ndarray, cols: int, rows: int) -> BoxInternalType:
+ x1, y1, x2, y2 = denormalize_bbox(bbox, rows, cols)[:4]
+ points = np.array(
+ [
+ [x1, y1, 1],
+ [x2, y1, 1],
+ [x2, y2, 1],
+ [x1, y2, 1],
+ ]
+ )
+ points = points @ matrix.T
+ x1 = points[:, 0].min()
+ x2 = points[:, 0].max()
+ y1 = points[:, 1].min()
+ y2 = points[:, 1].max()
+
+ def fix_point(pt1: float, pt2: float, max_val: float) -> Tuple[float, float]:
+        # These rounding errors should be small, on the order of 1-2 pixels.
+ if pt1 < 0:
+ return 0, pt2 + pt1
+ if pt2 > max_val:
+ return pt1 - (pt2 - max_val), max_val
+ return pt1, pt2
+
+ x1, x2 = fix_point(x1, x2, cols)
+ y1, y2 = fix_point(y1, y2, rows)
+
+ return normalize_bbox((x1, y1, x2, y2), rows, cols)
+
+
+def keypoint_safe_rotate(
+ keypoint: KeypointInternalType,
+ matrix: np.ndarray,
+ angle: float,
+ scale_x: float,
+ scale_y: float,
+ cols: int,
+ rows: int,
+) -> KeypointInternalType:
+ x, y, a, s = keypoint[:4]
+ point = np.array([[x, y, 1]])
+ x, y = (point @ matrix.T)[0]
+
+ # To avoid problems with float errors
+ x = np.clip(x, 0, cols - 1)
+ y = np.clip(y, 0, rows - 1)
+
+ a += angle
+ s *= max(scale_x, scale_y)
+ return x, y, a, s
+
+
+@clipped
+def piecewise_affine(
+ img: np.ndarray,
+ matrix: Optional[skimage.transform.PiecewiseAffineTransform],
+ interpolation: int,
+ mode: str,
+ cval: float,
+) -> np.ndarray:
+ if matrix is None:
+ return img
+ return skimage.transform.warp(
+ img, matrix, order=interpolation, mode=mode, cval=cval, preserve_range=True, output_shape=img.shape
+ )
+
+
+def to_distance_maps(
+ keypoints: Sequence[Tuple[float, float]], height: int, width: int, inverted: bool = False
+) -> np.ndarray:
+ """Generate a ``(H,W,N)`` array of distance maps for ``N`` keypoints.
+
+ The ``n``-th distance map contains at every location ``(y, x)`` the
+ euclidean distance to the ``n``-th keypoint.
+
+ This function can be used as a helper when augmenting keypoints with a
+ method that only supports the augmentation of images.
+
+ Args:
+        keypoints: keypoint coordinates
+ height: image height
+ width: image width
+ inverted (bool): If ``True``, inverted distance maps are returned where each
+ distance value d is replaced by ``d/(d+1)``, i.e. the distance
+ maps have values in the range ``(0.0, 1.0]`` with ``1.0`` denoting
+ exactly the position of the respective keypoint.
+
+ Returns:
+ (H, W, N) ndarray
+ A ``float32`` array containing ``N`` distance maps for ``N``
+ keypoints. Each location ``(y, x, n)`` in the array denotes the
+ euclidean distance at ``(y, x)`` to the ``n``-th keypoint.
+        If `inverted` is ``True``, the distance ``d`` is replaced
+        by ``d/(d+1)``. The height and width of the array match the
+        ``height`` and ``width`` arguments passed to this function.
+ """
+ distance_maps = np.zeros((height, width, len(keypoints)), dtype=np.float32)
+
+ yy = np.arange(0, height)
+ xx = np.arange(0, width)
+ grid_xx, grid_yy = np.meshgrid(xx, yy)
+
+ for i, (x, y) in enumerate(keypoints):
+ distance_maps[:, :, i] = (grid_xx - x) ** 2 + (grid_yy - y) ** 2
+
+ distance_maps = np.sqrt(distance_maps)
+ if inverted:
+ return 1 / (distance_maps + 1)
+ return distance_maps
+
+
+def from_distance_maps(
+ distance_maps: np.ndarray,
+ inverted: bool,
+ if_not_found_coords: Optional[Union[Sequence[int], dict]],
+ threshold: Optional[float] = None,
+) -> List[Tuple[float, float]]:
+    """Convert outputs of ``to_distance_maps`` back to a list of keypoint coordinates.
+    This is the inverse of `to_distance_maps`.
+
+    Args:
+        distance_maps (np.ndarray): Distance maps of shape ``(H, W, N)``, where ``N`` is the number of keypoints.
+        inverted (bool): Whether the given distance maps were generated in inverted mode
+            (i.e. :func:`to_distance_maps` was called with ``inverted=True``) or in non-inverted mode.
+        if_not_found_coords (tuple, list, dict or None, optional):
+            Coordinates to use for keypoints that cannot be found in `distance_maps`.
+
+            * If this is a ``list``/``tuple``, it must contain two ``int`` values.
+            * If it is a ``dict``, it must contain the keys ``x`` and ``y`` with each containing one ``int`` value.
+            * If this is ``None``, then the keypoint will not be added.
+        threshold (float): The search for keypoints works by searching for the
+            argmin (non-inverted) or argmax (inverted) in each channel. This
+            parameter contains the maximum (non-inverted) or minimum (inverted) value to accept in order to view
+            a hit as a keypoint. Use ``None`` to apply no threshold.
+    """
+ if distance_maps.ndim != 3:
+ raise ValueError(
+ f"Expected three-dimensional input, "
+ f"got {distance_maps.ndim} dimensions and shape {distance_maps.shape}."
+ )
+ height, width, nb_keypoints = distance_maps.shape
+
+ drop_if_not_found = False
+ if if_not_found_coords is None:
+ drop_if_not_found = True
+ if_not_found_x = -1
+ if_not_found_y = -1
+ elif isinstance(if_not_found_coords, (tuple, list)):
+ if len(if_not_found_coords) != 2:
+ raise ValueError(
+ f"Expected tuple/list 'if_not_found_coords' to contain exactly two entries, "
+ f"got {len(if_not_found_coords)}."
+ )
+ if_not_found_x = if_not_found_coords[0]
+ if_not_found_y = if_not_found_coords[1]
+ elif isinstance(if_not_found_coords, dict):
+ if_not_found_x = if_not_found_coords["x"]
+ if_not_found_y = if_not_found_coords["y"]
+ else:
+ raise ValueError(
+ f"Expected if_not_found_coords to be None or tuple or list or dict, got {type(if_not_found_coords)}."
+ )
+
+ keypoints = []
+ for i in range(nb_keypoints):
+ if inverted:
+ hitidx_flat = np.argmax(distance_maps[..., i])
+ else:
+ hitidx_flat = np.argmin(distance_maps[..., i])
+ hitidx_ndim = np.unravel_index(hitidx_flat, (height, width))
+ if not inverted and threshold is not None:
+ found = distance_maps[hitidx_ndim[0], hitidx_ndim[1], i] < threshold
+ elif inverted and threshold is not None:
+ found = distance_maps[hitidx_ndim[0], hitidx_ndim[1], i] >= threshold
+ else:
+ found = True
+ if found:
+ keypoints.append((float(hitidx_ndim[1]), float(hitidx_ndim[0])))
+ else:
+ if not drop_if_not_found:
+ keypoints.append((if_not_found_x, if_not_found_y))
+
+ return keypoints
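+
+
+# A minimal usage sketch (illustrative only): ``to_distance_maps`` and
+# ``from_distance_maps`` form an approximate round trip, which is how keypoints
+# and bboxes are pushed through image-only warps such as the piecewise-affine
+# helpers below.
+#
+#     maps = to_distance_maps([(10.0, 20.0), (3.0, 7.0)], height=32, width=32, inverted=True)
+#     from_distance_maps(maps, inverted=True, if_not_found_coords=None)
+#     # -> [(10.0, 20.0), (3.0, 7.0)]  (integer keypoints are recovered exactly)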
+
+
+def keypoint_piecewise_affine(
+ keypoint: KeypointInternalType,
+ matrix: Optional[skimage.transform.PiecewiseAffineTransform],
+ h: int,
+ w: int,
+ keypoints_threshold: float,
+) -> KeypointInternalType:
+ if matrix is None:
+ return keypoint
+ x, y, a, s = keypoint[:4]
+ dist_maps = to_distance_maps([(x, y)], h, w, True)
+ dist_maps = piecewise_affine(dist_maps, matrix, 0, "constant", 0)
+ x, y = from_distance_maps(dist_maps, True, {"x": -1, "y": -1}, keypoints_threshold)[0]
+ return x, y, a, s
+
+
+def bbox_piecewise_affine(
+ bbox: BoxInternalType,
+ matrix: Optional[skimage.transform.PiecewiseAffineTransform],
+ h: int,
+ w: int,
+ keypoints_threshold: float,
+) -> BoxInternalType:
+ if matrix is None:
+ return bbox
+ x1, y1, x2, y2 = denormalize_bbox(bbox, h, w)[:4]
+ keypoints = [
+ (x1, y1),
+ (x2, y1),
+ (x2, y2),
+ (x1, y2),
+ ]
+ dist_maps = to_distance_maps(keypoints, h, w, True)
+ dist_maps = piecewise_affine(dist_maps, matrix, 0, "constant", 0)
+ keypoints = from_distance_maps(dist_maps, True, {"x": -1, "y": -1}, keypoints_threshold)
+ keypoints = [i for i in keypoints if 0 <= i[0] < w and 0 <= i[1] < h]
+ keypoints_arr = np.array(keypoints)
+ x1 = keypoints_arr[:, 0].min()
+ y1 = keypoints_arr[:, 1].min()
+ x2 = keypoints_arr[:, 0].max()
+ y2 = keypoints_arr[:, 1].max()
+ return normalize_bbox((x1, y1, x2, y2), h, w)
+
+
+def vflip(img: np.ndarray) -> np.ndarray:
+ return np.ascontiguousarray(img[::-1, ...])
+
+
+def hflip(img: np.ndarray) -> np.ndarray:
+ return np.ascontiguousarray(img[:, ::-1, ...])
+
+
+def hflip_cv2(img: np.ndarray) -> np.ndarray:
+ return cv2.flip(img, 1)
+
+
+@preserve_shape
+def random_flip(img: np.ndarray, code: int) -> np.ndarray:
+ return cv2.flip(img, code)
+
+
+def transpose(img: np.ndarray) -> np.ndarray:
+ return img.transpose(1, 0, 2) if len(img.shape) > 2 else img.transpose(1, 0)
+
+
+def rot90(img: np.ndarray, factor: int) -> np.ndarray:
+ img = np.rot90(img, factor)
+ return np.ascontiguousarray(img)
+
+
+def bbox_vflip(bbox: BoxInternalType, rows: int, cols: int) -> BoxInternalType: # skipcq: PYL-W0613
+ """Flip a bounding box vertically around the x-axis.
+
+ Args:
+ bbox: A bounding box `(x_min, y_min, x_max, y_max)`.
+ rows: Image rows.
+ cols: Image cols.
+
+ Returns:
+ tuple: A bounding box `(x_min, y_min, x_max, y_max)`.
+
+ """
+ x_min, y_min, x_max, y_max = bbox[:4]
+ return x_min, 1 - y_max, x_max, 1 - y_min
+
+
+def bbox_hflip(bbox: BoxInternalType, rows: int, cols: int) -> BoxInternalType: # skipcq: PYL-W0613
+ """Flip a bounding box horizontally around the y-axis.
+
+ Args:
+ bbox: A bounding box `(x_min, y_min, x_max, y_max)`.
+ rows: Image rows.
+ cols: Image cols.
+
+ Returns:
+ A bounding box `(x_min, y_min, x_max, y_max)`.
+
+ """
+ x_min, y_min, x_max, y_max = bbox[:4]
+ return 1 - x_max, y_min, 1 - x_min, y_max
+
+
+def bbox_flip(bbox: BoxInternalType, d: int, rows: int, cols: int) -> BoxInternalType:
+ """Flip a bounding box either vertically, horizontally or both depending on the value of `d`.
+
+ Args:
+ bbox: A bounding box `(x_min, y_min, x_max, y_max)`.
+        d: flip code. 0 for a vertical flip, 1 for a horizontal flip, -1 for both vertical and horizontal flips.
+ rows: Image rows.
+ cols: Image cols.
+
+ Returns:
+ A bounding box `(x_min, y_min, x_max, y_max)`.
+
+ Raises:
+ ValueError: if value of `d` is not -1, 0 or 1.
+
+ """
+ if d == 0:
+ bbox = bbox_vflip(bbox, rows, cols)
+ elif d == 1:
+ bbox = bbox_hflip(bbox, rows, cols)
+ elif d == -1:
+ bbox = bbox_hflip(bbox, rows, cols)
+ bbox = bbox_vflip(bbox, rows, cols)
+ else:
+        raise ValueError(f"Invalid d value {d}. Valid values are -1, 0 and 1")
+ return bbox
+
+
+def bbox_transpose(
+    bbox: BoxInternalType, axis: int, rows: int, cols: int
+) -> BoxInternalType:  # skipcq: PYL-W0613
+ """Transposes a bounding box along given axis.
+
+ Args:
+ bbox: A bounding box `(x_min, y_min, x_max, y_max)`.
+ axis: 0 - main axis, 1 - secondary axis.
+ rows: Image rows.
+ cols: Image cols.
+
+ Returns:
+ A bounding box tuple `(x_min, y_min, x_max, y_max)`.
+
+ Raises:
+ ValueError: If axis not equal to 0 or 1.
+
+ """
+ x_min, y_min, x_max, y_max = bbox[:4]
+ if axis not in {0, 1}:
+ raise ValueError("Axis must be either 0 or 1.")
+ if axis == 0:
+ bbox = (y_min, x_min, y_max, x_max)
+ if axis == 1:
+ bbox = (1 - y_max, 1 - x_max, 1 - y_min, 1 - x_min)
+ return bbox
+
+
+@angle_2pi_range
+def keypoint_vflip(keypoint: KeypointInternalType, rows: int, cols: int) -> KeypointInternalType:
+ """Flip a keypoint vertically around the x-axis.
+
+ Args:
+ keypoint: A keypoint `(x, y, angle, scale)`.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ tuple: A keypoint `(x, y, angle, scale)`.
+
+ """
+ x, y, angle, scale = keypoint[:4]
+ angle = -angle
+ return x, (rows - 1) - y, angle, scale
+
+
+@angle_2pi_range
+def keypoint_hflip(keypoint: KeypointInternalType, rows: int, cols: int) -> KeypointInternalType:
+ """Flip a keypoint horizontally around the y-axis.
+
+ Args:
+ keypoint: A keypoint `(x, y, angle, scale)`.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ A keypoint `(x, y, angle, scale)`.
+
+ """
+ x, y, angle, scale = keypoint[:4]
+ angle = math.pi - angle
+ return (cols - 1) - x, y, angle, scale
+
+
+def keypoint_flip(keypoint: KeypointInternalType, d: int, rows: int, cols: int) -> KeypointInternalType:
+ """Flip a keypoint either vertically, horizontally or both depending on the value of `d`.
+
+ Args:
+ keypoint: A keypoint `(x, y, angle, scale)`.
+        d: Flip code. Must be -1, 0 or 1:
+ * 0 - vertical flip,
+ * 1 - horizontal flip,
+ * -1 - vertical and horizontal flip.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ A keypoint `(x, y, angle, scale)`.
+
+ Raises:
+ ValueError: if value of `d` is not -1, 0 or 1.
+
+ """
+ if d == 0:
+ keypoint = keypoint_vflip(keypoint, rows, cols)
+ elif d == 1:
+ keypoint = keypoint_hflip(keypoint, rows, cols)
+ elif d == -1:
+ keypoint = keypoint_hflip(keypoint, rows, cols)
+ keypoint = keypoint_vflip(keypoint, rows, cols)
+ else:
+ raise ValueError(f"Invalid d value {d}. Valid values are -1, 0 and 1")
+ return keypoint
+
+
+def keypoint_transpose(keypoint: KeypointInternalType) -> KeypointInternalType:
+    """Transpose a keypoint: swap its x and y coordinates and adjust the angle accordingly.
+
+ Args:
+ keypoint: A keypoint `(x, y, angle, scale)`.
+
+ Returns:
+ A keypoint `(x, y, angle, scale)`.
+
+ """
+ x, y, angle, scale = keypoint[:4]
+
+ if angle <= np.pi:
+ angle = np.pi - angle
+ else:
+ angle = 3 * np.pi - angle
+
+ return y, x, angle, scale
+
+
+@preserve_channel_dim
+def pad(
+ img: np.ndarray,
+ min_height: int,
+ min_width: int,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+) -> np.ndarray:
+ height, width = img.shape[:2]
+
+ if height < min_height:
+ h_pad_top = int((min_height - height) / 2.0)
+ h_pad_bottom = min_height - height - h_pad_top
+ else:
+ h_pad_top = 0
+ h_pad_bottom = 0
+
+ if width < min_width:
+ w_pad_left = int((min_width - width) / 2.0)
+ w_pad_right = min_width - width - w_pad_left
+ else:
+ w_pad_left = 0
+ w_pad_right = 0
+
+ img = pad_with_params(img, h_pad_top, h_pad_bottom, w_pad_left, w_pad_right, border_mode, value)
+
+ if img.shape[:2] != (max(min_height, height), max(min_width, width)):
+ raise RuntimeError(
+ "Invalid result shape. Got: {}. Expected: {}".format(
+ img.shape[:2], (max(min_height, height), max(min_width, width))
+ )
+ )
+
+ return img
+
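+
+# A minimal sketch of how ``pad`` splits the padding (illustrative only):
+#
+#     img = np.zeros((90, 100, 3), dtype=np.uint8)
+#     padded = pad(img, min_height=128, min_width=128, border_mode=cv2.BORDER_CONSTANT, value=0)
+#     # 38 missing rows -> 19 top + 19 bottom; 28 missing columns -> 14 left + 14 right
+#     padded.shape  # (128, 128, 3)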
+
+@preserve_channel_dim
+def pad_with_params(
+ img: np.ndarray,
+ h_pad_top: int,
+ h_pad_bottom: int,
+ w_pad_left: int,
+ w_pad_right: int,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+) -> np.ndarray:
+ pad_fn = _maybe_process_in_chunks(
+ cv2.copyMakeBorder,
+ top=h_pad_top,
+ bottom=h_pad_bottom,
+ left=w_pad_left,
+ right=w_pad_right,
+ borderType=border_mode,
+ value=value,
+ )
+ return pad_fn(img)
+
+
+@preserve_shape
+def optical_distortion(
+ img: np.ndarray,
+ k: int = 0,
+ dx: int = 0,
+ dy: int = 0,
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+) -> np.ndarray:
+    """Barrel / pincushion (lens) distortion; an unconventional augmentation.
+
+ Reference:
+ | https://stackoverflow.com/questions/6199636/formulas-for-barrel-pincushion-distortion
+ | https://stackoverflow.com/questions/10364201/image-transformation-in-opencv
+ | https://stackoverflow.com/questions/2477774/correcting-fisheye-distortion-programmatically
+ | http://www.coldvision.io/2017/03/02/advanced-lane-finding-using-opencv/
+ """
+ height, width = img.shape[:2]
+
+ fx = width
+ fy = height
+
+ cx = width * 0.5 + dx
+ cy = height * 0.5 + dy
+
+ camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
+
+ distortion = np.array([k, k, 0, 0, 0], dtype=np.float32)
+ map1, map2 = cv2.initUndistortRectifyMap(
+ camera_matrix, distortion, None, None, (width, height), cv2.CV_32FC1 # type: ignore[attr-defined]
+ )
+ return cv2.remap(img, map1, map2, interpolation=interpolation, borderMode=border_mode, borderValue=value)
+
+
+@preserve_shape
+def grid_distortion(
+ img: np.ndarray,
+ num_steps: int = 10,
+ xsteps: Tuple = (),
+ ysteps: Tuple = (),
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+) -> np.ndarray:
+ """Perform a grid distortion of an input image.
+
+ Reference:
+ http://pythology.blogspot.sg/2014/03/interpolation-on-regular-distorted-grid.html
+ """
+ height, width = img.shape[:2]
+
+ x_step = width // num_steps
+ xx = np.zeros(width, np.float32)
+ prev = 0
+ for idx in range(num_steps + 1):
+ x = idx * x_step
+ start = int(x)
+ end = int(x) + x_step
+ if end > width:
+ end = width
+ cur = width
+ else:
+ cur = prev + x_step * xsteps[idx]
+
+ xx[start:end] = np.linspace(prev, cur, end - start)
+ prev = cur
+
+ y_step = height // num_steps
+ yy = np.zeros(height, np.float32)
+ prev = 0
+ for idx in range(num_steps + 1):
+ y = idx * y_step
+ start = int(y)
+ end = int(y) + y_step
+ if end > height:
+ end = height
+ cur = height
+ else:
+ cur = prev + y_step * ysteps[idx]
+
+ yy[start:end] = np.linspace(prev, cur, end - start)
+ prev = cur
+
+ map_x, map_y = np.meshgrid(xx, yy)
+ map_x = map_x.astype(np.float32)
+ map_y = map_y.astype(np.float32)
+
+ remap_fn = _maybe_process_in_chunks(
+ cv2.remap,
+ map1=map_x,
+ map2=map_y,
+ interpolation=interpolation,
+ borderMode=border_mode,
+ borderValue=value,
+ )
+ return remap_fn(img)
+
+
+@preserve_shape
+def elastic_transform_approx(
+ img: np.ndarray,
+ alpha: float,
+ sigma: float,
+ alpha_affine: float,
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+ random_state: Optional[np.random.RandomState] = None,
+) -> np.ndarray:
+ """Elastic deformation of images as described in [Simard2003]_ (with modifications for speed).
+ Based on https://gist.github.com/ernestum/601cdf56d2b424757de5
+
+ .. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
+ Convolutional Neural Networks applied to Visual Document Analysis", in
+ Proc. of the International Conference on Document Analysis and
+ Recognition, 2003.
+ """
+ height, width = img.shape[:2]
+
+ # Random affine
+ center_square = np.array((height, width), dtype=np.float32) // 2
+ square_size = min((height, width)) // 3
+ alpha = float(alpha)
+ sigma = float(sigma)
+ alpha_affine = float(alpha_affine)
+
+ pts1 = np.array(
+ [
+ center_square + square_size,
+ [center_square[0] + square_size, center_square[1] - square_size],
+ center_square - square_size,
+ ],
+ dtype=np.float32,
+ )
+ pts2 = pts1 + random_utils.uniform(-alpha_affine, alpha_affine, size=pts1.shape, random_state=random_state).astype(
+ np.float32
+ )
+ matrix = cv2.getAffineTransform(pts1, pts2)
+
+ warp_fn = _maybe_process_in_chunks(
+ cv2.warpAffine,
+ M=matrix,
+ dsize=(width, height),
+ flags=interpolation,
+ borderMode=border_mode,
+ borderValue=value,
+ )
+ img = warp_fn(img)
+
+ dx = random_utils.rand(height, width, random_state=random_state).astype(np.float32) * 2 - 1
+ cv2.GaussianBlur(dx, (17, 17), sigma, dst=dx)
+ dx *= alpha
+
+ dy = random_utils.rand(height, width, random_state=random_state).astype(np.float32) * 2 - 1
+ cv2.GaussianBlur(dy, (17, 17), sigma, dst=dy)
+ dy *= alpha
+
+ x, y = np.meshgrid(np.arange(width), np.arange(height))
+
+ map_x = np.float32(x + dx)
+ map_y = np.float32(y + dy)
+
+ remap_fn = _maybe_process_in_chunks(
+ cv2.remap,
+ map1=map_x,
+ map2=map_y,
+ interpolation=interpolation,
+ borderMode=border_mode,
+ borderValue=value,
+ )
+ return remap_fn(img)
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/resize.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..81fc4bbe8ca0748ef07ebf79c807d6bf0ca497f2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/resize.py
@@ -0,0 +1,198 @@
+import random
+from typing import Dict, Sequence, Tuple, Union
+
+import cv2
+import numpy as np
+
+from ...core.transforms_interface import (
+ BoxInternalType,
+ DualTransform,
+ KeypointInternalType,
+ to_tuple,
+)
+from . import functional as F
+
+__all__ = ["RandomScale", "LongestMaxSize", "SmallestMaxSize", "Resize"]
+
+
+class RandomScale(DualTransform):
+ """Randomly resize the input. Output image size is different from the input image size.
+
+ Args:
+ scale_limit ((float, float) or float): scaling factor range. If scale_limit is a single float value, the
+ range will be (-scale_limit, scale_limit). Note that the scale_limit will be biased by 1.
+ If scale_limit is a tuple, like (low, high), sampling will be done from the range (1 + low, 1 + high).
+ Default: (-0.1, 0.1).
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, scale_limit=0.1, interpolation=cv2.INTER_LINEAR, always_apply=False, p=0.5):
+ super(RandomScale, self).__init__(always_apply, p)
+ self.scale_limit = to_tuple(scale_limit, bias=1.0)
+ self.interpolation = interpolation
+
+ def get_params(self):
+ return {"scale": random.uniform(self.scale_limit[0], self.scale_limit[1])}
+
+ def apply(self, img, scale=0, interpolation=cv2.INTER_LINEAR, **params):
+ return F.scale(img, scale, interpolation)
+
+ def apply_to_bbox(self, bbox, **params):
+ # Bounding box coordinates are scale invariant
+ return bbox
+
+ def apply_to_keypoint(self, keypoint, scale=0, **params):
+ return F.keypoint_scale(keypoint, scale, scale)
+
+ def get_transform_init_args(self):
+ return {"interpolation": self.interpolation, "scale_limit": to_tuple(self.scale_limit, bias=-1.0)}
+
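+# A minimal sketch of the ``scale_limit`` semantics (illustrative only):
+#
+#     aug = RandomScale(scale_limit=0.1, p=1.0)
+#     aug.scale_limit            # (0.9, 1.1) -- to_tuple biases the single-value range by 1.0
+#     aug.get_params()["scale"]  # a factor sampled uniformly from [0.9, 1.1]
+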
+
+class LongestMaxSize(DualTransform):
+ """Rescale an image so that maximum side is equal to max_size, keeping the aspect ratio of the initial image.
+
+ Args:
+ max_size (int, list of int): maximum size of the image after the transformation. When using a list, max size
+ will be randomly selected from the values in the list.
+ interpolation (OpenCV flag): interpolation method. Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ max_size: Union[int, Sequence[int]] = 1024,
+ interpolation: int = cv2.INTER_LINEAR,
+ always_apply: bool = False,
+ p: float = 1,
+ ):
+ super(LongestMaxSize, self).__init__(always_apply, p)
+ self.interpolation = interpolation
+ self.max_size = max_size
+
+ def apply(
+ self, img: np.ndarray, max_size: int = 1024, interpolation: int = cv2.INTER_LINEAR, **params
+ ) -> np.ndarray:
+ return F.longest_max_size(img, max_size=max_size, interpolation=interpolation)
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ # Bounding box coordinates are scale invariant
+ return bbox
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, max_size: int = 1024, **params) -> KeypointInternalType:
+ height = params["rows"]
+ width = params["cols"]
+
+ scale = max_size / max([height, width])
+ return F.keypoint_scale(keypoint, scale, scale)
+
+ def get_params(self) -> Dict[str, int]:
+ return {"max_size": self.max_size if isinstance(self.max_size, int) else random.choice(self.max_size)}
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return ("max_size", "interpolation")
+
+
+class SmallestMaxSize(DualTransform):
+ """Rescale an image so that minimum side is equal to max_size, keeping the aspect ratio of the initial image.
+
+ Args:
+ max_size (int, list of int): maximum size of smallest side of the image after the transformation. When using a
+ list, max size will be randomly selected from the values in the list.
+ interpolation (OpenCV flag): interpolation method. Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ max_size: Union[int, Sequence[int]] = 1024,
+ interpolation: int = cv2.INTER_LINEAR,
+ always_apply: bool = False,
+ p: float = 1,
+ ):
+ super(SmallestMaxSize, self).__init__(always_apply, p)
+ self.interpolation = interpolation
+ self.max_size = max_size
+
+ def apply(
+ self, img: np.ndarray, max_size: int = 1024, interpolation: int = cv2.INTER_LINEAR, **params
+ ) -> np.ndarray:
+ return F.smallest_max_size(img, max_size=max_size, interpolation=interpolation)
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ return bbox
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, max_size: int = 1024, **params) -> KeypointInternalType:
+ height = params["rows"]
+ width = params["cols"]
+
+ scale = max_size / min([height, width])
+ return F.keypoint_scale(keypoint, scale, scale)
+
+ def get_params(self) -> Dict[str, int]:
+ return {"max_size": self.max_size if isinstance(self.max_size, int) else random.choice(self.max_size)}
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ return ("max_size", "interpolation")
+
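+# A quick comparison of the two max-size transforms (illustrative only): for a
+# 480x640 (HxW) image and max_size=320,
+#
+#     LongestMaxSize(max_size=320, p=1.0)   # scale = 320 / 640 = 0.5   -> about 240x320
+#     SmallestMaxSize(max_size=320, p=1.0)  # scale = 320 / 480 ~= 0.67 -> about 320x427
+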
+
+class Resize(DualTransform):
+ """Resize the input to the given height and width.
+
+ Args:
+ height (int): desired height of the output.
+ width (int): desired width of the output.
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, height, width, interpolation=cv2.INTER_LINEAR, always_apply=False, p=1):
+ super(Resize, self).__init__(always_apply, p)
+ self.height = height
+ self.width = width
+ self.interpolation = interpolation
+
+ def apply(self, img, interpolation=cv2.INTER_LINEAR, **params):
+ return F.resize(img, height=self.height, width=self.width, interpolation=interpolation)
+
+ def apply_to_bbox(self, bbox, **params):
+ # Bounding box coordinates are scale invariant
+ return bbox
+
+ def apply_to_keypoint(self, keypoint, **params):
+ height = params["rows"]
+ width = params["cols"]
+ scale_x = self.width / width
+ scale_y = self.height / height
+ return F.keypoint_scale(keypoint, scale_x, scale_y)
+
+ def get_transform_init_args_names(self):
+ return ("height", "width", "interpolation")
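+
+
+# A minimal usage sketch (illustrative only):
+#
+#     resize = Resize(height=256, width=512, p=1.0)
+#     out = resize(image=np.zeros((128, 128, 3), dtype=np.uint8))["image"]
+#     out.shape  # (256, 512, 3)
+#
+# For keypoints, ``apply_to_keypoint`` scales x by 512/128 = 4 and y by 256/128 = 2,
+# while normalized bbox coordinates pass through unchanged.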
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/rotate.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/rotate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cdae691d6c85c81143c78d911112dc30a6334e6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/rotate.py
@@ -0,0 +1,294 @@
+import math
+import random
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+import cv2
+import numpy as np
+
+from ...core.transforms_interface import (
+ BoxInternalType,
+ DualTransform,
+ FillValueType,
+ KeypointInternalType,
+ to_tuple,
+)
+from ..crops import functional as FCrops
+from . import functional as F
+
+__all__ = ["Rotate", "RandomRotate90", "SafeRotate"]
+
+
+class RandomRotate90(DualTransform):
+ """Randomly rotate the input by 90 degrees zero or more times.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def apply(self, img, factor=0, **params):
+ """
+ Args:
+ factor (int): number of times the input will be rotated by 90 degrees.
+ """
+ return np.ascontiguousarray(np.rot90(img, factor))
+
+ def get_params(self):
+ # Random int in the range [0, 3]
+ return {"factor": random.randint(0, 3)}
+
+ def apply_to_bbox(self, bbox, factor=0, **params):
+ return F.bbox_rot90(bbox, factor, **params)
+
+ def apply_to_keypoint(self, keypoint, factor=0, **params):
+ return F.keypoint_rot90(keypoint, factor, **params)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class Rotate(DualTransform):
+ """Rotate the input by an angle selected randomly from the uniform distribution.
+
+ Args:
+ limit ((int, int) or int): range from which a random angle is picked. If limit is a single int
+ an angle is picked from (-limit, limit). Default: (-90, 90)
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ border_mode (OpenCV flag): flag that is used to specify the pixel extrapolation method. Should be one of:
+ cv2.BORDER_CONSTANT, cv2.BORDER_REPLICATE, cv2.BORDER_REFLECT, cv2.BORDER_WRAP, cv2.BORDER_REFLECT_101.
+ Default: cv2.BORDER_REFLECT_101
+ value (int, float, list of ints, list of float): padding value if border_mode is cv2.BORDER_CONSTANT.
+ mask_value (int, float,
+ list of ints,
+ list of float): padding value if border_mode is cv2.BORDER_CONSTANT applied for masks.
+ rotate_method (str): rotation method used for the bounding boxes. Should be one of "largest_box" or "ellipse".
+ Default: "largest_box"
+        crop_border (bool): If True, crop the output to the largest possible axis-aligned rectangle inside the rotated image.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ limit=90,
+ interpolation=cv2.INTER_LINEAR,
+ border_mode=cv2.BORDER_REFLECT_101,
+ value=None,
+ mask_value=None,
+ rotate_method="largest_box",
+ crop_border=False,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(Rotate, self).__init__(always_apply, p)
+ self.limit = to_tuple(limit)
+ self.interpolation = interpolation
+ self.border_mode = border_mode
+ self.value = value
+ self.mask_value = mask_value
+ self.rotate_method = rotate_method
+ self.crop_border = crop_border
+
+ if rotate_method not in ["largest_box", "ellipse"]:
+ raise ValueError(f"Rotation method {self.rotate_method} is not valid.")
+
+ def apply(
+ self, img, angle=0, interpolation=cv2.INTER_LINEAR, x_min=None, x_max=None, y_min=None, y_max=None, **params
+ ):
+ img_out = F.rotate(img, angle, interpolation, self.border_mode, self.value)
+ if self.crop_border:
+ img_out = FCrops.crop(img_out, x_min, y_min, x_max, y_max)
+ return img_out
+
+ def apply_to_mask(self, img, angle=0, x_min=None, x_max=None, y_min=None, y_max=None, **params):
+ img_out = F.rotate(img, angle, cv2.INTER_NEAREST, self.border_mode, self.mask_value)
+ if self.crop_border:
+ img_out = FCrops.crop(img_out, x_min, y_min, x_max, y_max)
+ return img_out
+
+ def apply_to_bbox(self, bbox, angle=0, x_min=None, x_max=None, y_min=None, y_max=None, cols=0, rows=0, **params):
+ bbox_out = F.bbox_rotate(bbox, angle, self.rotate_method, rows, cols)
+ if self.crop_border:
+ bbox_out = FCrops.bbox_crop(bbox_out, x_min, y_min, x_max, y_max, rows, cols)
+ return bbox_out
+
+ def apply_to_keypoint(
+ self, keypoint, angle=0, x_min=None, x_max=None, y_min=None, y_max=None, cols=0, rows=0, **params
+ ):
+ keypoint_out = F.keypoint_rotate(keypoint, angle, rows, cols, **params)
+ if self.crop_border:
+ keypoint_out = FCrops.crop_keypoint_by_coords(keypoint_out, (x_min, y_min, x_max, y_max))
+ return keypoint_out
+
+ @staticmethod
+ def _rotated_rect_with_max_area(h, w, angle):
+ """
+ Given a rectangle of size wxh that has been rotated by 'angle' (in
+ degrees), computes the width and height of the largest possible
+ axis-aligned rectangle (maximal area) within the rotated rectangle.
+
+ Code from: https://stackoverflow.com/questions/16702966/rotate-image-and-crop-out-black-borders
+ """
+
+ angle = math.radians(angle)
+ width_is_longer = w >= h
+ side_long, side_short = (w, h) if width_is_longer else (h, w)
+
+ # since the solutions for angle, -angle and 180-angle are all the same,
+ # it is sufficient to look at the first quadrant and the absolute values of sin,cos:
+ sin_a, cos_a = abs(math.sin(angle)), abs(math.cos(angle))
+ if side_short <= 2.0 * sin_a * cos_a * side_long or abs(sin_a - cos_a) < 1e-10:
+ # half constrained case: two crop corners touch the longer side,
+ # the other two corners are on the mid-line parallel to the longer line
+ x = 0.5 * side_short
+ wr, hr = (x / sin_a, x / cos_a) if width_is_longer else (x / cos_a, x / sin_a)
+ else:
+ # fully constrained case: crop touches all 4 sides
+ cos_2a = cos_a * cos_a - sin_a * sin_a
+ wr, hr = (w * cos_a - h * sin_a) / cos_2a, (h * cos_a - w * sin_a) / cos_2a
+
+ return dict(
+ x_min=max(0, int(w / 2 - wr / 2)),
+ x_max=min(w, int(w / 2 + wr / 2)),
+ y_min=max(0, int(h / 2 - hr / 2)),
+ y_max=min(h, int(h / 2 + hr / 2)),
+ )
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, Any]:
+ out_params = {"angle": random.uniform(self.limit[0], self.limit[1])}
+ if self.crop_border:
+ h, w = params["image"].shape[:2]
+ out_params.update(self._rotated_rect_with_max_area(h, w, out_params["angle"]))
+ return out_params
+
+ def get_transform_init_args_names(self):
+ return ("limit", "interpolation", "border_mode", "value", "mask_value", "rotate_method", "crop_border")
+
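+# A quick numeric check of the ``crop_border`` geometry (illustrative only): for a
+# 100x100 image rotated by 45 degrees, ``_rotated_rect_with_max_area`` gives
+# wr = hr = 100 / sqrt(2) ~= 70.7, i.e. roughly the crop window
+# {"x_min": 14, "x_max": 85, "y_min": 14, "y_max": 85} in the rotated frame.
+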
+
+class SafeRotate(DualTransform):
+ """Rotate the input inside the input's frame by an angle selected randomly from the uniform distribution.
+
+    The resulting image may contain artifacts. After rotation, the content can have a different aspect ratio, and
+    resizing it back to the original shape (with the original aspect ratio) may introduce additional resampling
+    artifacts.
+
+ Args:
+ limit ((int, int) or int): range from which a random angle is picked. If limit is a single int
+ an angle is picked from (-limit, limit). Default: (-90, 90)
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ border_mode (OpenCV flag): flag that is used to specify the pixel extrapolation method. Should be one of:
+ cv2.BORDER_CONSTANT, cv2.BORDER_REPLICATE, cv2.BORDER_REFLECT, cv2.BORDER_WRAP, cv2.BORDER_REFLECT_101.
+ Default: cv2.BORDER_REFLECT_101
+ value (int, float, list of ints, list of float): padding value if border_mode is cv2.BORDER_CONSTANT.
+ mask_value (int, float,
+ list of ints,
+ list of float): padding value if border_mode is cv2.BORDER_CONSTANT applied for masks.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ limit: Union[float, Tuple[float, float]] = 90,
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: FillValueType = None,
+ mask_value: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(SafeRotate, self).__init__(always_apply, p)
+ self.limit = to_tuple(limit)
+ self.interpolation = interpolation
+ self.border_mode = border_mode
+ self.value = value
+ self.mask_value = mask_value
+
+ def apply(self, img: np.ndarray, matrix: np.ndarray = np.array(None), **params) -> np.ndarray:
+ return F.safe_rotate(img, matrix, self.interpolation, self.value, self.border_mode)
+
+ def apply_to_mask(self, img: np.ndarray, matrix: np.ndarray = np.array(None), **params) -> np.ndarray:
+ return F.safe_rotate(img, matrix, cv2.INTER_NEAREST, self.mask_value, self.border_mode)
+
+ def apply_to_bbox(self, bbox: BoxInternalType, cols: int = 0, rows: int = 0, **params) -> BoxInternalType:
+ return F.bbox_safe_rotate(bbox, params["matrix"], cols, rows)
+
+ def apply_to_keypoint(
+ self,
+ keypoint: KeypointInternalType,
+ angle: float = 0,
+ scale_x: float = 0,
+ scale_y: float = 0,
+ cols: int = 0,
+ rows: int = 0,
+ **params
+ ) -> KeypointInternalType:
+ return F.keypoint_safe_rotate(keypoint, params["matrix"], angle, scale_x, scale_y, cols, rows)
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, Any]:
+ angle = random.uniform(self.limit[0], self.limit[1])
+
+ image = params["image"]
+ h, w = image.shape[:2]
+
+ # https://stackoverflow.com/questions/43892506/opencv-python-rotate-image-without-cropping-sides
+ image_center = (w / 2, h / 2)
+
+ # Rotation Matrix
+ rotation_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
+
+        # take the absolute cosine and sine of the rotation angle from the matrix
+ abs_cos = abs(rotation_mat[0, 0])
+ abs_sin = abs(rotation_mat[0, 1])
+
+ # find the new width and height bounds
+ new_w = math.ceil(h * abs_sin + w * abs_cos)
+ new_h = math.ceil(h * abs_cos + w * abs_sin)
+
+ scale_x = w / new_w
+ scale_y = h / new_h
+
+ # Shift the image to create padding
+ rotation_mat[0, 2] += new_w / 2 - image_center[0]
+ rotation_mat[1, 2] += new_h / 2 - image_center[1]
+
+ # Rescale to original size
+ scale_mat = np.diag(np.ones(3))
+ scale_mat[0, 0] *= scale_x
+ scale_mat[1, 1] *= scale_y
+ _tmp = np.diag(np.ones(3))
+ _tmp[:2] = rotation_mat
+ _tmp = scale_mat @ _tmp
+ rotation_mat = _tmp[:2]
+
+ return {"matrix": rotation_mat, "angle": angle, "scale_x": scale_x, "scale_y": scale_y}
+
+ def get_transform_init_args_names(self) -> Tuple[str, str, str, str, str]:
+ return ("limit", "interpolation", "border_mode", "value", "mask_value")
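+
+
+# A minimal usage sketch (illustrative only): unlike ``Rotate``, ``SafeRotate`` first
+# enlarges the rotation canvas so no corners are cut off, then rescales back to the
+# original HxW, so the full content is preserved at a smaller scale.
+#
+#     aug = SafeRotate(limit=45, border_mode=cv2.BORDER_CONSTANT, value=0, p=1.0)
+#     rotated = aug(image=np.zeros((240, 320, 3), dtype=np.uint8))["image"]
+#     rotated.shape  # (240, 320, 3) -- shape unchanged, content shrunk to fit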
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/transforms.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..440c4cfdb1e4fa777fa3b3c79218f237cfccfb2d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/geometric/transforms.py
@@ -0,0 +1,1499 @@
+import math
+import random
+from enum import Enum
+from typing import Dict, Optional, Sequence, Tuple, Union
+
+import cv2
+import numpy as np
+import skimage.transform
+
+from custom_albumentations.core.bbox_utils import denormalize_bbox, normalize_bbox
+
+from ... import random_utils
+from ...core.transforms_interface import (
+ BoxInternalType,
+ DualTransform,
+ ImageColorType,
+ KeypointInternalType,
+ ScaleFloatType,
+ to_tuple,
+)
+from ..functional import bbox_from_mask
+from . import functional as F
+
+__all__ = [
+ "ShiftScaleRotate",
+ "ElasticTransform",
+ "Perspective",
+ "Affine",
+ "PiecewiseAffine",
+ "VerticalFlip",
+ "HorizontalFlip",
+ "Flip",
+ "Transpose",
+ "OpticalDistortion",
+ "GridDistortion",
+ "PadIfNeeded",
+]
+
+
+class ShiftScaleRotate(DualTransform):
+ """Randomly apply affine transforms: translate, scale and rotate the input.
+
+ Args:
+ shift_limit ((float, float) or float): shift factor range for both height and width. If shift_limit
+ is a single float value, the range will be (-shift_limit, shift_limit). Absolute values for lower and
+ upper bounds should lie in range [0, 1]. Default: (-0.0625, 0.0625).
+ scale_limit ((float, float) or float): scaling factor range. If scale_limit is a single float value, the
+ range will be (-scale_limit, scale_limit). Note that the scale_limit will be biased by 1.
+ If scale_limit is a tuple, like (low, high), sampling will be done from the range (1 + low, 1 + high).
+ Default: (-0.1, 0.1).
+ rotate_limit ((int, int) or int): rotation range. If rotate_limit is a single int value, the
+ range will be (-rotate_limit, rotate_limit). Default: (-45, 45).
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ border_mode (OpenCV flag): flag that is used to specify the pixel extrapolation method. Should be one of:
+ cv2.BORDER_CONSTANT, cv2.BORDER_REPLICATE, cv2.BORDER_REFLECT, cv2.BORDER_WRAP, cv2.BORDER_REFLECT_101.
+ Default: cv2.BORDER_REFLECT_101
+ value (int, float, list of int, list of float): padding value if border_mode is cv2.BORDER_CONSTANT.
+ mask_value (int, float,
+ list of int,
+ list of float): padding value if border_mode is cv2.BORDER_CONSTANT applied for masks.
+ shift_limit_x ((float, float) or float): shift factor range for width. If it is set then this value
+ instead of shift_limit will be used for shifting width. If shift_limit_x is a single float value,
+ the range will be (-shift_limit_x, shift_limit_x). Absolute values for lower and upper bounds should lie in
+ the range [0, 1]. Default: None.
+ shift_limit_y ((float, float) or float): shift factor range for height. If it is set then this value
+ instead of shift_limit will be used for shifting height. If shift_limit_y is a single float value,
+ the range will be (-shift_limit_y, shift_limit_y). Absolute values for lower and upper bounds should lie
+ in the range [0, 1]. Default: None.
+ rotate_method (str): rotation method used for the bounding boxes. Should be one of "largest_box" or "ellipse".
+ Default: "largest_box"
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+        image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ shift_limit=0.0625,
+ scale_limit=0.1,
+ rotate_limit=45,
+ interpolation=cv2.INTER_LINEAR,
+ border_mode=cv2.BORDER_REFLECT_101,
+ value=None,
+ mask_value=None,
+ shift_limit_x=None,
+ shift_limit_y=None,
+ rotate_method="largest_box",
+ always_apply=False,
+ p=0.5,
+ ):
+ super(ShiftScaleRotate, self).__init__(always_apply, p)
+ self.shift_limit_x = to_tuple(shift_limit_x if shift_limit_x is not None else shift_limit)
+ self.shift_limit_y = to_tuple(shift_limit_y if shift_limit_y is not None else shift_limit)
+ self.scale_limit = to_tuple(scale_limit, bias=1.0)
+ self.rotate_limit = to_tuple(rotate_limit)
+ self.interpolation = interpolation
+ self.border_mode = border_mode
+ self.value = value
+ self.mask_value = mask_value
+ self.rotate_method = rotate_method
+
+ if self.rotate_method not in ["largest_box", "ellipse"]:
+ raise ValueError(f"Rotation method {self.rotate_method} is not valid.")
+
+ def apply(self, img, angle=0, scale=0, dx=0, dy=0, interpolation=cv2.INTER_LINEAR, **params):
+ return F.shift_scale_rotate(img, angle, scale, dx, dy, interpolation, self.border_mode, self.value)
+
+ def apply_to_mask(self, img, angle=0, scale=0, dx=0, dy=0, **params):
+ return F.shift_scale_rotate(img, angle, scale, dx, dy, cv2.INTER_NEAREST, self.border_mode, self.mask_value)
+
+ def apply_to_keypoint(self, keypoint, angle=0, scale=0, dx=0, dy=0, rows=0, cols=0, **params):
+ return F.keypoint_shift_scale_rotate(keypoint, angle, scale, dx, dy, rows, cols)
+
+ def get_params(self):
+ return {
+ "angle": random.uniform(self.rotate_limit[0], self.rotate_limit[1]),
+ "scale": random.uniform(self.scale_limit[0], self.scale_limit[1]),
+ "dx": random.uniform(self.shift_limit_x[0], self.shift_limit_x[1]),
+ "dy": random.uniform(self.shift_limit_y[0], self.shift_limit_y[1]),
+ }
+
+ def apply_to_bbox(self, bbox, angle, scale, dx, dy, **params):
+ return F.bbox_shift_scale_rotate(bbox, angle, scale, dx, dy, self.rotate_method, **params)
+
+ def get_transform_init_args(self):
+ return {
+ "shift_limit_x": self.shift_limit_x,
+ "shift_limit_y": self.shift_limit_y,
+ "scale_limit": to_tuple(self.scale_limit, bias=-1.0),
+ "rotate_limit": self.rotate_limit,
+ "interpolation": self.interpolation,
+ "border_mode": self.border_mode,
+ "value": self.value,
+ "mask_value": self.mask_value,
+ "rotate_method": self.rotate_method,
+ }
+
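+# A minimal usage sketch (illustrative only):
+#
+#     aug = ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=45,
+#                            border_mode=cv2.BORDER_CONSTANT, value=0, p=1.0)
+#     aug.get_params()  # e.g. {"angle": -12.3, "scale": 1.04, "dx": 0.02, "dy": -0.01}
+#     out = aug(image=np.zeros((256, 256, 3), dtype=np.uint8))["image"]
+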
+
+class ElasticTransform(DualTransform):
+ """Elastic deformation of images as described in [Simard2003]_ (with modifications).
+ Based on https://gist.github.com/ernestum/601cdf56d2b424757de5
+
+ .. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
+ Convolutional Neural Networks applied to Visual Document Analysis", in
+ Proc. of the International Conference on Document Analysis and
+ Recognition, 2003.
+
+ Args:
+ alpha (float):
+ sigma (float): Gaussian filter parameter.
+ alpha_affine (float): The range will be (-alpha_affine, alpha_affine)
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ border_mode (OpenCV flag): flag that is used to specify the pixel extrapolation method. Should be one of:
+ cv2.BORDER_CONSTANT, cv2.BORDER_REPLICATE, cv2.BORDER_REFLECT, cv2.BORDER_WRAP, cv2.BORDER_REFLECT_101.
+ Default: cv2.BORDER_REFLECT_101
+ value (int, float, list of ints, list of float): padding value if border_mode is cv2.BORDER_CONSTANT.
+ mask_value (int, float,
+ list of ints,
+ list of float): padding value if border_mode is cv2.BORDER_CONSTANT applied for masks.
+ approximate (boolean): Whether to smooth displacement map with fixed kernel size.
+ Enabling this option gives ~2X speedup on large images.
+ same_dxdy (boolean): Whether to use same random generated shift for x and y.
+ Enabling this option gives ~2X speedup.
+
+ Targets:
+        image, mask, bboxes
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ alpha=1,
+ sigma=50,
+ alpha_affine=50,
+ interpolation=cv2.INTER_LINEAR,
+ border_mode=cv2.BORDER_REFLECT_101,
+ value=None,
+ mask_value=None,
+ always_apply=False,
+ approximate=False,
+ same_dxdy=False,
+ p=0.5,
+ ):
+ super(ElasticTransform, self).__init__(always_apply, p)
+ self.alpha = alpha
+ self.alpha_affine = alpha_affine
+ self.sigma = sigma
+ self.interpolation = interpolation
+ self.border_mode = border_mode
+ self.value = value
+ self.mask_value = mask_value
+ self.approximate = approximate
+ self.same_dxdy = same_dxdy
+
+ def apply(self, img, random_state=None, interpolation=cv2.INTER_LINEAR, **params):
+ return F.elastic_transform(
+ img,
+ self.alpha,
+ self.sigma,
+ self.alpha_affine,
+ interpolation,
+ self.border_mode,
+ self.value,
+ np.random.RandomState(random_state),
+ self.approximate,
+ self.same_dxdy,
+ )
+
+ def apply_to_mask(self, img, random_state=None, **params):
+ return F.elastic_transform(
+ img,
+ self.alpha,
+ self.sigma,
+ self.alpha_affine,
+ cv2.INTER_NEAREST,
+ self.border_mode,
+ self.mask_value,
+ np.random.RandomState(random_state),
+ self.approximate,
+ self.same_dxdy,
+ )
+
+ def apply_to_bbox(self, bbox, random_state=None, **params):
+ rows, cols = params["rows"], params["cols"]
+ mask = np.zeros((rows, cols), dtype=np.uint8)
+ bbox_denorm = F.denormalize_bbox(bbox, rows, cols)
+ x_min, y_min, x_max, y_max = bbox_denorm[:4]
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+ mask[y_min:y_max, x_min:x_max] = 1
+ mask = F.elastic_transform(
+ mask,
+ self.alpha,
+ self.sigma,
+ self.alpha_affine,
+ cv2.INTER_NEAREST,
+ self.border_mode,
+ self.mask_value,
+ np.random.RandomState(random_state),
+ self.approximate,
+ )
+ bbox_returned = bbox_from_mask(mask)
+ bbox_returned = F.normalize_bbox(bbox_returned, rows, cols)
+ return bbox_returned
+
+ def get_params(self):
+ return {"random_state": random.randint(0, 10000)}
+
+ def get_transform_init_args_names(self):
+ return (
+ "alpha",
+ "sigma",
+ "alpha_affine",
+ "interpolation",
+ "border_mode",
+ "value",
+ "mask_value",
+ "approximate",
+ "same_dxdy",
+ )
+
+
+class Perspective(DualTransform):
+ """Perform a random four point perspective transform of the input.
+
+ Args:
+ scale (float or (float, float)): standard deviation of the normal distributions. These are used to sample
+ the random distances of the subimage's corners from the full image's corners.
+ If scale is a single float value, the range will be (0, scale). Default: (0.05, 0.1).
+        keep_size (bool): Whether to resize images back to their original size after applying the perspective
+ transform. If set to False, the resulting images may end up having different shapes
+ and will always be a list, never an array. Default: True
+ pad_mode (OpenCV flag): OpenCV border mode.
+        pad_val (int, float, list of int, list of float): padding value if pad_mode is cv2.BORDER_CONSTANT.
+            Default: 0
+        mask_pad_val (int, float, list of int, list of float): padding value for masks
+            if pad_mode is cv2.BORDER_CONSTANT. Default: 0
+ fit_output (bool): If True, the image plane size and position will be adjusted to still capture
+ the whole image after perspective transformation. (Followed by image resizing if keep_size is set to True.)
+ Otherwise, parts of the transformed image may be outside of the image plane.
+ This setting should not be set to True when using large scale values as it could lead to very large images.
+ Default: False
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, keypoints, bboxes
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ scale=(0.05, 0.1),
+ keep_size=True,
+ pad_mode=cv2.BORDER_CONSTANT,
+ pad_val=0,
+ mask_pad_val=0,
+ fit_output=False,
+ interpolation=cv2.INTER_LINEAR,
+ always_apply=False,
+ p=0.5,
+ ):
+ super().__init__(always_apply, p)
+ self.scale = to_tuple(scale, 0)
+ self.keep_size = keep_size
+ self.pad_mode = pad_mode
+ self.pad_val = pad_val
+ self.mask_pad_val = mask_pad_val
+ self.fit_output = fit_output
+ self.interpolation = interpolation
+
+ def apply(self, img, matrix=None, max_height=None, max_width=None, **params):
+ return F.perspective(
+ img, matrix, max_width, max_height, self.pad_val, self.pad_mode, self.keep_size, params["interpolation"]
+ )
+
+ def apply_to_bbox(self, bbox, matrix=None, max_height=None, max_width=None, **params):
+ return F.perspective_bbox(bbox, params["rows"], params["cols"], matrix, max_width, max_height, self.keep_size)
+
+ def apply_to_keypoint(self, keypoint, matrix=None, max_height=None, max_width=None, **params):
+ return F.perspective_keypoint(
+ keypoint, params["rows"], params["cols"], matrix, max_width, max_height, self.keep_size
+ )
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params):
+ h, w = params["image"].shape[:2]
+
+ scale = random_utils.uniform(*self.scale)
+ points = random_utils.normal(0, scale, [4, 2])
+ points = np.mod(np.abs(points), 0.32)
+
+ # top left -- no changes needed, just use jitter
+ # top right
+ points[1, 0] = 1.0 - points[1, 0] # w = 1.0 - jitter
+ # bottom right
+        points[2] = 1.0 - points[2]  # w, h = 1.0 - jitter
+ # bottom left
+ points[3, 1] = 1.0 - points[3, 1] # h = 1.0 - jitter
+
+ points[:, 0] *= w
+ points[:, 1] *= h
+
+ # Obtain a consistent order of the points and unpack them individually.
+ # Warning: don't just do (tl, tr, br, bl) = _order_points(...)
+        # here, because the reordered points are used further below.
+ points = self._order_points(points)
+ tl, tr, br, bl = points
+
+ # compute the width of the new image, which will be the
+ # maximum distance between bottom-right and bottom-left
+        # x-coordinates or the top-right and top-left x-coordinates
+ min_width = None
+ max_width = None
+ while min_width is None or min_width < 2:
+ width_top = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
+ width_bottom = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
+ max_width = int(max(width_top, width_bottom))
+ min_width = int(min(width_top, width_bottom))
+ if min_width < 2:
+ step_size = (2 - min_width) / 2
+ tl[0] -= step_size
+ tr[0] += step_size
+ bl[0] -= step_size
+ br[0] += step_size
+
+ # compute the height of the new image, which will be the maximum distance between the top-right
+ # and bottom-right y-coordinates or the top-left and bottom-left y-coordinates
+ min_height = None
+ max_height = None
+ while min_height is None or min_height < 2:
+ height_right = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
+ height_left = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
+ max_height = int(max(height_right, height_left))
+ min_height = int(min(height_right, height_left))
+ if min_height < 2:
+ step_size = (2 - min_height) / 2
+ tl[1] -= step_size
+ tr[1] -= step_size
+ bl[1] += step_size
+ br[1] += step_size
+
+ # now that we have the dimensions of the new image, construct
+ # the set of destination points to obtain a "birds eye view",
+ # (i.e. top-down view) of the image, again specifying points
+ # in the top-left, top-right, bottom-right, and bottom-left order
+ # do not use width-1 or height-1 here, as for e.g. width=3, height=2
+ # the bottom right coordinate is at (3.0, 2.0) and not (2.0, 1.0)
+ dst = np.array([[0, 0], [max_width, 0], [max_width, max_height], [0, max_height]], dtype=np.float32)
+
+ # compute the perspective transform matrix and then apply it
+ m = cv2.getPerspectiveTransform(points, dst)
+
+ if self.fit_output:
+ m, max_width, max_height = self._expand_transform(m, (h, w))
+
+ return {"matrix": m, "max_height": max_height, "max_width": max_width, "interpolation": self.interpolation}
+
+ @classmethod
+ def _expand_transform(cls, matrix, shape):
+ height, width = shape
+        # do not use width-1 or height-1 here, as for e.g. width=3, height=2
+ # the bottom right coordinate is at (3.0, 2.0) and not (2.0, 1.0)
+ rect = np.array([[0, 0], [width, 0], [width, height], [0, height]], dtype=np.float32)
+ dst = cv2.perspectiveTransform(np.array([rect]), matrix)[0]
+
+ # get min x, y over transformed 4 points
+ # then modify target points by subtracting these minima => shift to (0, 0)
+ dst -= dst.min(axis=0, keepdims=True)
+ dst = np.around(dst, decimals=0)
+
+ matrix_expanded = cv2.getPerspectiveTransform(rect, dst)
+ max_width, max_height = dst.max(axis=0)
+ return matrix_expanded, int(max_width), int(max_height)
+
+ @staticmethod
+ def _order_points(pts: np.ndarray) -> np.ndarray:
+ pts = np.array(sorted(pts, key=lambda x: x[0]))
+ left = pts[:2] # points with smallest x coordinate - left points
+ right = pts[2:] # points with greatest x coordinate - right points
+
+ if left[0][1] < left[1][1]:
+ tl, bl = left
+ else:
+ bl, tl = left
+
+ if right[0][1] < right[1][1]:
+ tr, br = right
+ else:
+ br, tr = right
+
+ return np.array([tl, tr, br, bl], dtype=np.float32)
+
+ def get_transform_init_args_names(self):
+ return "scale", "keep_size", "pad_mode", "pad_val", "mask_pad_val", "fit_output", "interpolation"
+
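+# A quick check of the corner ordering used above (illustrative only):
+#
+#     pts = np.array([[90.0, 80.0], [10.0, 5.0], [95.0, 10.0], [5.0, 85.0]])
+#     Perspective._order_points(pts)
+#     # -> [[10.,  5.],   top-left
+#     #     [95., 10.],   top-right
+#     #     [90., 80.],   bottom-right
+#     #     [ 5., 85.]]   bottom-left
+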
+
+class Affine(DualTransform):
+ """Augmentation to apply affine transformations to images.
+ This is mostly a wrapper around the corresponding classes and functions in OpenCV.
+
+ Affine transformations involve:
+
+ - Translation ("move" image on the x-/y-axis)
+ - Rotation
+ - Scaling ("zoom" in/out)
+ - Shear (move one side of the image, turning a square into a trapezoid)
+
+ All such transformations can create "new" pixels in the image without a defined content, e.g.
+ if the image is translated to the left, pixels are created on the right.
+ A method has to be defined to deal with these pixel values.
+ The parameters `cval` and `mode` of this class deal with this.
+
+ Some transformations involve interpolations between several pixels
+    of the input image to generate output pixel values. The parameters `interpolation` and
+    `mask_interpolation` deal with the method of interpolation used for this.
+
+ Args:
+ scale (number, tuple of number or dict): Scaling factor to use, where ``1.0`` denotes "no change" and
+ ``0.5`` is zoomed out to ``50`` percent of the original size.
+ * If a single number, then that value will be used for all images.
+ * If a tuple ``(a, b)``, then a value will be uniformly sampled per image from the interval ``[a, b]``.
+              The same range will be used for both x- and y-axis. To keep the aspect ratio, set
+ ``keep_ratio=True``, then the same value will be used for both x- and y-axis.
+ * If a dictionary, then it is expected to have the keys ``x`` and/or ``y``.
+ Each of these keys can have the same values as described above.
+              Using a dictionary allows setting different values for the two axes, and sampling will then happen
+              *independently* per axis, resulting in samples that differ between the axes. Note that when
+              ``keep_ratio=True``, the x- and y-axis ranges should be the same.
+ translate_percent (None, number, tuple of number or dict): Translation as a fraction of the image height/width
+ (x-translation, y-translation), where ``0`` denotes "no change"
+ and ``0.5`` denotes "half of the axis size".
+ * If ``None`` then equivalent to ``0.0`` unless `translate_px` has a value other than ``None``.
+ * If a single number, then that value will be used for all images.
+ * If a tuple ``(a, b)``, then a value will be uniformly sampled per image from the interval ``[a, b]``.
+ That sampled fraction value will be used identically for both x- and y-axis.
+ * If a dictionary, then it is expected to have the keys ``x`` and/or ``y``.
+ Each of these keys can have the same values as described above.
+ Using a dictionary allows setting different values for the two axes; sampling will then happen
+ *independently* per axis, resulting in samples that differ between the axes.
+ translate_px (None, int, tuple of int or dict): Translation in pixels.
+ * If ``None`` then equivalent to ``0`` unless `translate_percent` has a value other than ``None``.
+ * If a single int, then that value will be used for all images.
+ * If a tuple ``(a, b)``, then a value will be uniformly sampled per image from
+ the discrete interval ``[a..b]``. That number will be used identically for both x- and y-axis.
+ * If a dictionary, then it is expected to have the keys ``x`` and/or ``y``.
+ Each of these keys can have the same values as described above.
+ Using a dictionary allows setting different values for the two axes; sampling will then happen
+ *independently* per axis, resulting in samples that differ between the axes.
+ rotate (number or tuple of number): Rotation in degrees (**NOT** radians), i.e. expected value range is
+ around ``[-360, 360]``. Rotation happens around the *center* of the image,
+ not the top left corner as in some other frameworks.
+ * If a number, then that value will be used for all images.
+ * If a tuple ``(a, b)``, then a value will be uniformly sampled per image from the interval ``[a, b]``
+ and used as the rotation value.
+ shear (number, tuple of number or dict): Shear in degrees (**NOT** radians), i.e. expected value range is
+ around ``[-360, 360]``, with reasonable values being in the range of ``[-45, 45]``.
+ * If a number, then that value will be used for all images as
+ the shear on the x-axis (no shear on the y-axis will be done).
+ * If a tuple ``(a, b)``, then two values will be uniformly sampled per image
+ from the interval ``[a, b]`` and used as the x- and y-shear values.
+ * If a dictionary, then it is expected to have the keys ``x`` and/or ``y``.
+ Each of these keys can have the same values as described above.
+ Using a dictionary allows setting different values for the two axes; sampling will then happen
+ *independently* per axis, resulting in samples that differ between the axes.
+ interpolation (int): OpenCV interpolation flag.
+ mask_interpolation (int): OpenCV interpolation flag.
+ cval (number or sequence of number): The constant value to use when filling in newly created pixels.
+ (E.g. translating by 1px to the right will create a new 1px-wide column of pixels
+ on the left of the image).
+ The value is only used when `mode=constant`. The expected value range is ``[0, 255]`` for ``uint8`` images.
+ cval_mask (number or tuple of number): Same as cval but only for masks.
+ mode (int): OpenCV border flag.
+ fit_output (bool): If True, the image plane size and position will be adjusted to tightly capture
+ the whole image after affine transformation (`translate_percent` and `translate_px` are ignored).
+ Otherwise (``False``), parts of the transformed image may end up outside the image plane.
+ Fitting the output shape can be useful to avoid corners of the image being outside the image plane
+ after applying rotations. Default: False
+ keep_ratio (bool): When True, the original aspect ratio will be kept when the random scale is applied.
+ Default: False.
+ rotate_method (str): rotation method used for the bounding boxes. Should be one of "largest_box" or
+ "ellipse"[1].
+ Default: "largest_box"
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, keypoints, bboxes
+
+ Image types:
+ uint8, float32
+
+ Reference:
+ [1] https://arxiv.org/abs/2109.13488
+ """
+
+ def __init__(
+ self,
+ scale: Optional[Union[float, Sequence[float], dict]] = None,
+ translate_percent: Optional[Union[float, Sequence[float], dict]] = None,
+ translate_px: Optional[Union[int, Sequence[int], dict]] = None,
+ rotate: Optional[Union[float, Sequence[float]]] = None,
+ shear: Optional[Union[float, Sequence[float], dict]] = None,
+ interpolation: int = cv2.INTER_LINEAR,
+ mask_interpolation: int = cv2.INTER_NEAREST,
+ cval: Union[int, float, Sequence[int], Sequence[float]] = 0,
+ cval_mask: Union[int, float, Sequence[int], Sequence[float]] = 0,
+ mode: int = cv2.BORDER_CONSTANT,
+ fit_output: bool = False,
+ keep_ratio: bool = False,
+ rotate_method: str = "largest_box",
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply=always_apply, p=p)
+
+ params = [scale, translate_percent, translate_px, rotate, shear]
+ if all([p is None for p in params]):
+ scale = {"x": (0.9, 1.1), "y": (0.9, 1.1)}
+ translate_percent = {"x": (-0.1, 0.1), "y": (-0.1, 0.1)}
+ rotate = (-15, 15)
+ shear = {"x": (-10, 10), "y": (-10, 10)}
+ else:
+ scale = scale if scale is not None else 1.0
+ rotate = rotate if rotate is not None else 0.0
+ shear = shear if shear is not None else 0.0
+
+ self.interpolation = interpolation
+ self.mask_interpolation = mask_interpolation
+ self.cval = cval
+ self.cval_mask = cval_mask
+ self.mode = mode
+ self.scale = self._handle_dict_arg(scale, "scale")
+ self.translate_percent, self.translate_px = self._handle_translate_arg(translate_px, translate_percent)
+ self.rotate = to_tuple(rotate, rotate)
+ self.fit_output = fit_output
+ self.shear = self._handle_dict_arg(shear, "shear")
+ self.keep_ratio = keep_ratio
+ self.rotate_method = rotate_method
+
+ if self.keep_ratio and self.scale["x"] != self.scale["y"]:
+ raise ValueError(
+ "When keep_ratio is True, the x and y scale range should be identical. got {}".format(self.scale)
+ )
+
+ def get_transform_init_args_names(self):
+ return (
+ "interpolation",
+ "mask_interpolation",
+ "cval",
+ "mode",
+ "scale",
+ "translate_percent",
+ "translate_px",
+ "rotate",
+ "fit_output",
+ "shear",
+ "cval_mask",
+ "keep_ratio",
+ "rotate_method",
+ )
+
+ @staticmethod
+ def _handle_dict_arg(val: Union[float, Sequence[float], dict], name: str, default: float = 1.0):
+ if isinstance(val, dict):
+ if "x" not in val and "y" not in val:
+ raise ValueError(
+ f'Expected {name} dictionary to contain at least key "x" or ' 'key "y". Found neither of them.'
+ )
+ x = val.get("x", default)
+ y = val.get("y", default)
+ return {"x": to_tuple(x, x), "y": to_tuple(y, y)}
+ return {"x": to_tuple(val, val), "y": to_tuple(val, val)}
+
+ @classmethod
+ def _handle_translate_arg(
+ cls,
+ translate_px: Optional[Union[float, Sequence[float], dict]],
+ translate_percent: Optional[Union[float, Sequence[float], dict]],
+ ):
+ if translate_percent is None and translate_px is None:
+ translate_px = 0
+
+ if translate_percent is not None and translate_px is not None:
+ raise ValueError(
+ "Expected either translate_percent or translate_px to be " "provided, " "but neither of them was."
+ )
+
+ if translate_percent is not None:
+ # translate by percent
+ return cls._handle_dict_arg(translate_percent, "translate_percent", default=0.0), translate_px
+
+ if translate_px is None:
+ raise ValueError("translate_px is None.")
+ # translate by pixels
+ return translate_percent, cls._handle_dict_arg(translate_px, "translate_px")
+
+ def apply(
+ self,
+ img: np.ndarray,
+ matrix: skimage.transform.ProjectiveTransform = None,
+ output_shape: Sequence[int] = (),
+ **params
+ ) -> np.ndarray:
+ return F.warp_affine(
+ img,
+ matrix,
+ interpolation=self.interpolation,
+ cval=self.cval,
+ mode=self.mode,
+ output_shape=output_shape,
+ )
+
+ def apply_to_mask(
+ self,
+ img: np.ndarray,
+ matrix: skimage.transform.ProjectiveTransform = None,
+ output_shape: Sequence[int] = (),
+ **params
+ ) -> np.ndarray:
+ return F.warp_affine(
+ img,
+ matrix,
+ interpolation=self.mask_interpolation,
+ cval=self.cval_mask,
+ mode=self.mode,
+ output_shape=output_shape,
+ )
+
+ def apply_to_bbox(
+ self,
+ bbox: BoxInternalType,
+ matrix: skimage.transform.ProjectiveTransform = None,
+ rows: int = 0,
+ cols: int = 0,
+ output_shape: Sequence[int] = (),
+ **params
+ ) -> BoxInternalType:
+ return F.bbox_affine(bbox, matrix, self.rotate_method, rows, cols, output_shape)
+
+ def apply_to_keypoint(
+ self,
+ keypoint: KeypointInternalType,
+ matrix: Optional[skimage.transform.ProjectiveTransform] = None,
+ scale: Optional[dict] = None,
+ **params
+ ) -> KeypointInternalType:
+ assert scale is not None and matrix is not None
+ return F.keypoint_affine(keypoint, matrix=matrix, scale=scale)
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params: dict) -> dict:
+ h, w = params["image"].shape[:2]
+
+ translate: Dict[str, Union[int, float]]
+ if self.translate_px is not None:
+ translate = {key: random.randint(*value) for key, value in self.translate_px.items()}
+ elif self.translate_percent is not None:
+ translate = {key: random.uniform(*value) for key, value in self.translate_percent.items()}
+ translate["x"] = translate["x"] * w
+ translate["y"] = translate["y"] * h
+ else:
+ translate = {"x": 0, "y": 0}
+
+ # Look to issue https://github.com/albumentations-team/albumentations/issues/1079
+ shear = {key: -random.uniform(*value) for key, value in self.shear.items()}
+ scale = {key: random.uniform(*value) for key, value in self.scale.items()}
+ if self.keep_ratio:
+ scale["y"] = scale["x"]
+
+ # Look to issue https://github.com/albumentations-team/albumentations/issues/1079
+ rotate = -random.uniform(*self.rotate)
+
+ # for images we use additional shifts of (0.5, 0.5) as otherwise
+ # we get an ugly black border for 90deg rotations
+ shift_x = w / 2 - 0.5
+ shift_y = h / 2 - 0.5
+
+ matrix_to_topleft = skimage.transform.SimilarityTransform(translation=[-shift_x, -shift_y])
+ matrix_shear_y_rot = skimage.transform.AffineTransform(rotation=-np.pi / 2)
+ matrix_shear_y = skimage.transform.AffineTransform(shear=np.deg2rad(shear["y"]))
+ matrix_shear_y_rot_inv = skimage.transform.AffineTransform(rotation=np.pi / 2)
+ matrix_transforms = skimage.transform.AffineTransform(
+ scale=(scale["x"], scale["y"]),
+ translation=(translate["x"], translate["y"]),
+ rotation=np.deg2rad(rotate),
+ shear=np.deg2rad(shear["x"]),
+ )
+ matrix_to_center = skimage.transform.SimilarityTransform(translation=[shift_x, shift_y])
+ matrix = (
+ matrix_to_topleft
+ + matrix_shear_y_rot
+ + matrix_shear_y
+ + matrix_shear_y_rot_inv
+ + matrix_transforms
+ + matrix_to_center
+ )
+ if self.fit_output:
+ matrix, output_shape = self._compute_affine_warp_output_shape(matrix, params["image"].shape)
+ else:
+ output_shape = params["image"].shape
+
+ return {
+ "rotate": rotate,
+ "scale": scale,
+ "matrix": matrix,
+ "output_shape": output_shape,
+ }
+
+ @staticmethod
+ def _compute_affine_warp_output_shape(
+ matrix: skimage.transform.ProjectiveTransform, input_shape: Sequence[int]
+ ) -> Tuple[skimage.transform.ProjectiveTransform, Sequence[int]]:
+ height, width = input_shape[:2]
+
+ if height == 0 or width == 0:
+ return matrix, input_shape
+
+ # determine shape of output image
+ corners = np.array([[0, 0], [0, height - 1], [width - 1, height - 1], [width - 1, 0]])
+ corners = matrix(corners)
+ minc = corners[:, 0].min()
+ minr = corners[:, 1].min()
+ maxc = corners[:, 0].max()
+ maxr = corners[:, 1].max()
+ out_height = maxr - minr + 1
+ out_width = maxc - minc + 1
+ if len(input_shape) == 3:
+ output_shape = np.ceil((out_height, out_width, input_shape[2]))
+ else:
+ output_shape = np.ceil((out_height, out_width))
+ output_shape_tuple = tuple([int(v) for v in output_shape.tolist()])
+ # fit output image in new shape
+ translation = (-minc, -minr)
+ matrix_to_fit = skimage.transform.SimilarityTransform(translation=translation)
+ matrix = matrix + matrix_to_fit
+ return matrix, output_shape_tuple
+
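+ # Usage sketch (illustrative, not part of the upstream albumentations code; assumes a
+ # uint8 HxWx3 `image` array is available). Dictionary arguments sample x and y
+ # independently, while tuples sample one value shared by both axes:
+ #
+ #     transform = Affine(
+ #         scale={"x": (0.9, 1.1), "y": (0.9, 1.1)},
+ #         translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
+ #         rotate=(-15, 15),
+ #         shear={"x": (-10, 10), "y": (-10, 10)},
+ #         p=1.0,
+ #     )
+ #     augmented = transform(image=image)["image"]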
+
+class PiecewiseAffine(DualTransform):
+ """Apply affine transformations that differ between local neighbourhoods.
+ This augmentation places a regular grid of points on an image and randomly moves the neighbourhood of these point
+ around via affine transformations. This leads to local distortions.
+
+ This is mostly a wrapper around scikit-image's ``PiecewiseAffineTransform``.
+ See also ``Affine`` for a similar technique.
+
+ Note:
+ This augmenter is very slow. Try to use ``ElasticTransformation`` instead, which is at least 10x faster.
+
+ Note:
+ For coordinate-based inputs (keypoints, bounding boxes, polygons, ...),
+ this augmenter still has to perform an image-based augmentation,
+ which makes it significantly slower for such inputs than other transforms and not fully exact.
+
+ Args:
+ scale (float, tuple of float): Each point on the regular grid is moved around via a normal distribution.
+ This scale factor is equivalent to the normal distribution's sigma.
+ Note that the jitter (how far each point is moved in which direction) is multiplied by the height/width of
+ the image if ``absolute_scale=False`` (default), so this scale can be the same for different sized images.
+ Recommended values are in the range ``0.01`` to ``0.05`` (weak to strong augmentations).
+ * If a single ``float``, then that value will always be used as the scale.
+ * If a tuple ``(a, b)`` of ``float`` s, then a random value will
+ be uniformly sampled per image from the interval ``[a, b]``.
+ nb_rows (int, tuple of int): Number of rows of points that the regular grid should have.
+ Must be at least ``2``. For large images, you might want to pick a higher value than ``4``.
+ You might have to then adjust scale to lower values.
+ * If a single ``int``, then that value will always be used as the number of rows.
+ * If a tuple ``(a, b)``, then a value from the discrete interval
+ ``[a..b]`` will be uniformly sampled per image.
+ nb_cols (int, tuple of int): Number of columns. Analogous to `nb_rows`.
+ interpolation (int): The order of interpolation. The order has to be in the range 0-5:
+ - 0: Nearest-neighbor
+ - 1: Bi-linear (default)
+ - 2: Bi-quadratic
+ - 3: Bi-cubic
+ - 4: Bi-quartic
+ - 5: Bi-quintic
+ mask_interpolation (int): same as interpolation but for mask.
+ cval (number): The constant value to use when filling in newly created pixels.
+ cval_mask (number): Same as cval but only for masks.
+ mode (str): {'constant', 'edge', 'symmetric', 'reflect', 'wrap'}, optional
+ Points outside the boundaries of the input are filled according
+ to the given mode. Modes match the behaviour of `numpy.pad`.
+ absolute_scale (bool): Take `scale` as an absolute value rather than a relative value.
+ keypoints_threshold (float): Used as threshold in conversion from distance maps to keypoints.
+ The search for keypoints works by searching for the
+ argmin (non-inverted) or argmax (inverted) in each channel. This
+ parameter contains the maximum (non-inverted) or minimum (inverted) value to accept in order to view a hit
+ as a keypoint. Use ``None`` to use no min/max. Default: 0.01
+
+ Targets:
+ image, mask, keypoints, bboxes
+
+ Image types:
+ uint8, float32
+
+ """
+
+ def __init__(
+ self,
+ scale: ScaleFloatType = (0.03, 0.05),
+ nb_rows: Union[int, Sequence[int]] = 4,
+ nb_cols: Union[int, Sequence[int]] = 4,
+ interpolation: int = 1,
+ mask_interpolation: int = 0,
+ cval: int = 0,
+ cval_mask: int = 0,
+ mode: str = "constant",
+ absolute_scale: bool = False,
+ always_apply: bool = False,
+ keypoints_threshold: float = 0.01,
+ p: float = 0.5,
+ ):
+ super(PiecewiseAffine, self).__init__(always_apply, p)
+
+ self.scale = to_tuple(scale, scale)
+ self.nb_rows = to_tuple(nb_rows, nb_rows)
+ self.nb_cols = to_tuple(nb_cols, nb_cols)
+ self.interpolation = interpolation
+ self.mask_interpolation = mask_interpolation
+ self.cval = cval
+ self.cval_mask = cval_mask
+ self.mode = mode
+ self.absolute_scale = absolute_scale
+ self.keypoints_threshold = keypoints_threshold
+
+ def get_transform_init_args_names(self):
+ return (
+ "scale",
+ "nb_rows",
+ "nb_cols",
+ "interpolation",
+ "mask_interpolation",
+ "cval",
+ "cval_mask",
+ "mode",
+ "absolute_scale",
+ "keypoints_threshold",
+ )
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params) -> dict:
+ h, w = params["image"].shape[:2]
+
+ nb_rows = np.clip(random.randint(*self.nb_rows), 2, None)
+ nb_cols = np.clip(random.randint(*self.nb_cols), 2, None)
+ nb_cells = nb_cols * nb_rows
+ scale = random.uniform(*self.scale)
+
+ jitter: np.ndarray = random_utils.normal(0, scale, (nb_cells, 2))
+ if not np.any(jitter > 0):
+ for i in range(10): # See: https://github.com/albumentations-team/albumentations/issues/1442
+ jitter = random_utils.normal(0, scale, (nb_cells, 2))
+ if np.any(jitter > 0):
+ break
+ if not np.any(jitter > 0):
+ return {"matrix": None}
+
+ y = np.linspace(0, h, nb_rows)
+ x = np.linspace(0, w, nb_cols)
+
+ # (H, W) and (H, W) for H=rows, W=cols
+ xx_src, yy_src = np.meshgrid(x, y)
+
+ # (1, HW, 2) => (HW, 2) for H=rows, W=cols
+ points_src = np.dstack([yy_src.flat, xx_src.flat])[0]
+
+ if self.absolute_scale:
+ jitter[:, 0] = jitter[:, 0] / h if h > 0 else 0.0
+ jitter[:, 1] = jitter[:, 1] / w if w > 0 else 0.0
+
+ jitter[:, 0] = jitter[:, 0] * h
+ jitter[:, 1] = jitter[:, 1] * w
+
+ points_dest = np.copy(points_src)
+ points_dest[:, 0] = points_dest[:, 0] + jitter[:, 0]
+ points_dest[:, 1] = points_dest[:, 1] + jitter[:, 1]
+
+ # Restrict all destination points to be inside the image plane.
+ # This is necessary, as otherwise keypoints could be augmented
+ # outside of the image plane and these would be replaced by
+ # (-1, -1), which would not conform with the behaviour of the other augmenters.
+ points_dest[:, 0] = np.clip(points_dest[:, 0], 0, h - 1)
+ points_dest[:, 1] = np.clip(points_dest[:, 1], 0, w - 1)
+
+ matrix = skimage.transform.PiecewiseAffineTransform()
+ matrix.estimate(points_src[:, ::-1], points_dest[:, ::-1])
+
+ return {
+ "matrix": matrix,
+ }
+
+ def apply(
+ self, img: np.ndarray, matrix: Optional[skimage.transform.PiecewiseAffineTransform] = None, **params
+ ) -> np.ndarray:
+ return F.piecewise_affine(img, matrix, self.interpolation, self.mode, self.cval)
+
+ def apply_to_mask(
+ self, img: np.ndarray, matrix: Optional[skimage.transform.PiecewiseAffineTransform] = None, **params
+ ) -> np.ndarray:
+ return F.piecewise_affine(img, matrix, self.mask_interpolation, self.mode, self.cval_mask)
+
+ def apply_to_bbox(
+ self,
+ bbox: BoxInternalType,
+ rows: int = 0,
+ cols: int = 0,
+ matrix: Optional[skimage.transform.PiecewiseAffineTransform] = None,
+ **params
+ ) -> BoxInternalType:
+ return F.bbox_piecewise_affine(bbox, matrix, rows, cols, self.keypoints_threshold)
+
+ def apply_to_keypoint(
+ self,
+ keypoint: KeypointInternalType,
+ rows: int = 0,
+ cols: int = 0,
+ matrix: Optional[skimage.transform.PiecewiseAffineTransform] = None,
+ **params
+ ):
+ return F.keypoint_piecewise_affine(keypoint, matrix, rows, cols, self.keypoints_threshold)
+
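+ # Usage sketch (illustrative, not part of the upstream albumentations code). PiecewiseAffine
+ # jitters a regular grid of control points with a normal distribution whose sigma is
+ # `scale` (relative to the image size unless absolute_scale=True), then estimates a
+ # skimage PiecewiseAffineTransform between the original and jittered grids:
+ #
+ #     transform = PiecewiseAffine(scale=(0.03, 0.05), nb_rows=4, nb_cols=4, p=1.0)
+ #     distorted = transform(image=image, mask=mask)  # dict with "image" and "mask"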
+
+class PadIfNeeded(DualTransform):
+ """Pad side of the image / max if side is less than desired number.
+
+ Args:
+ min_height (int): minimal result image height.
+ min_width (int): minimal result image width.
+ pad_height_divisor (int): if not None, ensures the image height is divisible by the value of this argument.
+ pad_width_divisor (int): if not None, ensures the image width is divisible by the value of this argument.
+ position (Union[str, PositionType]): Position of the image. Should be PositionType.CENTER, PositionType.TOP_LEFT,
+ PositionType.TOP_RIGHT, PositionType.BOTTOM_LEFT, PositionType.BOTTOM_RIGHT,
+ or PositionType.RANDOM. Default: PositionType.CENTER.
+ border_mode (OpenCV flag): OpenCV border mode.
+ value (int, float, list of int, list of float): padding value if border_mode is cv2.BORDER_CONSTANT.
+ mask_value (int, float, list of int, list of float): padding value for mask if border_mode is cv2.BORDER_CONSTANT.
+ p (float): probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image, mask, bbox, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ class PositionType(Enum):
+ CENTER = "center"
+ TOP_LEFT = "top_left"
+ TOP_RIGHT = "top_right"
+ BOTTOM_LEFT = "bottom_left"
+ BOTTOM_RIGHT = "bottom_right"
+ RANDOM = "random"
+
+ def __init__(
+ self,
+ min_height: Optional[int] = 1024,
+ min_width: Optional[int] = 1024,
+ pad_height_divisor: Optional[int] = None,
+ pad_width_divisor: Optional[int] = None,
+ position: Union[PositionType, str] = PositionType.CENTER,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+ mask_value: Optional[ImageColorType] = None,
+ always_apply: bool = False,
+ p: float = 1.0,
+ ):
+ if (min_height is None) == (pad_height_divisor is None):
+ raise ValueError("Only one of 'min_height' and 'pad_height_divisor' parameters must be set")
+
+ if (min_width is None) == (pad_width_divisor is None):
+ raise ValueError("Only one of 'min_width' and 'pad_width_divisor' parameters must be set")
+
+ super(PadIfNeeded, self).__init__(always_apply, p)
+ self.min_height = min_height
+ self.min_width = min_width
+ self.pad_width_divisor = pad_width_divisor
+ self.pad_height_divisor = pad_height_divisor
+ self.position = PadIfNeeded.PositionType(position)
+ self.border_mode = border_mode
+ self.value = value
+ self.mask_value = mask_value
+
+ def update_params(self, params, **kwargs):
+ params = super(PadIfNeeded, self).update_params(params, **kwargs)
+ rows = params["rows"]
+ cols = params["cols"]
+
+ if self.min_height is not None:
+ if rows < self.min_height:
+ h_pad_top = int((self.min_height - rows) / 2.0)
+ h_pad_bottom = self.min_height - rows - h_pad_top
+ else:
+ h_pad_top = 0
+ h_pad_bottom = 0
+ else:
+ pad_remainder = rows % self.pad_height_divisor
+ pad_rows = self.pad_height_divisor - pad_remainder if pad_remainder > 0 else 0
+
+ h_pad_top = pad_rows // 2
+ h_pad_bottom = pad_rows - h_pad_top
+
+ if self.min_width is not None:
+ if cols < self.min_width:
+ w_pad_left = int((self.min_width - cols) / 2.0)
+ w_pad_right = self.min_width - cols - w_pad_left
+ else:
+ w_pad_left = 0
+ w_pad_right = 0
+ else:
+ pad_remainder = cols % self.pad_width_divisor
+ pad_cols = self.pad_width_divisor - pad_remainder if pad_remainder > 0 else 0
+
+ w_pad_left = pad_cols // 2
+ w_pad_right = pad_cols - w_pad_left
+
+ h_pad_top, h_pad_bottom, w_pad_left, w_pad_right = self.__update_position_params(
+ h_top=h_pad_top, h_bottom=h_pad_bottom, w_left=w_pad_left, w_right=w_pad_right
+ )
+
+ params.update(
+ {
+ "pad_top": h_pad_top,
+ "pad_bottom": h_pad_bottom,
+ "pad_left": w_pad_left,
+ "pad_right": w_pad_right,
+ }
+ )
+ return params
+
+ def apply(
+ self, img: np.ndarray, pad_top: int = 0, pad_bottom: int = 0, pad_left: int = 0, pad_right: int = 0, **params
+ ) -> np.ndarray:
+ return F.pad_with_params(
+ img,
+ pad_top,
+ pad_bottom,
+ pad_left,
+ pad_right,
+ border_mode=self.border_mode,
+ value=self.value,
+ )
+
+ def apply_to_mask(
+ self, img: np.ndarray, pad_top: int = 0, pad_bottom: int = 0, pad_left: int = 0, pad_right: int = 0, **params
+ ) -> np.ndarray:
+ return F.pad_with_params(
+ img,
+ pad_top,
+ pad_bottom,
+ pad_left,
+ pad_right,
+ border_mode=self.border_mode,
+ value=self.mask_value,
+ )
+
+ def apply_to_bbox(
+ self,
+ bbox: BoxInternalType,
+ pad_top: int = 0,
+ pad_bottom: int = 0,
+ pad_left: int = 0,
+ pad_right: int = 0,
+ rows: int = 0,
+ cols: int = 0,
+ **params
+ ) -> BoxInternalType:
+ x_min, y_min, x_max, y_max = denormalize_bbox(bbox, rows, cols)[:4]
+ bbox = x_min + pad_left, y_min + pad_top, x_max + pad_left, y_max + pad_top
+ return normalize_bbox(bbox, rows + pad_top + pad_bottom, cols + pad_left + pad_right)
+
+ def apply_to_keypoint(
+ self,
+ keypoint: KeypointInternalType,
+ pad_top: int = 0,
+ pad_bottom: int = 0,
+ pad_left: int = 0,
+ pad_right: int = 0,
+ **params
+ ) -> KeypointInternalType:
+ x, y, angle, scale = keypoint[:4]
+ return x + pad_left, y + pad_top, angle, scale
+
+ def get_transform_init_args_names(self):
+ return (
+ "min_height",
+ "min_width",
+ "pad_height_divisor",
+ "pad_width_divisor",
+ "border_mode",
+ "value",
+ "mask_value",
+ )
+
+ def __update_position_params(
+ self, h_top: int, h_bottom: int, w_left: int, w_right: int
+ ) -> Tuple[int, int, int, int]:
+ if self.position == PadIfNeeded.PositionType.TOP_LEFT:
+ h_bottom += h_top
+ w_right += w_left
+ h_top = 0
+ w_left = 0
+
+ elif self.position == PadIfNeeded.PositionType.TOP_RIGHT:
+ h_bottom += h_top
+ w_left += w_right
+ h_top = 0
+ w_right = 0
+
+ elif self.position == PadIfNeeded.PositionType.BOTTOM_LEFT:
+ h_top += h_bottom
+ w_right += w_left
+ h_bottom = 0
+ w_left = 0
+
+ elif self.position == PadIfNeeded.PositionType.BOTTOM_RIGHT:
+ h_top += h_bottom
+ w_left += w_right
+ h_bottom = 0
+ w_right = 0
+
+ elif self.position == PadIfNeeded.PositionType.RANDOM:
+ h_pad = h_top + h_bottom
+ w_pad = w_left + w_right
+ h_top = random.randint(0, h_pad)
+ h_bottom = h_pad - h_top
+ w_left = random.randint(0, w_pad)
+ w_right = w_pad - w_left
+
+ return h_top, h_bottom, w_left, w_right
+
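+ # Worked example (illustrative, not part of the upstream albumentations code). With
+ # min_height=min_width=1024 and a 700x500 input, update_params computes
+ # pad_top=162, pad_bottom=162, pad_left=262, pad_right=262 for the CENTER position.
+ # With pad_height_divisor=32 instead of min_height, a height of 700 gives
+ # 700 % 32 == 28, so 4 rows of padding are added (2 on top, 2 on bottom):
+ #
+ #     pad = PadIfNeeded(min_height=1024, min_width=1024, border_mode=cv2.BORDER_CONSTANT, value=0, p=1.0)
+ #     padded = pad(image=image)["image"]  # spatial shape becomes (1024, 1024)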
+
+class VerticalFlip(DualTransform):
+ """Flip the input vertically around the x-axis.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def apply(self, img: np.ndarray, **params) -> np.ndarray:
+ return F.vflip(img)
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ return F.bbox_vflip(bbox, **params)
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, **params) -> KeypointInternalType:
+ return F.keypoint_vflip(keypoint, **params)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class HorizontalFlip(DualTransform):
+ """Flip the input horizontally around the y-axis.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def apply(self, img: np.ndarray, **params) -> np.ndarray:
+ if img.ndim == 3 and img.shape[2] > 1 and img.dtype == np.uint8:
+ # Opencv is faster than numpy only in case of
+ # non-gray scale 8bits images
+ return F.hflip_cv2(img)
+
+ return F.hflip(img)
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ return F.bbox_hflip(bbox, **params)
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, **params) -> KeypointInternalType:
+ return F.keypoint_hflip(keypoint, **params)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class Flip(DualTransform):
+ """Flip the input either horizontally, vertically or both horizontally and vertically.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def apply(self, img: np.ndarray, d: int = 0, **params) -> np.ndarray:
+ """Args:
+ d (int): code that specifies how to flip the input. 0 for vertical flipping, 1 for horizontal flipping,
+ -1 for both vertical and horizontal flipping (which can also be seen as rotating the input by
+ 180 degrees).
+ """
+ return F.random_flip(img, d)
+
+ def get_params(self):
+ # Random int in the range [-1, 1]
+ return {"d": random.randint(-1, 1)}
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ return F.bbox_flip(bbox, **params)
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, **params) -> KeypointInternalType:
+ return F.keypoint_flip(keypoint, **params)
+
+ def get_transform_init_args_names(self):
+ return ()
+
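+ # Note (illustrative, not part of the upstream albumentations code): Flip.get_params draws
+ # d from {-1, 0, 1}, which matches the cv2.flip flip-code convention used by F.random_flip:
+ #
+ #     d = 0   -> vertical flip (around the x-axis)
+ #     d = 1   -> horizontal flip (around the y-axis)
+ #     d = -1  -> both flips, i.e. a 180-degree rotation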
+
+class Transpose(DualTransform):
+ """Transpose the input by swapping rows and columns.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def apply(self, img: np.ndarray, **params) -> np.ndarray:
+ return F.transpose(img)
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ return F.bbox_transpose(bbox, 0, **params)
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, **params) -> KeypointInternalType:
+ return F.keypoint_transpose(keypoint)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class OpticalDistortion(DualTransform):
+ """
+ Args:
+ distort_limit (float, (float, float)): If distort_limit is a single float, the range
+ will be (-distort_limit, distort_limit). Default: (-0.05, 0.05).
+ shift_limit (float, (float, float)): If shift_limit is a single float, the range
+ will be (-shift_limit, shift_limit). Default: (-0.05, 0.05).
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ border_mode (OpenCV flag): flag that is used to specify the pixel extrapolation method. Should be one of:
+ cv2.BORDER_CONSTANT, cv2.BORDER_REPLICATE, cv2.BORDER_REFLECT, cv2.BORDER_WRAP, cv2.BORDER_REFLECT_101.
+ Default: cv2.BORDER_REFLECT_101
+ value (int, float, list of ints, list of float): padding value if border_mode is cv2.BORDER_CONSTANT.
+ mask_value (int, float, list of ints, list of float): padding value if border_mode is cv2.BORDER_CONSTANT applied for masks.
+
+ Targets:
+ image, mask, bbox
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ distort_limit: ScaleFloatType = 0.05,
+ shift_limit: ScaleFloatType = 0.05,
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+ mask_value: Optional[ImageColorType] = None,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(OpticalDistortion, self).__init__(always_apply, p)
+ self.shift_limit = to_tuple(shift_limit)
+ self.distort_limit = to_tuple(distort_limit)
+ self.interpolation = interpolation
+ self.border_mode = border_mode
+ self.value = value
+ self.mask_value = mask_value
+
+ def apply(
+ self, img: np.ndarray, k: int = 0, dx: int = 0, dy: int = 0, interpolation: int = cv2.INTER_LINEAR, **params
+ ) -> np.ndarray:
+ return F.optical_distortion(img, k, dx, dy, interpolation, self.border_mode, self.value)
+
+ def apply_to_mask(self, img: np.ndarray, k: int = 0, dx: int = 0, dy: int = 0, **params) -> np.ndarray:
+ return F.optical_distortion(img, k, dx, dy, cv2.INTER_NEAREST, self.border_mode, self.mask_value)
+
+ def apply_to_bbox(self, bbox: BoxInternalType, k: int = 0, dx: int = 0, dy: int = 0, **params) -> BoxInternalType:
+ rows, cols = params["rows"], params["cols"]
+ mask = np.zeros((rows, cols), dtype=np.uint8)
+ bbox_denorm = F.denormalize_bbox(bbox, rows, cols)
+ x_min, y_min, x_max, y_max = bbox_denorm[:4]
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+ mask[y_min:y_max, x_min:x_max] = 1
+ mask = F.optical_distortion(mask, k, dx, dy, cv2.INTER_NEAREST, self.border_mode, self.mask_value)
+ bbox_returned = bbox_from_mask(mask)
+ bbox_returned = F.normalize_bbox(bbox_returned, rows, cols)
+ return bbox_returned
+
+ def get_params(self):
+ return {
+ "k": random.uniform(self.distort_limit[0], self.distort_limit[1]),
+ "dx": round(random.uniform(self.shift_limit[0], self.shift_limit[1])),
+ "dy": round(random.uniform(self.shift_limit[0], self.shift_limit[1])),
+ }
+
+ def get_transform_init_args_names(self):
+ return (
+ "distort_limit",
+ "shift_limit",
+ "interpolation",
+ "border_mode",
+ "value",
+ "mask_value",
+ )
+
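+ # Note (illustrative, not part of the upstream albumentations code): apply_to_bbox above
+ # warps a box by rasterizing it into a binary mask, distorting that mask with
+ # nearest-neighbour interpolation, and taking the bounding box of the remaining
+ # non-zero pixels. A minimal sketch of the same idea:
+ #
+ #     mask = np.zeros((rows, cols), dtype=np.uint8)
+ #     mask[y_min:y_max, x_min:x_max] = 1
+ #     warped = F.optical_distortion(mask, k, dx, dy, cv2.INTER_NEAREST, border_mode, None)
+ #     new_bbox = bbox_from_mask(warped)  # pixel coordinates, later re-normalized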
+
+class GridDistortion(DualTransform):
+ """
+ Args:
+ num_steps (int): count of grid cells on each side.
+ distort_limit (float, (float, float)): If distort_limit is a single float, the range
+ will be (-distort_limit, distort_limit). Default: (-0.03, 0.03).
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ border_mode (OpenCV flag): flag that is used to specify the pixel extrapolation method. Should be one of:
+ cv2.BORDER_CONSTANT, cv2.BORDER_REPLICATE, cv2.BORDER_REFLECT, cv2.BORDER_WRAP, cv2.BORDER_REFLECT_101.
+ Default: cv2.BORDER_REFLECT_101
+ value (int, float, list of ints, list of float): padding value if border_mode is cv2.BORDER_CONSTANT.
+ mask_value (int, float, list of ints, list of float): padding value if border_mode is cv2.BORDER_CONSTANT applied for masks.
+ normalized (bool): if true, the distortion will be normalized so that it does not go outside the image. Default: False.
+ See https://github.com/albumentations-team/albumentations/pull/722 for more information.
+
+ Targets:
+ image, mask
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ num_steps: int = 5,
+ distort_limit: ScaleFloatType = 0.3,
+ interpolation: int = cv2.INTER_LINEAR,
+ border_mode: int = cv2.BORDER_REFLECT_101,
+ value: Optional[ImageColorType] = None,
+ mask_value: Optional[ImageColorType] = None,
+ normalized: bool = False,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(GridDistortion, self).__init__(always_apply, p)
+ self.num_steps = num_steps
+ self.distort_limit = to_tuple(distort_limit)
+ self.interpolation = interpolation
+ self.border_mode = border_mode
+ self.value = value
+ self.mask_value = mask_value
+ self.normalized = normalized
+
+ def apply(
+ self, img: np.ndarray, stepsx: Tuple = (), stepsy: Tuple = (), interpolation: int = cv2.INTER_LINEAR, **params
+ ) -> np.ndarray:
+ return F.grid_distortion(img, self.num_steps, stepsx, stepsy, interpolation, self.border_mode, self.value)
+
+ def apply_to_mask(self, img: np.ndarray, stepsx: Tuple = (), stepsy: Tuple = (), **params) -> np.ndarray:
+ return F.grid_distortion(
+ img, self.num_steps, stepsx, stepsy, cv2.INTER_NEAREST, self.border_mode, self.mask_value
+ )
+
+ def apply_to_bbox(self, bbox: BoxInternalType, stepsx: Tuple = (), stepsy: Tuple = (), **params) -> BoxInternalType:
+ rows, cols = params["rows"], params["cols"]
+ mask = np.zeros((rows, cols), dtype=np.uint8)
+ bbox_denorm = F.denormalize_bbox(bbox, rows, cols)
+ x_min, y_min, x_max, y_max = bbox_denorm[:4]
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+ mask[y_min:y_max, x_min:x_max] = 1
+ mask = F.grid_distortion(
+ mask, self.num_steps, stepsx, stepsy, cv2.INTER_NEAREST, self.border_mode, self.mask_value
+ )
+ bbox_returned = bbox_from_mask(mask)
+ bbox_returned = F.normalize_bbox(bbox_returned, rows, cols)
+ return bbox_returned
+
+ def _normalize(self, h, w, xsteps, ysteps):
+ # compensate for smaller last steps in source image.
+ x_step = w // self.num_steps
+ last_x_step = min(w, ((self.num_steps + 1) * x_step)) - (self.num_steps * x_step)
+ xsteps[-1] *= last_x_step / x_step
+
+ y_step = h // self.num_steps
+ last_y_step = min(h, ((self.num_steps + 1) * y_step)) - (self.num_steps * y_step)
+ ysteps[-1] *= last_y_step / y_step
+
+ # now normalize such that distortion never leaves image bounds.
+ tx = w / math.floor(w / self.num_steps)
+ ty = h / math.floor(h / self.num_steps)
+ xsteps = np.array(xsteps) * (tx / np.sum(xsteps))
+ ysteps = np.array(ysteps) * (ty / np.sum(ysteps))
+
+ return {"stepsx": xsteps, "stepsy": ysteps}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params):
+ h, w = params["image"].shape[:2]
+
+ stepsx = [1 + random.uniform(self.distort_limit[0], self.distort_limit[1]) for _ in range(self.num_steps + 1)]
+ stepsy = [1 + random.uniform(self.distort_limit[0], self.distort_limit[1]) for _ in range(self.num_steps + 1)]
+
+ if self.normalized:
+ return self._normalize(h, w, stepsx, stepsy)
+
+ return {"stepsx": stepsx, "stepsy": stepsy}
+
+ def get_transform_init_args_names(self):
+ return "num_steps", "distort_limit", "interpolation", "border_mode", "value", "mask_value", "normalized"
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/transforms.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..368e23e06962457b36696327f7366f70a038c16c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/transforms.py
@@ -0,0 +1,2667 @@
+from __future__ import absolute_import, division
+
+import math
+import numbers
+import random
+import warnings
+from enum import IntEnum
+from types import LambdaType
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+import cv2
+import numpy as np
+from scipy import special
+from scipy.ndimage import gaussian_filter
+
+from custom_albumentations import random_utils
+from custom_albumentations.augmentations.blur.functional import blur
+from custom_albumentations.augmentations.utils import (
+ get_num_channels,
+ is_grayscale_image,
+ is_rgb_image,
+)
+
+from ..core.transforms_interface import (
+ DualTransform,
+ ImageOnlyTransform,
+ NoOp,
+ ScaleFloatType,
+ to_tuple,
+)
+from ..core.utils import format_args
+from . import functional as F
+
+__all__ = [
+ "Normalize",
+ "RandomGamma",
+ "RandomGridShuffle",
+ "HueSaturationValue",
+ "RGBShift",
+ "RandomBrightness",
+ "RandomContrast",
+ "GaussNoise",
+ "CLAHE",
+ "ChannelShuffle",
+ "InvertImg",
+ "ToGray",
+ "ToRGB",
+ "ToSepia",
+ "JpegCompression",
+ "ImageCompression",
+ "ToFloat",
+ "FromFloat",
+ "RandomBrightnessContrast",
+ "RandomSnow",
+ "RandomGravel",
+ "RandomRain",
+ "RandomFog",
+ "RandomSunFlare",
+ "RandomShadow",
+ "RandomToneCurve",
+ "Lambda",
+ "ISONoise",
+ "Solarize",
+ "Equalize",
+ "Posterize",
+ "Downscale",
+ "MultiplicativeNoise",
+ "FancyPCA",
+ "ColorJitter",
+ "Sharpen",
+ "Emboss",
+ "Superpixels",
+ "TemplateTransform",
+ "RingingOvershoot",
+ "UnsharpMask",
+ "PixelDropout",
+ "Spatter",
+]
+
+
+class RandomGridShuffle(DualTransform):
+ """
+ Randomly shuffle the grid's cells on the image.
+
+ Args:
+ grid ((int, int)): size of grid for splitting image.
+
+ Targets:
+ image, mask, keypoints
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, grid: Tuple[int, int] = (3, 3), always_apply: bool = False, p: float = 0.5):
+ super(RandomGridShuffle, self).__init__(always_apply, p)
+ self.grid = grid
+
+ def apply(self, img: np.ndarray, tiles: np.ndarray = np.array(None), **params):
+ return F.swap_tiles_on_image(img, tiles)
+
+ def apply_to_mask(self, img: np.ndarray, tiles: np.ndarray = np.array(None), **params):
+ return F.swap_tiles_on_image(img, tiles)
+
+ def apply_to_keypoint(
+ self, keypoint: Tuple[float, ...], tiles: np.ndarray = np.array(None), rows: int = 0, cols: int = 0, **params
+ ):
+ for (
+ current_left_up_corner_row,
+ current_left_up_corner_col,
+ old_left_up_corner_row,
+ old_left_up_corner_col,
+ height_tile,
+ width_tile,
+ ) in tiles:
+ x, y = keypoint[:2]
+
+ if (old_left_up_corner_row <= y < (old_left_up_corner_row + height_tile)) and (
+ old_left_up_corner_col <= x < (old_left_up_corner_col + width_tile)
+ ):
+ x = x - old_left_up_corner_col + current_left_up_corner_col
+ y = y - old_left_up_corner_row + current_left_up_corner_row
+ keypoint = (x, y) + tuple(keypoint[2:])
+ break
+
+ return keypoint
+
+ def get_params_dependent_on_targets(self, params):
+ height, width = params["image"].shape[:2]
+ n, m = self.grid
+
+ if n <= 0 or m <= 0:
+ raise ValueError("Grid's values must be positive. Current grid [%s, %s]" % (n, m))
+
+ if n > height // 2 or m > width // 2:
+ raise ValueError("Incorrect size cell of grid. Just shuffle pixels of image")
+
+ height_split = np.linspace(0, height, n + 1, dtype=np.int32)
+ width_split = np.linspace(0, width, m + 1, dtype=np.int32)
+
+ height_matrix, width_matrix = np.meshgrid(height_split, width_split, indexing="ij")
+
+ index_height_matrix = height_matrix[:-1, :-1]
+ index_width_matrix = width_matrix[:-1, :-1]
+
+ shifted_index_height_matrix = height_matrix[1:, 1:]
+ shifted_index_width_matrix = width_matrix[1:, 1:]
+
+ height_tile_sizes = shifted_index_height_matrix - index_height_matrix
+ width_tile_sizes = shifted_index_width_matrix - index_width_matrix
+
+ tiles_sizes = np.stack((height_tile_sizes, width_tile_sizes), axis=2)
+
+ index_matrix = np.indices((n, m))
+ new_index_matrix = np.stack(index_matrix, axis=2)
+
+ for bbox_size in np.unique(tiles_sizes.reshape(-1, 2), axis=0):
+ eq_mat = np.all(tiles_sizes == bbox_size, axis=2)
+ new_index_matrix[eq_mat] = random_utils.permutation(new_index_matrix[eq_mat])
+
+ new_index_matrix = np.split(new_index_matrix, 2, axis=2)
+
+ old_x = index_height_matrix[new_index_matrix[0], new_index_matrix[1]].reshape(-1)
+ old_y = index_width_matrix[new_index_matrix[0], new_index_matrix[1]].reshape(-1)
+
+ shift_x = height_tile_sizes.reshape(-1)
+ shift_y = width_tile_sizes.reshape(-1)
+
+ curr_x = index_height_matrix.reshape(-1)
+ curr_y = index_width_matrix.reshape(-1)
+
+ tiles = np.stack([curr_x, curr_y, old_x, old_y, shift_x, shift_y], axis=1)
+
+ return {"tiles": tiles}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self):
+ return ("grid",)
+
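+ # Worked example (illustrative, not part of the upstream albumentations code). With
+ # grid=(3, 3) on a 90x90 image, the image is split into nine 30x30 tiles; tiles of
+ # equal size are permuted among themselves, and keypoints are shifted together with
+ # the tile that contains them:
+ #
+ #     shuffle = RandomGridShuffle(grid=(3, 3), p=1.0)
+ #     shuffled = shuffle(image=image, mask=mask)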
+
+class Normalize(ImageOnlyTransform):
+ """Normalization is applied by the formula: `img = (img - mean * max_pixel_value) / (std * max_pixel_value)`
+
+ Args:
+ mean (float, list of float): mean values
+ std (float, list of float): std values
+ max_pixel_value (float): maximum possible pixel value
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ mean=(0.485, 0.456, 0.406),
+ std=(0.229, 0.224, 0.225),
+ max_pixel_value=255.0,
+ always_apply=False,
+ p=1.0,
+ ):
+ super(Normalize, self).__init__(always_apply, p)
+ self.mean = mean
+ self.std = std
+ self.max_pixel_value = max_pixel_value
+
+ def apply(self, image, **params):
+ return F.normalize(image, self.mean, self.std, self.max_pixel_value)
+
+ def get_transform_init_args_names(self):
+ return ("mean", "std", "max_pixel_value")
+
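+ # Worked example (illustrative, not part of the upstream albumentations code). For a uint8
+ # red-channel value of 128 with the default mean=0.485, std=0.229, max_pixel_value=255:
+ #
+ #     (128 - 0.485 * 255) / (0.229 * 255) = (128 - 123.675) / 58.395 ≈ 0.074
+ #
+ #     normalize = Normalize()  # ImageNet mean/std, applied per channel
+ #     normalized = normalize(image=image)["image"]  # float output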
+
+class ImageCompression(ImageOnlyTransform):
+ """Decreases image quality by Jpeg, WebP compression of an image.
+
+ Args:
+ quality_lower (float): lower bound on the image quality.
+ Should be in [0, 100] range for jpeg and [1, 100] for webp.
+ quality_upper (float): upper bound on the image quality.
+ Should be in [0, 100] range for jpeg and [1, 100] for webp.
+ compression_type (ImageCompressionType): should be ImageCompressionType.JPEG or ImageCompressionType.WEBP.
+ Default: ImageCompressionType.JPEG
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ class ImageCompressionType(IntEnum):
+ JPEG = 0
+ WEBP = 1
+
+ def __init__(
+ self,
+ quality_lower=99,
+ quality_upper=100,
+ compression_type=ImageCompressionType.JPEG,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(ImageCompression, self).__init__(always_apply, p)
+
+ self.compression_type = ImageCompression.ImageCompressionType(compression_type)
+ low_thresh_quality_assert = 0
+
+ if self.compression_type == ImageCompression.ImageCompressionType.WEBP:
+ low_thresh_quality_assert = 1
+
+ if not low_thresh_quality_assert <= quality_lower <= 100:
+ raise ValueError("Invalid quality_lower. Got: {}".format(quality_lower))
+ if not low_thresh_quality_assert <= quality_upper <= 100:
+ raise ValueError("Invalid quality_upper. Got: {}".format(quality_upper))
+
+ self.quality_lower = quality_lower
+ self.quality_upper = quality_upper
+
+ def apply(self, image, quality=100, image_type=".jpg", **params):
+ if not image.ndim == 2 and image.shape[-1] not in (1, 3, 4):
+ raise TypeError("ImageCompression transformation expects 1, 3 or 4 channel images.")
+ return F.image_compression(image, quality, image_type)
+
+ def get_params(self):
+ image_type = ".jpg"
+
+ if self.compression_type == ImageCompression.ImageCompressionType.WEBP:
+ image_type = ".webp"
+
+ return {
+ "quality": random.randint(self.quality_lower, self.quality_upper),
+ "image_type": image_type,
+ }
+
+ def get_transform_init_args(self):
+ return {
+ "quality_lower": self.quality_lower,
+ "quality_upper": self.quality_upper,
+ "compression_type": self.compression_type.value,
+ }
+
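+ # Usage sketch (illustrative, not part of the upstream albumentations code). A quality is
+ # drawn uniformly from [quality_lower, quality_upper] and the image is round-tripped
+ # through the chosen codec (".jpg" or ".webp"):
+ #
+ #     degrade = ImageCompression(quality_lower=40, quality_upper=80,
+ #                                compression_type=ImageCompression.ImageCompressionType.WEBP, p=1.0)
+ #     degraded = degrade(image=image)["image"]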
+
+class JpegCompression(ImageCompression):
+ """Decreases image quality by Jpeg compression of an image.
+
+ Args:
+ quality_lower (float): lower bound on the jpeg quality. Should be in [0, 100] range
+ quality_upper (float): upper bound on the jpeg quality. Should be in [0, 100] range
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, quality_lower=99, quality_upper=100, always_apply=False, p=0.5):
+ super(JpegCompression, self).__init__(
+ quality_lower=quality_lower,
+ quality_upper=quality_upper,
+ compression_type=ImageCompression.ImageCompressionType.JPEG,
+ always_apply=always_apply,
+ p=p,
+ )
+ warnings.warn(
+ f"{self.__class__.__name__} has been deprecated. Please use ImageCompression",
+ FutureWarning,
+ )
+
+ def get_transform_init_args(self):
+ return {
+ "quality_lower": self.quality_lower,
+ "quality_upper": self.quality_upper,
+ }
+
+
+class RandomSnow(ImageOnlyTransform):
+ """Bleach out some pixel values simulating snow.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ snow_point_lower (float): lower bound of the amount of snow. Should be in [0, 1] range
+ snow_point_upper (float): upper bound of the amount of snow. Should be in [0, 1] range
+ brightness_coeff (float): a larger number will lead to more snow on the image. Should be >= 0
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ snow_point_lower=0.1,
+ snow_point_upper=0.3,
+ brightness_coeff=2.5,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RandomSnow, self).__init__(always_apply, p)
+
+ if not 0 <= snow_point_lower <= snow_point_upper <= 1:
+ raise ValueError(
+ "Invalid combination of snow_point_lower and snow_point_upper. Got: {}".format(
+ (snow_point_lower, snow_point_upper)
+ )
+ )
+ if brightness_coeff < 0:
+ raise ValueError("brightness_coeff must be greater than 0. Got: {}".format(brightness_coeff))
+
+ self.snow_point_lower = snow_point_lower
+ self.snow_point_upper = snow_point_upper
+ self.brightness_coeff = brightness_coeff
+
+ def apply(self, image, snow_point=0.1, **params):
+ return F.add_snow(image, snow_point, self.brightness_coeff)
+
+ def get_params(self):
+ return {"snow_point": random.uniform(self.snow_point_lower, self.snow_point_upper)}
+
+ def get_transform_init_args_names(self):
+ return ("snow_point_lower", "snow_point_upper", "brightness_coeff")
+
+
+class RandomGravel(ImageOnlyTransform):
+ """Add gravels.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ gravel_roi (float, float, float, float): (top-left x, top-left y,
+ bottom-right x, bottom-right y). Should be in [0, 1] range
+ number_of_patches (int): number of gravel patches required
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ gravel_roi: tuple = (0.1, 0.4, 0.9, 0.9),
+ number_of_patches: int = 2,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(RandomGravel, self).__init__(always_apply, p)
+
+ (gravel_lower_x, gravel_lower_y, gravel_upper_x, gravel_upper_y) = gravel_roi
+
+ if not 0 <= gravel_lower_x < gravel_upper_x <= 1 or not 0 <= gravel_lower_y < gravel_upper_y <= 1:
+ raise ValueError("Invalid gravel_roi. Got: %s." % gravel_roi)
+ if number_of_patches < 1:
+ raise ValueError("Invalid gravel number_of_patches. Got: %s." % number_of_patches)
+
+ self.gravel_roi = gravel_roi
+ self.number_of_patches = number_of_patches
+
+ def generate_gravel_patch(self, rectangular_roi):
+ x1, y1, x2, y2 = rectangular_roi
+ gravels = []
+ area = abs((x2 - x1) * (y2 - y1))
+ count = area // 10
+ gravels = np.empty([count, 2], dtype=np.int64)
+ gravels[:, 0] = random_utils.randint(x1, x2, count)
+ gravels[:, 1] = random_utils.randint(y1, y2, count)
+ return gravels
+
+ def apply(self, image, gravels_infos=(), **params):
+ return F.add_gravel(image, gravels_infos)
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ height, width = img.shape[:2]
+
+ x_min, y_min, x_max, y_max = self.gravel_roi
+ x_min = int(x_min * width)
+ x_max = int(x_max * width)
+ y_min = int(y_min * height)
+ y_max = int(y_max * height)
+
+ max_height = 200
+ max_width = 30
+
+ rectangular_rois = np.zeros([self.number_of_patches, 4], dtype=np.int64)
+ xx1 = random_utils.randint(x_min + 1, x_max, self.number_of_patches) # xmax
+ xx2 = random_utils.randint(x_min, xx1) # xmin
+ yy1 = random_utils.randint(y_min + 1, y_max, self.number_of_patches) # ymax
+ yy2 = random_utils.randint(y_min, yy1) # ymin
+
+ rectangular_rois[:, 0] = xx2
+ rectangular_rois[:, 1] = yy2
+ rectangular_rois[:, 2] = [min(tup) for tup in zip(xx1, xx2 + max_height)]
+ rectangular_rois[:, 3] = [min(tup) for tup in zip(yy1, yy2 + max_width)]
+
+ minx = []
+ maxx = []
+ miny = []
+ maxy = []
+ val = []
+ for roi in rectangular_rois:
+ gravels = self.generate_gravel_patch(roi)
+ x = gravels[:, 0]
+ y = gravels[:, 1]
+ r = random_utils.randint(1, 4, len(gravels))
+ sat = random_utils.randint(0, 255, len(gravels))
+ miny.append(np.maximum(y - r, 0))
+ maxy.append(np.minimum(y + r, y))
+ minx.append(np.maximum(x - r, 0))
+ maxx.append(np.minimum(x + r, x))
+ val.append(sat)
+
+ return {
+ "gravels_infos": np.stack(
+ [
+ np.concatenate(miny),
+ np.concatenate(maxy),
+ np.concatenate(minx),
+ np.concatenate(maxx),
+ np.concatenate(val),
+ ],
+ 1,
+ )
+ }
+
+ def get_transform_init_args_names(self):
+ return {"gravel_roi": self.gravel_roi, "number_of_patches": self.number_of_patches}
+
+
+class RandomRain(ImageOnlyTransform):
+ """Adds rain effects.
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ slant_lower: should be in range [-20, 20].
+ slant_upper: should be in range [-20, 20].
+ drop_length: should be in range [0, 100].
+ drop_width: should be in range [1, 5].
+ drop_color (list of (r, g, b)): rain lines color.
+ blur_value (int): rainy views are blurry
+ brightness_coefficient (float): rainy days are usually shady. Should be in range [0, 1].
+ rain_type: One of [None, "drizzle", "heavy", "torrential"]
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ slant_lower=-10,
+ slant_upper=10,
+ drop_length=20,
+ drop_width=1,
+ drop_color=(200, 200, 200),
+ blur_value=7,
+ brightness_coefficient=0.7,
+ rain_type=None,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RandomRain, self).__init__(always_apply, p)
+
+ if rain_type not in ["drizzle", "heavy", "torrential", None]:
+ raise ValueError(
+ "raint_type must be one of ({}). Got: {}".format(["drizzle", "heavy", "torrential", None], rain_type)
+ )
+ if not -20 <= slant_lower <= slant_upper <= 20:
+ raise ValueError(
+ "Invalid combination of slant_lower and slant_upper. Got: {}".format((slant_lower, slant_upper))
+ )
+ if not 1 <= drop_width <= 5:
+ raise ValueError("drop_width must be in range [1, 5]. Got: {}".format(drop_width))
+ if not 0 <= drop_length <= 100:
+ raise ValueError("drop_length must be in range [0, 100]. Got: {}".format(drop_length))
+ if not 0 <= brightness_coefficient <= 1:
+ raise ValueError("brightness_coefficient must be in range [0, 1]. Got: {}".format(brightness_coefficient))
+
+ self.slant_lower = slant_lower
+ self.slant_upper = slant_upper
+
+ self.drop_length = drop_length
+ self.drop_width = drop_width
+ self.drop_color = drop_color
+ self.blur_value = blur_value
+ self.brightness_coefficient = brightness_coefficient
+ self.rain_type = rain_type
+
+ def apply(self, image, slant=10, drop_length=20, rain_drops=(), **params):
+ return F.add_rain(
+ image,
+ slant,
+ drop_length,
+ self.drop_width,
+ self.drop_color,
+ self.blur_value,
+ self.brightness_coefficient,
+ rain_drops,
+ )
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ slant = int(random.uniform(self.slant_lower, self.slant_upper))
+
+ height, width = img.shape[:2]
+ area = height * width
+
+ if self.rain_type == "drizzle":
+ num_drops = area // 770
+ drop_length = 10
+ elif self.rain_type == "heavy":
+ num_drops = width * height // 600
+ drop_length = 30
+ elif self.rain_type == "torrential":
+ num_drops = area // 500
+ drop_length = 60
+ else:
+ drop_length = self.drop_length
+ num_drops = area // 600
+
+ rain_drops = []
+
+ for _i in range(num_drops): # If You want heavy rain, try increasing this
+ if slant < 0:
+ x = random.randint(slant, width)
+ else:
+ x = random.randint(0, width - slant)
+
+ y = random.randint(0, height - drop_length)
+
+ rain_drops.append((x, y))
+
+ return {"drop_length": drop_length, "slant": slant, "rain_drops": rain_drops}
+
+ def get_transform_init_args_names(self):
+ return (
+ "slant_lower",
+ "slant_upper",
+ "drop_length",
+ "drop_width",
+ "drop_color",
+ "blur_value",
+ "brightness_coefficient",
+ "rain_type",
+ )
+
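+ # Worked example (illustrative, not part of the upstream albumentations code). For a
+ # 640x480 image (area 307200), get_params_dependent_on_targets samples roughly:
+ #
+ #     rain_type="drizzle"    -> 307200 // 770 = 398 drops, drop_length=10
+ #     rain_type="heavy"      -> 307200 // 600 = 512 drops, drop_length=30
+ #     rain_type="torrential" -> 307200 // 500 = 614 drops, drop_length=60
+ #     rain_type=None         -> 307200 // 600 = 512 drops, drop_length=self.drop_length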
+
+class RandomFog(ImageOnlyTransform):
+ """Simulates fog for the image
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ fog_coef_lower (float): lower limit for fog intensity coefficient. Should be in [0, 1] range.
+ fog_coef_upper (float): upper limit for fog intensity coefficient. Should be in [0, 1] range.
+ alpha_coef (float): transparency of the fog circles. Should be in [0, 1] range.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ fog_coef_lower=0.3,
+ fog_coef_upper=1,
+ alpha_coef=0.08,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RandomFog, self).__init__(always_apply, p)
+
+ if not 0 <= fog_coef_lower <= fog_coef_upper <= 1:
+ raise ValueError(
+ "Invalid combination if fog_coef_lower and fog_coef_upper. Got: {}".format(
+ (fog_coef_lower, fog_coef_upper)
+ )
+ )
+ if not 0 <= alpha_coef <= 1:
+ raise ValueError("alpha_coef must be in range [0, 1]. Got: {}".format(alpha_coef))
+
+ self.fog_coef_lower = fog_coef_lower
+ self.fog_coef_upper = fog_coef_upper
+ self.alpha_coef = alpha_coef
+
+ def apply(self, image, fog_coef=0.1, haze_list=(), **params):
+ return F.add_fog(image, fog_coef, self.alpha_coef, haze_list)
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ fog_coef = random.uniform(self.fog_coef_lower, self.fog_coef_upper)
+
+ height, width = imshape = img.shape[:2]
+
+ hw = max(1, int(width // 3 * fog_coef))
+
+ haze_list = []
+ midx = width // 2 - 2 * hw
+ midy = height // 2 - hw
+ index = 1
+
+ while midx > -hw or midy > -hw:
+ for _i in range(hw // 10 * index):
+ x = random.randint(midx, width - midx - hw)
+ y = random.randint(midy, height - midy - hw)
+ haze_list.append((x, y))
+
+ midx -= 3 * hw * width // sum(imshape)
+ midy -= 3 * hw * height // sum(imshape)
+ index += 1
+
+ return {"haze_list": haze_list, "fog_coef": fog_coef}
+
+ def get_transform_init_args_names(self):
+ return ("fog_coef_lower", "fog_coef_upper", "alpha_coef")
+
+
+class RandomSunFlare(ImageOnlyTransform):
+ """Simulates Sun Flare for the image
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ flare_roi (float, float, float, float): region of the image where flare will
+ appear (x_min, y_min, x_max, y_max). All values should be in range [0, 1].
+ angle_lower (float): should be in range [0, `angle_upper`].
+ angle_upper (float): should be in range [`angle_lower`, 1].
+ num_flare_circles_lower (int): lower limit for the number of flare circles.
+ Should be in range [0, `num_flare_circles_upper`].
+ num_flare_circles_upper (int): upper limit for the number of flare circles.
+ Should be in range [`num_flare_circles_lower`, inf].
+ src_radius (int): radius of the flare's source circle, in pixels.
+ src_color ((int, int, int)): color of the flare
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ flare_roi=(0, 0, 1, 0.5),
+ angle_lower=0,
+ angle_upper=1,
+ num_flare_circles_lower=6,
+ num_flare_circles_upper=10,
+ src_radius=400,
+ src_color=(255, 255, 255),
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RandomSunFlare, self).__init__(always_apply, p)
+
+ (
+ flare_center_lower_x,
+ flare_center_lower_y,
+ flare_center_upper_x,
+ flare_center_upper_y,
+ ) = flare_roi
+
+ if (
+ not 0 <= flare_center_lower_x < flare_center_upper_x <= 1
+ or not 0 <= flare_center_lower_y < flare_center_upper_y <= 1
+ ):
+ raise ValueError("Invalid flare_roi. Got: {}".format(flare_roi))
+ if not 0 <= angle_lower < angle_upper <= 1:
+ raise ValueError(
+                "Invalid combination of angle_lower and angle_upper. Got: {}".format((angle_lower, angle_upper))
+ )
+ if not 0 <= num_flare_circles_lower < num_flare_circles_upper:
+ raise ValueError(
+                "Invalid combination of num_flare_circles_lower and num_flare_circles_upper. Got: {}".format(
+ (num_flare_circles_lower, num_flare_circles_upper)
+ )
+ )
+
+ self.flare_center_lower_x = flare_center_lower_x
+ self.flare_center_upper_x = flare_center_upper_x
+
+ self.flare_center_lower_y = flare_center_lower_y
+ self.flare_center_upper_y = flare_center_upper_y
+
+ self.angle_lower = angle_lower
+ self.angle_upper = angle_upper
+ self.num_flare_circles_lower = num_flare_circles_lower
+ self.num_flare_circles_upper = num_flare_circles_upper
+
+ self.src_radius = src_radius
+ self.src_color = src_color
+
+ def apply(self, image, flare_center_x=0.5, flare_center_y=0.5, circles=(), **params):
+ return F.add_sun_flare(
+ image,
+ flare_center_x,
+ flare_center_y,
+ self.src_radius,
+ self.src_color,
+ circles,
+ )
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ height, width = img.shape[:2]
+
+ angle = 2 * math.pi * random.uniform(self.angle_lower, self.angle_upper)
+
+ flare_center_x = random.uniform(self.flare_center_lower_x, self.flare_center_upper_x)
+ flare_center_y = random.uniform(self.flare_center_lower_y, self.flare_center_upper_y)
+
+ flare_center_x = int(width * flare_center_x)
+ flare_center_y = int(height * flare_center_y)
+
+ num_circles = random.randint(self.num_flare_circles_lower, self.num_flare_circles_upper)
+
+ circles = []
+
+ x = []
+ y = []
+
+ def line(t):
+ return (flare_center_x + t * math.cos(angle), flare_center_y + t * math.sin(angle))
+
+ for t_val in range(-flare_center_x, width - flare_center_x, 10):
+ rand_x, rand_y = line(t_val)
+ x.append(rand_x)
+ y.append(rand_y)
+
+ for _i in range(num_circles):
+ alpha = random.uniform(0.05, 0.2)
+ r = random.randint(0, len(x) - 1)
+ rad = random.randint(1, max(height // 100 - 2, 2))
+
+ r_color = random.randint(max(self.src_color[0] - 50, 0), self.src_color[0])
+ g_color = random.randint(max(self.src_color[1] - 50, 0), self.src_color[1])
+ b_color = random.randint(max(self.src_color[2] - 50, 0), self.src_color[2])
+
+ circles += [
+ (
+ alpha,
+ (int(x[r]), int(y[r])),
+ pow(rad, 3),
+ (r_color, g_color, b_color),
+ )
+ ]
+
+ return {
+ "circles": circles,
+ "flare_center_x": flare_center_x,
+ "flare_center_y": flare_center_y,
+ }
+
+ def get_transform_init_args(self):
+ return {
+ "flare_roi": (
+ self.flare_center_lower_x,
+ self.flare_center_lower_y,
+ self.flare_center_upper_x,
+ self.flare_center_upper_y,
+ ),
+ "angle_lower": self.angle_lower,
+ "angle_upper": self.angle_upper,
+ "num_flare_circles_lower": self.num_flare_circles_lower,
+ "num_flare_circles_upper": self.num_flare_circles_upper,
+ "src_radius": self.src_radius,
+ "src_color": self.src_color,
+ }
+
+
+class RandomShadow(ImageOnlyTransform):
+ """Simulates shadows for the image
+
+ From https://github.com/UjjwalSaxena/Automold--Road-Augmentation-Library
+
+ Args:
+ shadow_roi (float, float, float, float): region of the image where shadows
+ will appear (x_min, y_min, x_max, y_max). All values should be in range [0, 1].
+ num_shadows_lower (int): Lower limit for the possible number of shadows.
+ Should be in range [0, `num_shadows_upper`].
+        num_shadows_upper (int): Upper limit for the possible number of shadows.
+ Should be in range [`num_shadows_lower`, inf].
+ shadow_dimension (int): number of edges in the shadow polygons
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ shadow_roi=(0, 0.5, 1, 1),
+ num_shadows_lower=1,
+ num_shadows_upper=2,
+ shadow_dimension=5,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RandomShadow, self).__init__(always_apply, p)
+
+ (shadow_lower_x, shadow_lower_y, shadow_upper_x, shadow_upper_y) = shadow_roi
+
+ if not 0 <= shadow_lower_x <= shadow_upper_x <= 1 or not 0 <= shadow_lower_y <= shadow_upper_y <= 1:
+ raise ValueError("Invalid shadow_roi. Got: {}".format(shadow_roi))
+ if not 0 <= num_shadows_lower <= num_shadows_upper:
+ raise ValueError(
+                "Invalid combination of num_shadows_lower and num_shadows_upper. Got: {}".format(
+ (num_shadows_lower, num_shadows_upper)
+ )
+ )
+
+ self.shadow_roi = shadow_roi
+
+ self.num_shadows_lower = num_shadows_lower
+ self.num_shadows_upper = num_shadows_upper
+
+ self.shadow_dimension = shadow_dimension
+
+ def apply(self, image, vertices_list=(), **params):
+ return F.add_shadow(image, vertices_list)
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ height, width = img.shape[:2]
+
+ num_shadows = random.randint(self.num_shadows_lower, self.num_shadows_upper)
+
+ x_min, y_min, x_max, y_max = self.shadow_roi
+
+ x_min = int(x_min * width)
+ x_max = int(x_max * width)
+ y_min = int(y_min * height)
+ y_max = int(y_max * height)
+
+ vertices_list = []
+
+ for _index in range(num_shadows):
+ vertex = []
+ for _dimension in range(self.shadow_dimension):
+ vertex.append((random.randint(x_min, x_max), random.randint(y_min, y_max)))
+
+ vertices = np.array([vertex], dtype=np.int32)
+ vertices_list.append(vertices)
+
+ return {"vertices_list": vertices_list}
+
+ def get_transform_init_args_names(self):
+ return (
+ "shadow_roi",
+ "num_shadows_lower",
+ "num_shadows_upper",
+ "shadow_dimension",
+ )
+
+
+class RandomToneCurve(ImageOnlyTransform):
+ """Randomly change the relationship between bright and dark areas of the image by manipulating its tone curve.
+
+ Args:
+ scale (float): standard deviation of the normal distribution.
+ Used to sample random distances to move two control points that modify the image's curve.
+ Values should be in range [0, 1]. Default: 0.1
+
+
+ Targets:
+ image
+
+ Image types:
+ uint8
+ """
+
+ def __init__(
+ self,
+ scale=0.1,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RandomToneCurve, self).__init__(always_apply, p)
+ self.scale = scale
+
+ def apply(self, image, low_y, high_y, **params):
+ return F.move_tone_curve(image, low_y, high_y)
+
+ def get_params(self):
+ return {
+ "low_y": np.clip(random_utils.normal(loc=0.25, scale=self.scale), 0, 1),
+ "high_y": np.clip(random_utils.normal(loc=0.75, scale=self.scale), 0, 1),
+ }
+
+ def get_transform_init_args_names(self):
+ return ("scale",)
+
+
+class HueSaturationValue(ImageOnlyTransform):
+ """Randomly change hue, saturation and value of the input image.
+
+ Args:
+ hue_shift_limit ((int, int) or int): range for changing hue. If hue_shift_limit is a single int, the range
+ will be (-hue_shift_limit, hue_shift_limit). Default: (-20, 20).
+ sat_shift_limit ((int, int) or int): range for changing saturation. If sat_shift_limit is a single int,
+ the range will be (-sat_shift_limit, sat_shift_limit). Default: (-30, 30).
+ val_shift_limit ((int, int) or int): range for changing value. If val_shift_limit is a single int, the range
+ will be (-val_shift_limit, val_shift_limit). Default: (-20, 20).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ hue_shift_limit=20,
+ sat_shift_limit=30,
+ val_shift_limit=20,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(HueSaturationValue, self).__init__(always_apply, p)
+ self.hue_shift_limit = to_tuple(hue_shift_limit)
+ self.sat_shift_limit = to_tuple(sat_shift_limit)
+ self.val_shift_limit = to_tuple(val_shift_limit)
+
+ def apply(self, image, hue_shift=0, sat_shift=0, val_shift=0, **params):
+ if not is_rgb_image(image) and not is_grayscale_image(image):
+ raise TypeError("HueSaturationValue transformation expects 1-channel or 3-channel images.")
+ return F.shift_hsv(image, hue_shift, sat_shift, val_shift)
+
+ def get_params(self):
+ return {
+ "hue_shift": random.uniform(self.hue_shift_limit[0], self.hue_shift_limit[1]),
+ "sat_shift": random.uniform(self.sat_shift_limit[0], self.sat_shift_limit[1]),
+ "val_shift": random.uniform(self.val_shift_limit[0], self.val_shift_limit[1]),
+ }
+
+ def get_transform_init_args_names(self):
+ return ("hue_shift_limit", "sat_shift_limit", "val_shift_limit")
+
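+# HueSaturationValue usage sketch (illustrative, editor-added; expects a 1-channel
+# or 3-channel uint8/float32 `img`):
+#
+#   hsv = HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=1.0)
+#   shifted = hsv(image=img)["image"]
+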
+
+class Solarize(ImageOnlyTransform):
+ """Invert all pixel values above a threshold.
+
+ Args:
+ threshold ((int, int) or int, or (float, float) or float): range for solarizing threshold.
+ If threshold is a single value, the range will be [threshold, threshold]. Default: 128.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ any
+ """
+
+ def __init__(self, threshold=128, always_apply=False, p=0.5):
+ super(Solarize, self).__init__(always_apply, p)
+
+ if isinstance(threshold, (int, float)):
+ self.threshold = to_tuple(threshold, low=threshold)
+ else:
+ self.threshold = to_tuple(threshold, low=0)
+
+ def apply(self, image, threshold=0, **params):
+ return F.solarize(image, threshold)
+
+ def get_params(self):
+ return {"threshold": random.uniform(self.threshold[0], self.threshold[1])}
+
+ def get_transform_init_args_names(self):
+ return ("threshold",)
+
+
+class Posterize(ImageOnlyTransform):
+ """Reduce the number of bits for each color channel.
+
+ Args:
+ num_bits ((int, int) or int,
+ or list of ints [r, g, b],
+ or list of ints [[r1, r1], [g1, g2], [b1, b2]]): number of high bits.
+ If num_bits is a single value, the range will be [num_bits, num_bits].
+ Must be in range [0, 8]. Default: 4.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8
+ """
+
+ def __init__(self, num_bits=4, always_apply=False, p=0.5):
+ super(Posterize, self).__init__(always_apply, p)
+
+ if isinstance(num_bits, (list, tuple)):
+ if len(num_bits) == 3:
+ self.num_bits = [to_tuple(i, 0) for i in num_bits]
+ else:
+ self.num_bits = to_tuple(num_bits, 0)
+ else:
+ self.num_bits = to_tuple(num_bits, num_bits)
+
+ def apply(self, image, num_bits=1, **params):
+ return F.posterize(image, num_bits)
+
+ def get_params(self):
+ if len(self.num_bits) == 3:
+ return {"num_bits": [random.randint(i[0], i[1]) for i in self.num_bits]}
+ return {"num_bits": random.randint(self.num_bits[0], self.num_bits[1])}
+
+ def get_transform_init_args_names(self):
+ return ("num_bits",)
+
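+# Posterize usage sketch (illustrative, editor-added). A single int keeps a fixed
+# number of bits for all channels; a list of three [low, high] pairs samples the
+# number of bits per RGB channel:
+#
+#   poster = Posterize(num_bits=4, p=1.0)
+#   per_channel = Posterize(num_bits=[[3, 3], [4, 4], [5, 5]], p=1.0)
+#   out = per_channel(image=img)["image"]
+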
+
+class Equalize(ImageOnlyTransform):
+ """Equalize the image histogram.
+
+ Args:
+ mode (str): {'cv', 'pil'}. Use OpenCV or Pillow equalization method.
+ by_channels (bool): If True, use equalization by channels separately,
+ else convert image to YCbCr representation and use equalization by `Y` channel.
+ mask (np.ndarray, callable): If given, only the pixels selected by
+            the mask are included in the analysis. May be a 1-channel or 3-channel array or a callable.
+ Function signature must include `image` argument.
+ mask_params (list of str): Params for mask function.
+
+ Targets:
+ image
+
+ Image types:
+ uint8
+ """
+
+ def __init__(
+ self,
+ mode="cv",
+ by_channels=True,
+ mask=None,
+ mask_params=(),
+ always_apply=False,
+ p=0.5,
+ ):
+ modes = ["cv", "pil"]
+ if mode not in modes:
+ raise ValueError("Unsupported equalization mode. Supports: {}. " "Got: {}".format(modes, mode))
+
+ super(Equalize, self).__init__(always_apply, p)
+ self.mode = mode
+ self.by_channels = by_channels
+ self.mask = mask
+ self.mask_params = mask_params
+
+ def apply(self, image, mask=None, **params):
+ return F.equalize(image, mode=self.mode, by_channels=self.by_channels, mask=mask)
+
+ def get_params_dependent_on_targets(self, params):
+ if not callable(self.mask):
+ return {"mask": self.mask}
+
+ return {"mask": self.mask(**params)}
+
+ @property
+ def targets_as_params(self):
+ return ["image"] + list(self.mask_params)
+
+ def get_transform_init_args_names(self):
+ return ("mode", "by_channels")
+
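+# Equalize usage sketch (illustrative, editor-added). `mask` may be an array or a
+# callable that receives the image (plus any `mask_params`) and returns a mask;
+# `dark_region_mask` below is a hypothetical helper:
+#
+#   def dark_region_mask(image, **kwargs):
+#       return (image[..., 0] < 128).astype(np.uint8)
+#
+#   eq = Equalize(mode="cv", by_channels=True, mask=dark_region_mask, p=1.0)
+#   out = eq(image=img)["image"]
+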
+
+class RGBShift(ImageOnlyTransform):
+ """Randomly shift values for each channel of the input RGB image.
+
+ Args:
+ r_shift_limit ((int, int) or int): range for changing values for the red channel. If r_shift_limit is a single
+ int, the range will be (-r_shift_limit, r_shift_limit). Default: (-20, 20).
+ g_shift_limit ((int, int) or int): range for changing values for the green channel. If g_shift_limit is a
+ single int, the range will be (-g_shift_limit, g_shift_limit). Default: (-20, 20).
+ b_shift_limit ((int, int) or int): range for changing values for the blue channel. If b_shift_limit is a single
+ int, the range will be (-b_shift_limit, b_shift_limit). Default: (-20, 20).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ r_shift_limit=20,
+ g_shift_limit=20,
+ b_shift_limit=20,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RGBShift, self).__init__(always_apply, p)
+ self.r_shift_limit = to_tuple(r_shift_limit)
+ self.g_shift_limit = to_tuple(g_shift_limit)
+ self.b_shift_limit = to_tuple(b_shift_limit)
+
+ def apply(self, image, r_shift=0, g_shift=0, b_shift=0, **params):
+ if not is_rgb_image(image):
+ raise TypeError("RGBShift transformation expects 3-channel images.")
+ return F.shift_rgb(image, r_shift, g_shift, b_shift)
+
+ def get_params(self):
+ return {
+ "r_shift": random.uniform(self.r_shift_limit[0], self.r_shift_limit[1]),
+ "g_shift": random.uniform(self.g_shift_limit[0], self.g_shift_limit[1]),
+ "b_shift": random.uniform(self.b_shift_limit[0], self.b_shift_limit[1]),
+ }
+
+ def get_transform_init_args_names(self):
+ return ("r_shift_limit", "g_shift_limit", "b_shift_limit")
+
+
+class RandomBrightnessContrast(ImageOnlyTransform):
+ """Randomly change brightness and contrast of the input image.
+
+ Args:
+ brightness_limit ((float, float) or float): factor range for changing brightness.
+ If limit is a single float, the range will be (-limit, limit). Default: (-0.2, 0.2).
+ contrast_limit ((float, float) or float): factor range for changing contrast.
+ If limit is a single float, the range will be (-limit, limit). Default: (-0.2, 0.2).
+        brightness_by_max (Boolean): If True, adjust brightness by the image dtype maximum,
+            else adjust brightness by the image mean.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ brightness_limit=0.2,
+ contrast_limit=0.2,
+ brightness_by_max=True,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RandomBrightnessContrast, self).__init__(always_apply, p)
+ self.brightness_limit = to_tuple(brightness_limit)
+ self.contrast_limit = to_tuple(contrast_limit)
+ self.brightness_by_max = brightness_by_max
+
+ def apply(self, img, alpha=1.0, beta=0.0, **params):
+ return F.brightness_contrast_adjust(img, alpha, beta, self.brightness_by_max)
+
+ def get_params(self):
+ return {
+ "alpha": 1.0 + random.uniform(self.contrast_limit[0], self.contrast_limit[1]),
+ "beta": 0.0 + random.uniform(self.brightness_limit[0], self.brightness_limit[1]),
+ }
+
+ def get_transform_init_args_names(self):
+ return ("brightness_limit", "contrast_limit", "brightness_by_max")
+
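+# RandomBrightnessContrast usage sketch (illustrative, editor-added):
+#
+#   rbc = RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, brightness_by_max=True, p=1.0)
+#   out = rbc(image=img)["image"]
+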
+
+class RandomBrightness(RandomBrightnessContrast):
+ """Randomly change brightness of the input image.
+
+ Args:
+ limit ((float, float) or float): factor range for changing brightness.
+ If limit is a single float, the range will be (-limit, limit). Default: (-0.2, 0.2).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, limit=0.2, always_apply=False, p=0.5):
+ super(RandomBrightness, self).__init__(brightness_limit=limit, contrast_limit=0, always_apply=always_apply, p=p)
+ warnings.warn(
+ "This class has been deprecated. Please use RandomBrightnessContrast",
+ FutureWarning,
+ )
+
+ def get_transform_init_args(self):
+ return {"limit": self.brightness_limit}
+
+
+class RandomContrast(RandomBrightnessContrast):
+ """Randomly change contrast of the input image.
+
+ Args:
+ limit ((float, float) or float): factor range for changing contrast.
+ If limit is a single float, the range will be (-limit, limit). Default: (-0.2, 0.2).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, limit=0.2, always_apply=False, p=0.5):
+ super(RandomContrast, self).__init__(brightness_limit=0, contrast_limit=limit, always_apply=always_apply, p=p)
+ warnings.warn(
+ f"{self.__class__.__name__} has been deprecated. Please use RandomBrightnessContrast",
+ FutureWarning,
+ )
+
+ def get_transform_init_args(self):
+ return {"limit": self.contrast_limit}
+
+
+class GaussNoise(ImageOnlyTransform):
+ """Apply gaussian noise to the input image.
+
+ Args:
+ var_limit ((float, float) or float): variance range for noise. If var_limit is a single float, the range
+ will be (0, var_limit). Default: (10.0, 50.0).
+ mean (float): mean of the noise. Default: 0
+ per_channel (bool): if set to True, noise will be sampled for each channel independently.
+ Otherwise, the noise will be sampled once for all channels. Default: True
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, var_limit=(10.0, 50.0), mean=0, per_channel=True, always_apply=False, p=0.5):
+ super(GaussNoise, self).__init__(always_apply, p)
+ if isinstance(var_limit, (tuple, list)):
+ if var_limit[0] < 0:
+ raise ValueError("Lower var_limit should be non negative.")
+ if var_limit[1] < 0:
+ raise ValueError("Upper var_limit should be non negative.")
+ self.var_limit = var_limit
+ elif isinstance(var_limit, (int, float)):
+ if var_limit < 0:
+ raise ValueError("var_limit should be non negative.")
+
+ self.var_limit = (0, var_limit)
+ else:
+ raise TypeError(
+ "Expected var_limit type to be one of (int, float, tuple, list), got {}".format(type(var_limit))
+ )
+
+ self.mean = mean
+ self.per_channel = per_channel
+
+ def apply(self, img, gauss=None, **params):
+ return F.gauss_noise(img, gauss=gauss)
+
+ def get_params_dependent_on_targets(self, params):
+ image = params["image"]
+ var = random.uniform(self.var_limit[0], self.var_limit[1])
+ sigma = var**0.5
+
+ if self.per_channel:
+ gauss = random_utils.normal(self.mean, sigma, image.shape)
+ else:
+ gauss = random_utils.normal(self.mean, sigma, image.shape[:2])
+ if len(image.shape) == 3:
+ gauss = np.expand_dims(gauss, -1)
+
+ return {"gauss": gauss}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self):
+ return ("var_limit", "per_channel", "mean")
+
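+# GaussNoise usage sketch (illustrative, editor-added). `var_limit` is the variance
+# range; the standard deviation actually used is sqrt(var):
+#
+#   noise = GaussNoise(var_limit=(10.0, 50.0), mean=0, per_channel=True, p=1.0)
+#   noisy = noise(image=img)["image"]
+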
+
+class ISONoise(ImageOnlyTransform):
+ """
+ Apply camera sensor noise.
+
+ Args:
+ color_shift (float, float): variance range for color hue change.
+ Measured as a fraction of 360 degree Hue angle in HLS colorspace.
+        intensity ((float, float)): Multiplicative factor that controls the strength
+            of color and luminance noise.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8
+ """
+
+ def __init__(self, color_shift=(0.01, 0.05), intensity=(0.1, 0.5), always_apply=False, p=0.5):
+ super(ISONoise, self).__init__(always_apply, p)
+ self.intensity = intensity
+ self.color_shift = color_shift
+
+ def apply(self, img, color_shift=0.05, intensity=1.0, random_state=None, **params):
+ return F.iso_noise(img, color_shift, intensity, np.random.RandomState(random_state))
+
+ def get_params(self):
+ return {
+ "color_shift": random.uniform(self.color_shift[0], self.color_shift[1]),
+ "intensity": random.uniform(self.intensity[0], self.intensity[1]),
+ "random_state": random.randint(0, 65536),
+ }
+
+ def get_transform_init_args_names(self):
+ return ("intensity", "color_shift")
+
+
+class CLAHE(ImageOnlyTransform):
+ """Apply Contrast Limited Adaptive Histogram Equalization to the input image.
+
+ Args:
+ clip_limit (float or (float, float)): upper threshold value for contrast limiting.
+ If clip_limit is a single float value, the range will be (1, clip_limit). Default: (1, 4).
+ tile_grid_size ((int, int)): size of grid for histogram equalization. Default: (8, 8).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8
+ """
+
+ def __init__(self, clip_limit=4.0, tile_grid_size=(8, 8), always_apply=False, p=0.5):
+ super(CLAHE, self).__init__(always_apply, p)
+ self.clip_limit = to_tuple(clip_limit, 1)
+ self.tile_grid_size = tuple(tile_grid_size)
+
+ def apply(self, img, clip_limit=2, **params):
+ if not is_rgb_image(img) and not is_grayscale_image(img):
+ raise TypeError("CLAHE transformation expects 1-channel or 3-channel images.")
+
+ return F.clahe(img, clip_limit, self.tile_grid_size)
+
+ def get_params(self):
+ return {"clip_limit": random.uniform(self.clip_limit[0], self.clip_limit[1])}
+
+ def get_transform_init_args_names(self):
+ return ("clip_limit", "tile_grid_size")
+
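+# CLAHE usage sketch (illustrative, editor-added; expects a uint8 image):
+#
+#   clahe = CLAHE(clip_limit=(1.0, 4.0), tile_grid_size=(8, 8), p=1.0)
+#   out = clahe(image=img)["image"]
+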
+
+class ChannelShuffle(ImageOnlyTransform):
+ """Randomly rearrange channels of the input RGB image.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def apply(self, img, channels_shuffled=(0, 1, 2), **params):
+ return F.channel_shuffle(img, channels_shuffled)
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ ch_arr = list(range(img.shape[2]))
+ random.shuffle(ch_arr)
+ return {"channels_shuffled": ch_arr}
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class InvertImg(ImageOnlyTransform):
+    """Invert the input image by subtracting pixel values from the maximum value of the image dtype,
+ i.e., 255 for uint8 and 1.0 for float32.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def apply(self, img, **params):
+ return F.invert(img)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class RandomGamma(ImageOnlyTransform):
+ """
+ Args:
+ gamma_limit (float or (float, float)): If gamma_limit is a single float value,
+ the range will be (-gamma_limit, gamma_limit). Default: (80, 120).
+ eps: Deprecated.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, gamma_limit=(80, 120), eps=None, always_apply=False, p=0.5):
+ super(RandomGamma, self).__init__(always_apply, p)
+ self.gamma_limit = to_tuple(gamma_limit)
+ self.eps = eps
+
+ def apply(self, img, gamma=1, **params):
+ return F.gamma_transform(img, gamma=gamma)
+
+ def get_params(self):
+ return {"gamma": random.uniform(self.gamma_limit[0], self.gamma_limit[1]) / 100.0}
+
+ def get_transform_init_args_names(self):
+ return ("gamma_limit", "eps")
+
+
+class ToGray(ImageOnlyTransform):
+ """Convert the input RGB image to grayscale. If the mean pixel value for the resulting image is greater
+ than 127, invert the resulting grayscale image.
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def apply(self, img, **params):
+ if is_grayscale_image(img):
+ warnings.warn("The image is already gray.")
+ return img
+ if not is_rgb_image(img):
+ raise TypeError("ToGray transformation expects 3-channel images.")
+
+ return F.to_gray(img)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class ToRGB(ImageOnlyTransform):
+ """Convert the input grayscale image to RGB.
+
+ Args:
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, always_apply=True, p=1.0):
+ super(ToRGB, self).__init__(always_apply=always_apply, p=p)
+
+ def apply(self, img, **params):
+ if is_rgb_image(img):
+ warnings.warn("The image is already an RGB.")
+ return img
+ if not is_grayscale_image(img):
+ raise TypeError("ToRGB transformation expects 2-dim images or 3-dim with the last dimension equal to 1.")
+
+ return F.gray_to_rgb(img)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class ToSepia(ImageOnlyTransform):
+ """Applies sepia filter to the input RGB image
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(self, always_apply=False, p=0.5):
+ super(ToSepia, self).__init__(always_apply, p)
+ self.sepia_transformation_matrix = np.array(
+ [[0.393, 0.769, 0.189], [0.349, 0.686, 0.168], [0.272, 0.534, 0.131]]
+ )
+
+ def apply(self, image, **params):
+ if not is_rgb_image(image):
+ raise TypeError("ToSepia transformation expects 3-channel images.")
+ return F.linear_transformation_rgb(image, self.sepia_transformation_matrix)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class ToFloat(ImageOnlyTransform):
+ """Divide pixel values by `max_value` to get a float32 output array where all values lie in the range [0, 1.0].
+ If `max_value` is None the transform will try to infer the maximum value by inspecting the data type of the input
+ image.
+
+ See Also:
+ :class:`~albumentations.augmentations.transforms.FromFloat`
+
+ Args:
+ max_value (float): maximum possible input value. Default: None.
+ p (float): probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image
+
+ Image types:
+ any type
+
+ """
+
+ def __init__(self, max_value=None, always_apply=False, p=1.0):
+ super(ToFloat, self).__init__(always_apply, p)
+ self.max_value = max_value
+
+ def apply(self, img, **params):
+ return F.to_float(img, self.max_value)
+
+ def get_transform_init_args_names(self):
+ return ("max_value",)
+
+
+class FromFloat(ImageOnlyTransform):
+ """Take an input array where all values should lie in the range [0, 1.0], multiply them by `max_value` and then
+    cast the resulting value to a type specified by `dtype`. If `max_value` is None the transform will try to infer
+ the maximum value for the data type from the `dtype` argument.
+
+ This is the inverse transform for :class:`~albumentations.augmentations.transforms.ToFloat`.
+
+ Args:
+ max_value (float): maximum possible input value. Default: None.
+ dtype (string or numpy data type): data type of the output. See the `'Data types' page from the NumPy docs`_.
+ Default: 'uint16'.
+ p (float): probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image
+
+ Image types:
+ float32
+
+ .. _'Data types' page from the NumPy docs:
+ https://docs.scipy.org/doc/numpy/user/basics.types.html
+ """
+
+ def __init__(self, dtype="uint16", max_value=None, always_apply=False, p=1.0):
+ super(FromFloat, self).__init__(always_apply, p)
+ self.dtype = np.dtype(dtype)
+ self.max_value = max_value
+
+ def apply(self, img, **params):
+ return F.from_float(img, self.dtype, self.max_value)
+
+ def get_transform_init_args(self):
+ return {"dtype": self.dtype.name, "max_value": self.max_value}
+
+
+class Downscale(ImageOnlyTransform):
+ """Decreases image quality by downscaling and upscaling back.
+
+ Args:
+ scale_min (float): lower bound on the image scale. Should be < 1.
+        scale_max (float): upper bound on the image scale. Should be < 1.
+ interpolation: cv2 interpolation method. Could be:
+ - single cv2 interpolation flag - selected method will be used for downscale and upscale.
+ - dict(downscale=flag, upscale=flag)
+ - Downscale.Interpolation(downscale=flag, upscale=flag) -
+ Default: Interpolation(downscale=cv2.INTER_NEAREST, upscale=cv2.INTER_NEAREST)
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+ """
+
+ class Interpolation:
+ def __init__(self, *, downscale: int = cv2.INTER_NEAREST, upscale: int = cv2.INTER_NEAREST):
+ self.downscale = downscale
+ self.upscale = upscale
+
+ def __init__(
+ self,
+ scale_min: float = 0.25,
+ scale_max: float = 0.25,
+ interpolation: Optional[Union[int, Interpolation, Dict[str, int]]] = None,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super(Downscale, self).__init__(always_apply, p)
+ if interpolation is None:
+ self.interpolation = self.Interpolation(downscale=cv2.INTER_NEAREST, upscale=cv2.INTER_NEAREST)
+ warnings.warn(
+                "Using default interpolation INTER_NEAREST, which is sub-optimal. "
+                "Please specify interpolation mode for downscale and upscale explicitly. "
+                "For additional information see this PR https://github.com/albumentations-team/albumentations/pull/584"
+ )
+ elif isinstance(interpolation, int):
+ self.interpolation = self.Interpolation(downscale=interpolation, upscale=interpolation)
+ elif isinstance(interpolation, self.Interpolation):
+ self.interpolation = interpolation
+ elif isinstance(interpolation, dict):
+ self.interpolation = self.Interpolation(**interpolation)
+ else:
+ raise ValueError(
+ "Wrong interpolation data type. Supported types: `Optional[Union[int, Interpolation, Dict[str, int]]]`."
+ f" Got: {type(interpolation)}"
+ )
+
+ if scale_min > scale_max:
+            raise ValueError("Expected scale_min to be less than or equal to scale_max, got {} {}".format(scale_min, scale_max))
+ if scale_max >= 1:
+ raise ValueError("Expected scale_max to be less than 1, got {}".format(scale_max))
+ self.scale_min = scale_min
+ self.scale_max = scale_max
+
+ def apply(self, img: np.ndarray, scale: Optional[float] = None, **params) -> np.ndarray:
+ return F.downscale(
+ img,
+ scale=scale,
+ down_interpolation=self.interpolation.downscale,
+ up_interpolation=self.interpolation.upscale,
+ )
+
+ def get_params(self) -> Dict[str, Any]:
+ return {"scale": random.uniform(self.scale_min, self.scale_max)}
+
+ def get_transform_init_args_names(self) -> Tuple[str, str]:
+ return "scale_min", "scale_max"
+
+ def _to_dict(self) -> Dict[str, Any]:
+ result = super()._to_dict()
+ result["interpolation"] = {"upscale": self.interpolation.upscale, "downscale": self.interpolation.downscale}
+ return result
+
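+# Downscale usage sketch (illustrative, editor-added). Passing a dict picks separate
+# OpenCV interpolation flags for the downscale and upscale steps:
+#
+#   down = Downscale(
+#       scale_min=0.25,
+#       scale_max=0.5,
+#       interpolation={"downscale": cv2.INTER_AREA, "upscale": cv2.INTER_LINEAR},
+#       p=1.0,
+#   )
+#   degraded = down(image=img)["image"]
+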
+
+class Lambda(NoOp):
+ """A flexible transformation class for using user-defined transformation functions per targets.
+    Function signature must include **kwargs to accept optional arguments like interpolation method, image size, etc.:
+
+ Args:
+ image (callable): Image transformation function.
+ mask (callable): Mask transformation function.
+ keypoint (callable): Keypoint transformation function.
+ bbox (callable): BBox transformation function.
+ always_apply (bool): Indicates whether this transformation should be always applied.
+ p (float): probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image, mask, bboxes, keypoints
+
+ Image types:
+ Any
+ """
+
+ def __init__(
+ self,
+ image=None,
+ mask=None,
+ keypoint=None,
+ bbox=None,
+ name=None,
+ always_apply=False,
+ p=1.0,
+ ):
+ super(Lambda, self).__init__(always_apply, p)
+
+ self.name = name
+ self.custom_apply_fns = {target_name: F.noop for target_name in ("image", "mask", "keypoint", "bbox")}
+ for target_name, custom_apply_fn in {
+ "image": image,
+ "mask": mask,
+ "keypoint": keypoint,
+ "bbox": bbox,
+ }.items():
+ if custom_apply_fn is not None:
+                if isinstance(custom_apply_fn, LambdaType) and custom_apply_fn.__name__ == "<lambda>":
+ warnings.warn(
+ "Using lambda is incompatible with multiprocessing. "
+ "Consider using regular functions or partial()."
+ )
+
+ self.custom_apply_fns[target_name] = custom_apply_fn
+
+ def apply(self, img, **params):
+ fn = self.custom_apply_fns["image"]
+ return fn(img, **params)
+
+ def apply_to_mask(self, mask, **params):
+ fn = self.custom_apply_fns["mask"]
+ return fn(mask, **params)
+
+ def apply_to_bbox(self, bbox, **params):
+ fn = self.custom_apply_fns["bbox"]
+ return fn(bbox, **params)
+
+ def apply_to_keypoint(self, keypoint, **params):
+ fn = self.custom_apply_fns["keypoint"]
+ return fn(keypoint, **params)
+
+ @classmethod
+ def is_serializable(cls):
+ return False
+
+ def _to_dict(self):
+ if self.name is None:
+ raise ValueError(
+ "To make a Lambda transform serializable you should provide the `name` argument, "
+                "e.g. `Lambda(name='my_transform', image=<some func>, ...)`."
+ )
+ return {"__class_fullname__": self.get_class_fullname(), "__name__": self.name}
+
+ def __repr__(self):
+ state = {"name": self.name}
+ state.update(self.custom_apply_fns.items())
+ state.update(self.get_base_init_args())
+ return "{name}({args})".format(name=self.__class__.__name__, args=format_args(state))
+
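+# Lambda usage sketch (illustrative, editor-added). A named function (rather than a
+# lambda) stays multiprocessing-friendly, and `name` is required for serialization.
+# `scale_brightness` is a hypothetical helper:
+#
+#   def scale_brightness(img, **kwargs):
+#       return np.clip(img.astype(np.float32) * 1.2, 0, 255).astype(np.uint8)
+#
+#   aug = Lambda(name="scale_brightness", image=scale_brightness, p=1.0)
+#   out = aug(image=img)["image"]
+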
+
+class MultiplicativeNoise(ImageOnlyTransform):
+    """Multiply the image by a random number or array of numbers.
+
+ Args:
+        multiplier (float or tuple of floats): If a single float, the image will be multiplied by this number.
+            If a tuple of floats, the multiplier will be sampled from the range `[multiplier[0], multiplier[1])`.
+            Default: (0.9, 1.1).
+        per_channel (bool): If `False`, the same values will be used for all channels.
+            If `True`, sample a value for each channel. Default: False.
+        elementwise (bool): If `False`, multiply all pixels in the image by a single random value sampled once.
+            If `True`, multiply image pixels by values that are sampled independently for each pixel. Default: False.
+
+ Targets:
+ image
+
+ Image types:
+ Any
+ """
+
+ def __init__(
+ self,
+ multiplier=(0.9, 1.1),
+ per_channel=False,
+ elementwise=False,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(MultiplicativeNoise, self).__init__(always_apply, p)
+ self.multiplier = to_tuple(multiplier, multiplier)
+ self.per_channel = per_channel
+ self.elementwise = elementwise
+
+ def apply(self, img, multiplier=np.array([1]), **kwargs):
+ return F.multiply(img, multiplier)
+
+ def get_params_dependent_on_targets(self, params):
+ if self.multiplier[0] == self.multiplier[1]:
+ return {"multiplier": np.array([self.multiplier[0]])}
+
+ img = params["image"]
+
+ h, w = img.shape[:2]
+
+ if self.per_channel:
+ c = 1 if is_grayscale_image(img) else img.shape[-1]
+ else:
+ c = 1
+
+ if self.elementwise:
+ shape = [h, w, c]
+ else:
+ shape = [c]
+
+ multiplier = random_utils.uniform(self.multiplier[0], self.multiplier[1], shape)
+ if is_grayscale_image(img) and img.ndim == 2:
+ multiplier = np.squeeze(multiplier)
+
+ return {"multiplier": multiplier}
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def get_transform_init_args_names(self):
+ return "multiplier", "per_channel", "elementwise"
+
+
+class FancyPCA(ImageOnlyTransform):
+ """Augment RGB image using FancyPCA from Krizhevsky's paper
+ "ImageNet Classification with Deep Convolutional Neural Networks"
+
+ Args:
+        alpha (float): how much to perturb/scale the eigenvectors and eigenvalues.
+            The scale is sampled from a Gaussian distribution (mu=0, sigma=alpha).
+
+ Targets:
+ image
+
+ Image types:
+ 3-channel uint8 images only
+
+ Credit:
+ http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
+ https://deshanadesai.github.io/notes/Fancy-PCA-with-Scikit-Image
+ https://pixelatedbrian.github.io/2018-04-29-fancy_pca/
+ """
+
+ def __init__(self, alpha=0.1, always_apply=False, p=0.5):
+ super(FancyPCA, self).__init__(always_apply=always_apply, p=p)
+ self.alpha = alpha
+
+ def apply(self, img, alpha=0.1, **params):
+ img = F.fancy_pca(img, alpha)
+ return img
+
+ def get_params(self):
+ return {"alpha": random.gauss(0, self.alpha)}
+
+ def get_transform_init_args_names(self):
+ return ("alpha",)
+
+
+class ColorJitter(ImageOnlyTransform):
+ """Randomly changes the brightness, contrast, and saturation of an image. Compared to ColorJitter from torchvision,
+    this transform gives slightly different results because Pillow (used in torchvision) and OpenCV (used in
+ Albumentations) transform an image to HSV format by different formulas. Another difference - Pillow uses uint8
+ overflow, but we use value saturation.
+
+ Args:
+ brightness (float or tuple of float (min, max)): How much to jitter brightness.
+ brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
+ or the given [min, max]. Should be non negative numbers.
+ contrast (float or tuple of float (min, max)): How much to jitter contrast.
+ contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
+ or the given [min, max]. Should be non negative numbers.
+ saturation (float or tuple of float (min, max)): How much to jitter saturation.
+ saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
+ or the given [min, max]. Should be non negative numbers.
+ hue (float or tuple of float (min, max)): How much to jitter hue.
+ hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
+ Should have 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
+ """
+
+ def __init__(
+ self,
+ brightness=0.2,
+ contrast=0.2,
+ saturation=0.2,
+ hue=0.2,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(ColorJitter, self).__init__(always_apply=always_apply, p=p)
+
+ self.brightness = self.__check_values(brightness, "brightness")
+ self.contrast = self.__check_values(contrast, "contrast")
+ self.saturation = self.__check_values(saturation, "saturation")
+ self.hue = self.__check_values(hue, "hue", offset=0, bounds=[-0.5, 0.5], clip=False)
+
+ self.transforms = [
+ F.adjust_brightness_torchvision,
+ F.adjust_contrast_torchvision,
+ F.adjust_saturation_torchvision,
+ F.adjust_hue_torchvision,
+ ]
+
+ @staticmethod
+ def __check_values(value, name, offset=1, bounds=(0, float("inf")), clip=True):
+ if isinstance(value, numbers.Number):
+ if value < 0:
+ raise ValueError("If {} is a single number, it must be non negative.".format(name))
+ value = [offset - value, offset + value]
+ if clip:
+ value[0] = max(value[0], 0)
+ elif isinstance(value, (tuple, list)) and len(value) == 2:
+ if not bounds[0] <= value[0] <= value[1] <= bounds[1]:
+ raise ValueError("{} values should be between {}".format(name, bounds))
+ else:
+ raise TypeError("{} should be a single number or a list/tuple with length 2.".format(name))
+
+ return value
+
+ def get_params(self):
+ brightness = random.uniform(self.brightness[0], self.brightness[1])
+ contrast = random.uniform(self.contrast[0], self.contrast[1])
+ saturation = random.uniform(self.saturation[0], self.saturation[1])
+ hue = random.uniform(self.hue[0], self.hue[1])
+
+ order = [0, 1, 2, 3]
+ random.shuffle(order)
+
+ return {
+ "brightness": brightness,
+ "contrast": contrast,
+ "saturation": saturation,
+ "hue": hue,
+ "order": order,
+ }
+
+    def apply(self, img, brightness=1.0, contrast=1.0, saturation=1.0, hue=0, order=(0, 1, 2, 3), **params):
+ if not is_rgb_image(img) and not is_grayscale_image(img):
+ raise TypeError("ColorJitter transformation expects 1-channel or 3-channel images.")
+ params = [brightness, contrast, saturation, hue]
+ for i in order:
+ img = self.transforms[i](img, params[i])
+ return img
+
+ def get_transform_init_args_names(self):
+ return ("brightness", "contrast", "saturation", "hue")
+
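+# ColorJitter usage sketch (illustrative, editor-added; works on 1-channel or
+# 3-channel images):
+#
+#   jitter = ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=1.0)
+#   out = jitter(image=img)["image"]
+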
+
+class Sharpen(ImageOnlyTransform):
+    """Sharpen the input image and overlay the result with the original image.
+
+ Args:
+ alpha ((float, float)): range to choose the visibility of the sharpened image. At 0, only the original image is
+ visible, at 1.0 only its sharpened version is visible. Default: (0.2, 0.5).
+ lightness ((float, float)): range to choose the lightness of the sharpened image. Default: (0.5, 1.0).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+ """
+
+ def __init__(self, alpha=(0.2, 0.5), lightness=(0.5, 1.0), always_apply=False, p=0.5):
+ super(Sharpen, self).__init__(always_apply, p)
+ self.alpha = self.__check_values(to_tuple(alpha, 0.0), name="alpha", bounds=(0.0, 1.0))
+ self.lightness = self.__check_values(to_tuple(lightness, 0.0), name="lightness")
+
+ @staticmethod
+ def __check_values(value, name, bounds=(0, float("inf"))):
+ if not bounds[0] <= value[0] <= value[1] <= bounds[1]:
+ raise ValueError("{} values should be between {}".format(name, bounds))
+ return value
+
+ @staticmethod
+ def __generate_sharpening_matrix(alpha_sample, lightness_sample):
+ matrix_nochange = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]], dtype=np.float32)
+ matrix_effect = np.array(
+ [[-1, -1, -1], [-1, 8 + lightness_sample, -1], [-1, -1, -1]],
+ dtype=np.float32,
+ )
+
+ matrix = (1 - alpha_sample) * matrix_nochange + alpha_sample * matrix_effect
+ return matrix
+
+ def get_params(self):
+ alpha = random.uniform(*self.alpha)
+ lightness = random.uniform(*self.lightness)
+ sharpening_matrix = self.__generate_sharpening_matrix(alpha_sample=alpha, lightness_sample=lightness)
+ return {"sharpening_matrix": sharpening_matrix}
+
+ def apply(self, img, sharpening_matrix=None, **params):
+ return F.convolve(img, sharpening_matrix)
+
+ def get_transform_init_args_names(self):
+ return ("alpha", "lightness")
+
+
+class Emboss(ImageOnlyTransform):
+    """Emboss the input image and overlay the result with the original image.
+
+ Args:
+ alpha ((float, float)): range to choose the visibility of the embossed image. At 0, only the original image is
+            visible, at 1.0 only its embossed version is visible. Default: (0.2, 0.5).
+ strength ((float, float)): strength range of the embossing. Default: (0.2, 0.7).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+ """
+
+ def __init__(self, alpha=(0.2, 0.5), strength=(0.2, 0.7), always_apply=False, p=0.5):
+ super(Emboss, self).__init__(always_apply, p)
+ self.alpha = self.__check_values(to_tuple(alpha, 0.0), name="alpha", bounds=(0.0, 1.0))
+ self.strength = self.__check_values(to_tuple(strength, 0.0), name="strength")
+
+ @staticmethod
+ def __check_values(value, name, bounds=(0, float("inf"))):
+ if not bounds[0] <= value[0] <= value[1] <= bounds[1]:
+ raise ValueError("{} values should be between {}".format(name, bounds))
+ return value
+
+ @staticmethod
+ def __generate_emboss_matrix(alpha_sample, strength_sample):
+ matrix_nochange = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]], dtype=np.float32)
+ matrix_effect = np.array(
+ [
+ [-1 - strength_sample, 0 - strength_sample, 0],
+ [0 - strength_sample, 1, 0 + strength_sample],
+ [0, 0 + strength_sample, 1 + strength_sample],
+ ],
+ dtype=np.float32,
+ )
+ matrix = (1 - alpha_sample) * matrix_nochange + alpha_sample * matrix_effect
+ return matrix
+
+ def get_params(self):
+ alpha = random.uniform(*self.alpha)
+ strength = random.uniform(*self.strength)
+ emboss_matrix = self.__generate_emboss_matrix(alpha_sample=alpha, strength_sample=strength)
+ return {"emboss_matrix": emboss_matrix}
+
+ def apply(self, img, emboss_matrix=None, **params):
+ return F.convolve(img, emboss_matrix)
+
+ def get_transform_init_args_names(self):
+ return ("alpha", "strength")
+
+
+class Superpixels(ImageOnlyTransform):
+ """Transform images partially/completely to their superpixel representation.
+ This implementation uses skimage's version of the SLIC algorithm.
+
+ Args:
+ p_replace (float or tuple of float): Defines for any segment the probability that the pixels within that
+ segment are replaced by their average color (otherwise, the pixels are not changed).
+ Examples:
+ * A probability of ``0.0`` would mean, that the pixels in no
+ segment are replaced by their average color (image is not
+ changed at all).
+ * A probability of ``0.5`` would mean, that around half of all
+ segments are replaced by their average color.
+ * A probability of ``1.0`` would mean, that all segments are
+ replaced by their average color (resulting in a voronoi
+ image).
+ Behaviour based on chosen data types for this parameter:
+                * If a ``float``, then that ``float`` will always be used.
+ * If ``tuple`` ``(a, b)``, then a random probability will be
+ sampled from the interval ``[a, b]`` per image.
+ n_segments (int, or tuple of int): Rough target number of how many superpixels to generate (the algorithm
+ may deviate from this number). Lower value will lead to coarser superpixels.
+            Higher values are computationally more intensive and will hence lead to a slowdown.
+ * If a single ``int``, then that value will always be used as the
+ number of segments.
+ * If a ``tuple`` ``(a, b)``, then a value from the discrete
+ interval ``[a..b]`` will be sampled per image.
+ max_size (int or None): Maximum image size at which the augmentation is performed.
+ If the width or height of an image exceeds this value, it will be
+ downscaled before the augmentation so that the longest side matches `max_size`.
+ This is done to speed up the process. The final output image has the same size as the input image.
+ Note that in case `p_replace` is below ``1.0``,
+ the down-/upscaling will affect the not-replaced pixels too.
+ Use ``None`` to apply no down-/upscaling.
+ interpolation (OpenCV flag): flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+ """
+
+ def __init__(
+ self,
+ p_replace: Union[float, Sequence[float]] = 0.1,
+ n_segments: Union[int, Sequence[int]] = 100,
+ max_size: Optional[int] = 128,
+ interpolation: int = cv2.INTER_LINEAR,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply=always_apply, p=p)
+ self.p_replace = to_tuple(p_replace, p_replace)
+ self.n_segments = to_tuple(n_segments, n_segments)
+ self.max_size = max_size
+ self.interpolation = interpolation
+
+ if min(self.n_segments) < 1:
+ raise ValueError(f"n_segments must be >= 1. Got: {n_segments}")
+
+ def get_transform_init_args_names(self) -> Tuple[str, str, str, str]:
+ return ("p_replace", "n_segments", "max_size", "interpolation")
+
+ def get_params(self) -> dict:
+ n_segments = random.randint(*self.n_segments)
+ p = random.uniform(*self.p_replace)
+ return {"replace_samples": random_utils.random(n_segments) < p, "n_segments": n_segments}
+
+ def apply(self, img: np.ndarray, replace_samples: Sequence[bool] = (False,), n_segments: int = 1, **kwargs):
+ return F.superpixels(img, n_segments, replace_samples, self.max_size, self.interpolation)
+
+
+class TemplateTransform(ImageOnlyTransform):
+ """
+ Apply blending of input image with specified templates
+ Args:
+ templates (numpy array or list of numpy arrays): Images as template for transform.
+ img_weight ((float, float) or float): If single float will be used as weight for input image.
+ If tuple of float img_weight will be in range `[img_weight[0], img_weight[1])`. Default: 0.5.
+ template_weight ((float, float) or float): If single float will be used as weight for template.
+ If tuple of float template_weight will be in range `[template_weight[0], template_weight[1])`.
+ Default: 0.5.
+ template_transform: transformation object which could be applied to template,
+ must produce template the same size as input image.
+ name (string): (Optional) Name of transform, used only for deserialization.
+ p (float): probability of applying the transform. Default: 0.5.
+ Targets:
+ image
+ Image types:
+ uint8, float32
+ """
+
+ def __init__(
+ self,
+ templates,
+ img_weight=0.5,
+ template_weight=0.5,
+ template_transform=None,
+ name=None,
+ always_apply=False,
+ p=0.5,
+ ):
+ super().__init__(always_apply, p)
+
+ self.templates = templates if isinstance(templates, (list, tuple)) else [templates]
+ self.img_weight = to_tuple(img_weight, img_weight)
+ self.template_weight = to_tuple(template_weight, template_weight)
+ self.template_transform = template_transform
+ self.name = name
+
+ def apply(self, img, template=None, img_weight=0.5, template_weight=0.5, **params):
+ return F.add_weighted(img, img_weight, template, template_weight)
+
+ def get_params(self):
+ return {
+ "img_weight": random.uniform(self.img_weight[0], self.img_weight[1]),
+ "template_weight": random.uniform(self.template_weight[0], self.template_weight[1]),
+ }
+
+ def get_params_dependent_on_targets(self, params):
+ img = params["image"]
+ template = random.choice(self.templates)
+
+ if self.template_transform is not None:
+ template = self.template_transform(image=template)["image"]
+
+ if get_num_channels(template) not in [1, get_num_channels(img)]:
+ raise ValueError(
+ "Template must be a single channel or "
+                "have the same number of channels as the input image ({}), got {}".format(
+ get_num_channels(img), get_num_channels(template)
+ )
+ )
+
+ if template.dtype != img.dtype:
+ raise ValueError("Image and template must be the same image type")
+
+ if img.shape[:2] != template.shape[:2]:
+ raise ValueError(
+ "Image and template must be the same size, got {} and {}".format(img.shape[:2], template.shape[:2])
+ )
+
+ if get_num_channels(template) == 1 and get_num_channels(img) > 1:
+ template = np.stack((template,) * get_num_channels(img), axis=-1)
+
+ # in order to support grayscale image with dummy dim
+ template = template.reshape(img.shape)
+
+ return {"template": template}
+
+ @classmethod
+ def is_serializable(cls):
+ return False
+
+ @property
+ def targets_as_params(self):
+ return ["image"]
+
+ def _to_dict(self):
+ if self.name is None:
+ raise ValueError(
+ "To make a TemplateTransform serializable you should provide the `name` argument, "
+ "e.g. `TemplateTransform(name='my_transform', ...)`."
+ )
+ return {"__class_fullname__": self.get_class_fullname(), "__name__": self.name}
+
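+# TemplateTransform usage sketch (illustrative, editor-added). `template_img` is a
+# hypothetical numpy array with the same size and dtype as the input image:
+#
+#   blend = TemplateTransform(templates=template_img, img_weight=0.7, template_weight=0.3, p=1.0)
+#   out = blend(image=img)["image"]
+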
+
+class RingingOvershoot(ImageOnlyTransform):
+    """Create ringing or overshoot artefacts by convolving the image with a 2D sinc filter.
+
+ Args:
+ blur_limit (int, (int, int)): maximum kernel size for sinc filter.
+ Should be in range [3, inf). Default: (7, 15).
+ cutoff (float, (float, float)): range to choose the cutoff frequency in radians.
+ Should be in range (0, np.pi)
+ Default: (np.pi / 4, np.pi / 2).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Reference:
+ dsp.stackexchange.com/questions/58301/2-d-circularly-symmetric-low-pass-filter
+ https://arxiv.org/abs/2107.10833
+
+ Targets:
+ image
+ """
+
+ def __init__(
+ self,
+ blur_limit: Union[int, Sequence[int]] = (7, 15),
+ cutoff: Union[float, Sequence[float]] = (np.pi / 4, np.pi / 2),
+ always_apply=False,
+ p=0.5,
+ ):
+ super(RingingOvershoot, self).__init__(always_apply, p)
+ self.blur_limit = to_tuple(blur_limit, 3)
+ self.cutoff = self.__check_values(to_tuple(cutoff, np.pi / 2), name="cutoff", bounds=(0, np.pi))
+
+ @staticmethod
+ def __check_values(value, name, bounds=(0, float("inf"))):
+ if not bounds[0] <= value[0] <= value[1] <= bounds[1]:
+ raise ValueError(f"{name} values should be between {bounds}")
+ return value
+
+ def get_params(self):
+ ksize = random.randrange(self.blur_limit[0], self.blur_limit[1] + 1, 2)
+ if ksize % 2 == 0:
+ raise ValueError(f"Kernel size must be odd. Got: {ksize}")
+
+ cutoff = random.uniform(*self.cutoff)
+
+ # From dsp.stackexchange.com/questions/58301/2-d-circularly-symmetric-low-pass-filter
+ with np.errstate(divide="ignore", invalid="ignore"):
+ kernel = np.fromfunction(
+ lambda x, y: cutoff
+ * special.j1(cutoff * np.sqrt((x - (ksize - 1) / 2) ** 2 + (y - (ksize - 1) / 2) ** 2))
+ / (2 * np.pi * np.sqrt((x - (ksize - 1) / 2) ** 2 + (y - (ksize - 1) / 2) ** 2)),
+ [ksize, ksize],
+ )
+ kernel[(ksize - 1) // 2, (ksize - 1) // 2] = cutoff**2 / (4 * np.pi)
+
+ # Normalize kernel
+ kernel = kernel.astype(np.float32) / np.sum(kernel)
+
+ return {"kernel": kernel}
+
+ def apply(self, img, kernel=None, **params):
+ return F.convolve(img, kernel)
+
+ def get_transform_init_args_names(self):
+ return ("blur_limit", "cutoff")
+
+
+class UnsharpMask(ImageOnlyTransform):
+ """
+    Sharpen the input image using Unsharp Masking processing and overlay the result with the original image.
+
+ Args:
+ blur_limit (int, (int, int)): maximum Gaussian kernel size for blurring the input image.
+ Must be zero or odd and in range [0, inf). If set to 0 it will be computed from sigma
+ as `round(sigma * (3 if img.dtype == np.uint8 else 4) * 2 + 1) + 1`.
+ If set single value `blur_limit` will be in range (0, blur_limit).
+ Default: (3, 7).
+ sigma_limit (float, (float, float)): Gaussian kernel standard deviation. Must be in range [0, inf).
+ If set single value `sigma_limit` will be in range (0, sigma_limit).
+ If set to 0 sigma will be computed as `sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8`. Default: 0.
+ alpha (float, (float, float)): range to choose the visibility of the sharpened image.
+ At 0, only the original image is visible, at 1.0 only its sharpened version is visible.
+ Default: (0.2, 0.5).
+ threshold (int): Value to limit sharpening only for areas with high pixel difference between original image
+            and its smoothed version. Higher threshold means less sharpening on flat areas.
+ Must be in range [0, 255]. Default: 10.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Reference:
+ arxiv.org/pdf/2107.10833.pdf
+
+ Targets:
+ image
+ """
+
+ def __init__(
+ self,
+ blur_limit: Union[int, Sequence[int]] = (3, 7),
+ sigma_limit: Union[float, Sequence[float]] = 0.0,
+ alpha: Union[float, Sequence[float]] = (0.2, 0.5),
+ threshold: int = 10,
+ always_apply=False,
+ p=0.5,
+ ):
+ super(UnsharpMask, self).__init__(always_apply, p)
+ self.blur_limit = to_tuple(blur_limit, 3)
+ self.sigma_limit = self.__check_values(to_tuple(sigma_limit, 0.0), name="sigma_limit")
+ self.alpha = self.__check_values(to_tuple(alpha, 0.0), name="alpha", bounds=(0.0, 1.0))
+ self.threshold = threshold
+
+ if self.blur_limit[0] == 0 and self.sigma_limit[0] == 0:
+ self.blur_limit = 3, max(3, self.blur_limit[1])
+ raise ValueError("blur_limit and sigma_limit minimum value can not be both equal to 0.")
+
+ if (self.blur_limit[0] != 0 and self.blur_limit[0] % 2 != 1) or (
+ self.blur_limit[1] != 0 and self.blur_limit[1] % 2 != 1
+ ):
+ raise ValueError("UnsharpMask supports only odd blur limits.")
+
+ @staticmethod
+ def __check_values(value, name, bounds=(0, float("inf"))):
+ if not bounds[0] <= value[0] <= value[1] <= bounds[1]:
+ raise ValueError(f"{name} values should be between {bounds}")
+ return value
+
+ def get_params(self):
+ return {
+ "ksize": random.randrange(self.blur_limit[0], self.blur_limit[1] + 1, 2),
+ "sigma": random.uniform(*self.sigma_limit),
+ "alpha": random.uniform(*self.alpha),
+ }
+
+ def apply(self, img, ksize=3, sigma=0, alpha=0.2, **params):
+ return F.unsharp_mask(img, ksize, sigma=sigma, alpha=alpha, threshold=self.threshold)
+
+ def get_transform_init_args_names(self):
+ return ("blur_limit", "sigma_limit", "alpha", "threshold")
+
+
+class PixelDropout(DualTransform):
+ """Set pixels to 0 with some probability.
+
+ Args:
+ dropout_prob (float): pixel drop probability. Default: 0.01
+        per_channel (bool): if set to `True` drop mask will be sampled for each channel,
+ otherwise the same mask will be sampled for all channels. Default: False
+ drop_value (number or sequence of numbers or None): Value that will be set in dropped place.
+ If set to None value will be sampled randomly, default ranges will be used:
+ - uint8 - [0, 255]
+ - uint16 - [0, 65535]
+ - uint32 - [0, 4294967295]
+ - float, double - [0, 1]
+ Default: 0
+ mask_drop_value (number or sequence of numbers or None): Value that will be set in dropped place in masks.
+ If set to None masks will be unchanged. Default: 0
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask
+ Image types:
+ any
+ """
+
+ def __init__(
+ self,
+ dropout_prob: float = 0.01,
+ per_channel: bool = False,
+ drop_value: Optional[Union[float, Sequence[float]]] = 0,
+ mask_drop_value: Optional[Union[float, Sequence[float]]] = None,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply, p)
+ self.dropout_prob = dropout_prob
+ self.per_channel = per_channel
+ self.drop_value = drop_value
+ self.mask_drop_value = mask_drop_value
+
+ if self.mask_drop_value is not None and self.per_channel:
+ raise ValueError("PixelDropout supports mask only with per_channel=False")
+
+ def apply(
+ self,
+ img: np.ndarray,
+ drop_mask: np.ndarray = np.array(None),
+ drop_value: Union[float, Sequence[float]] = (),
+ **params
+ ) -> np.ndarray:
+ return F.pixel_dropout(img, drop_mask, drop_value)
+
+ def apply_to_mask(self, img: np.ndarray, drop_mask: np.ndarray = np.array(None), **params) -> np.ndarray:
+ if self.mask_drop_value is None:
+ return img
+
+ if img.ndim == 2:
+ drop_mask = np.squeeze(drop_mask)
+
+ return F.pixel_dropout(img, drop_mask, self.mask_drop_value)
+
+ def apply_to_bbox(self, bbox, **params):
+ return bbox
+
+ def apply_to_keypoint(self, keypoint, **params):
+ return keypoint
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, Any]:
+ img = params["image"]
+ shape = img.shape if self.per_channel else img.shape[:2]
+
+ rnd = np.random.RandomState(random.randint(0, 1 << 31))
+        # Use choice to create a boolean matrix; using binomial instead would require a type conversion afterwards
+ drop_mask = rnd.choice([True, False], shape, p=[self.dropout_prob, 1 - self.dropout_prob])
+
+ drop_value: Union[float, Sequence[float], np.ndarray]
+ if drop_mask.ndim != img.ndim:
+ drop_mask = np.expand_dims(drop_mask, -1)
+ if self.drop_value is None:
+ drop_shape = 1 if is_grayscale_image(img) else int(img.shape[-1])
+
+ if img.dtype in (np.uint8, np.uint16, np.uint32):
+ drop_value = rnd.randint(0, int(F.MAX_VALUES_BY_DTYPE[img.dtype]), drop_shape, img.dtype)
+ elif img.dtype in [np.float32, np.double]:
+ drop_value = rnd.uniform(0, 1, drop_shape).astype(img.dtype)
+ else:
+ raise ValueError(f"Unsupported dtype: {img.dtype}")
+ else:
+ drop_value = self.drop_value
+
+ return {"drop_mask": drop_mask, "drop_value": drop_value}
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return ["image"]
+
+ def get_transform_init_args_names(self) -> Tuple[str, str, str, str]:
+ return ("dropout_prob", "per_channel", "drop_value", "mask_drop_value")
+
+
+class Spatter(ImageOnlyTransform):
+ """
+ Apply spatter transform. It simulates corruption which can occlude a lens in the form of rain or mud.
+
+ Args:
+ mean (float, or tuple of floats): Mean value of normal distribution for generating liquid layer.
+ If single float it will be used as mean.
+ If tuple of float mean will be sampled from range `[mean[0], mean[1])`. Default: (0.65).
+ std (float, or tuple of floats): Standard deviation value of normal distribution for generating liquid layer.
+ If single float it will be used as std.
+ If tuple of float std will be sampled from range `[std[0], std[1])`. Default: (0.3).
+ gauss_sigma (float, or tuple of floats): Sigma value for gaussian filtering of liquid layer.
+ If single float it will be used as gauss_sigma.
+ If tuple of float gauss_sigma will be sampled from range `[sigma[0], sigma[1])`. Default: (2).
+        cutout_threshold (float, or tuple of floats): Threshold for filtering liquid layer
+            (determines number of drops). If single float it will be used as cutout_threshold.
+ If tuple of float cutout_threshold will be sampled from range `[cutout_threshold[0], cutout_threshold[1])`.
+ Default: (0.68).
+ intensity (float, or tuple of floats): Intensity of corruption.
+ If single float it will be used as intensity.
+ If tuple of float intensity will be sampled from range `[intensity[0], intensity[1])`. Default: (0.6).
+ mode (string, or list of strings): Type of corruption. Currently, supported options are 'rain' and 'mud'.
+            If a list is provided, the type of corruption will be sampled from it. Default: ("rain").
+ color (list of (r, g, b) or dict or None): Corruption elements color.
+ If list uses provided list as color for specified mode.
+ If dict uses provided color for specified mode. Color for each specified mode should be provided in dict.
+ If None uses default colors (rain: (238, 238, 175), mud: (20, 42, 63)).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+
+ Image types:
+ uint8, float32
+
+ Reference:
+ | https://arxiv.org/pdf/1903.12261.pdf
+ | https://github.com/hendrycks/robustness/blob/master/ImageNet-C/create_c/make_imagenet_c.py
+ """
+
+ def __init__(
+ self,
+ mean: ScaleFloatType = 0.65,
+ std: ScaleFloatType = 0.3,
+ gauss_sigma: ScaleFloatType = 2,
+ cutout_threshold: ScaleFloatType = 0.68,
+ intensity: ScaleFloatType = 0.6,
+ mode: Union[str, Sequence[str]] = "rain",
+ color: Optional[Union[Sequence[int], Dict[str, Sequence[int]]]] = None,
+ always_apply: bool = False,
+ p: float = 0.5,
+ ):
+ super().__init__(always_apply=always_apply, p=p)
+
+ self.mean = to_tuple(mean, mean)
+ self.std = to_tuple(std, std)
+ self.gauss_sigma = to_tuple(gauss_sigma, gauss_sigma)
+ self.intensity = to_tuple(intensity, intensity)
+ self.cutout_threshold = to_tuple(cutout_threshold, cutout_threshold)
+ self.color = (
+ color
+ if color is not None
+ else {
+ "rain": [238, 238, 175],
+ "mud": [20, 42, 63],
+ }
+ )
+ self.mode = mode if isinstance(mode, (list, tuple)) else [mode]
+
+ if len(set(self.mode)) > 1 and not isinstance(self.color, dict):
+ raise ValueError(f"Unsupported color: {self.color}. Please specify color for each mode (use dict for it).")
+
+ for i in self.mode:
+ if i not in ["rain", "mud"]:
+                raise ValueError(f"Unsupported color mode: {i}. Transform supports only `rain` and `mud` modes.")
+ if isinstance(self.color, dict):
+ if i not in self.color:
+ raise ValueError(f"Wrong color definition: {self.color}. Color for mode: {i} not specified.")
+ if len(self.color[i]) != 3:
+ raise ValueError(
+ f"Unsupported color: {self.color[i]} for mode {i}. Color should be presented in RGB format."
+ )
+
+ if isinstance(self.color, (list, tuple)):
+ if len(self.color) != 3:
+ raise ValueError(f"Unsupported color: {self.color}. Color should be presented in RGB format.")
+ self.color = {self.mode[0]: self.color}
+
+ def apply(
+ self,
+ img: np.ndarray,
+ non_mud: Optional[np.ndarray] = None,
+ mud: Optional[np.ndarray] = None,
+ drops: Optional[np.ndarray] = None,
+ mode: str = "",
+ **params
+ ) -> np.ndarray:
+ return F.spatter(img, non_mud, mud, drops, mode)
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return ["image"]
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, Any]:
+ h, w = params["image"].shape[:2]
+
+ mean = random.uniform(self.mean[0], self.mean[1])
+ std = random.uniform(self.std[0], self.std[1])
+ cutout_threshold = random.uniform(self.cutout_threshold[0], self.cutout_threshold[1])
+ sigma = random.uniform(self.gauss_sigma[0], self.gauss_sigma[1])
+ mode = random.choice(self.mode)
+ intensity = random.uniform(self.intensity[0], self.intensity[1])
+ color = np.array(self.color[mode]) / 255.0
+
+ liquid_layer = random_utils.normal(size=(h, w), loc=mean, scale=std)
+ liquid_layer = gaussian_filter(liquid_layer, sigma=sigma, mode="nearest")
+ liquid_layer[liquid_layer < cutout_threshold] = 0
+
+ if mode == "rain":
+ liquid_layer = (liquid_layer * 255).astype(np.uint8)
+ dist = 255 - cv2.Canny(liquid_layer, 50, 150)
+ dist = cv2.distanceTransform(dist, cv2.DIST_L2, 5)
+ _, dist = cv2.threshold(dist, 20, 20, cv2.THRESH_TRUNC)
+ dist = blur(dist, 3).astype(np.uint8)
+ dist = F.equalize(dist)
+
+ ker = np.array([[-2, -1, 0], [-1, 1, 1], [0, 1, 2]])
+ dist = F.convolve(dist, ker)
+ dist = blur(dist, 3).astype(np.float32)
+
+ m = liquid_layer * dist
+ m *= 1 / np.max(m, axis=(0, 1))
+
+ drops = m[:, :, None] * color * intensity
+ mud = None
+ non_mud = None
+ else:
+ m = np.where(liquid_layer > cutout_threshold, 1, 0)
+ m = gaussian_filter(m.astype(np.float32), sigma=sigma, mode="nearest")
+ m[m < 1.2 * cutout_threshold] = 0
+ m = m[..., np.newaxis]
+
+ mud = m * color
+ non_mud = 1 - m
+ drops = None
+
+ return {
+ "non_mud": non_mud,
+ "mud": mud,
+ "drops": drops,
+ "mode": mode,
+ }
+
+ def get_transform_init_args_names(self) -> Tuple[str, str, str, str, str, str, str]:
+ return "mean", "std", "gauss_sigma", "intensity", "cutout_threshold", "mode", "color"
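+
+
+# --- Illustrative usage sketch (not part of the upstream albumentations source) ---
+# A minimal, hedged example of the three transforms defined above; the dummy image and the
+# parameter values are arbitrary. It assumes these classes behave like their upstream
+# albumentations counterparts. Run this file as a module (python -m ...) so the
+# package-relative imports at the top resolve.
+if __name__ == "__main__":
+    sample = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)  # dummy RGB image
+
+    # Sharpen, then drop roughly 5% of the pixels, then overlay a rain-like spatter.
+    for aug in (
+        UnsharpMask(blur_limit=(3, 7), alpha=(0.2, 0.5), p=1.0),
+        PixelDropout(dropout_prob=0.05, drop_value=0, p=1.0),
+        Spatter(mode="rain", p=1.0),
+    ):
+        sample = aug(image=sample)["image"]
+    print("augmented image:", sample.shape, sample.dtype)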
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/augmentations/utils.py b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1460549d07cb57c4b41e97a4de6af5074a60e112
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/augmentations/utils.py
@@ -0,0 +1,211 @@
+from functools import wraps
+from typing import Callable, Union
+
+import cv2
+import numpy as np
+from typing_extensions import Concatenate, ParamSpec
+
+from custom_albumentations.core.keypoints_utils import angle_to_2pi_range
+from custom_albumentations.core.transforms_interface import KeypointInternalType
+
+__all__ = [
+ "read_bgr_image",
+ "read_rgb_image",
+ "MAX_VALUES_BY_DTYPE",
+ "NPDTYPE_TO_OPENCV_DTYPE",
+ "clipped",
+ "get_opencv_dtype_from_numpy",
+ "angle_2pi_range",
+ "clip",
+ "preserve_shape",
+ "preserve_channel_dim",
+ "ensure_contiguous",
+ "is_rgb_image",
+ "is_grayscale_image",
+ "is_multispectral_image",
+ "get_num_channels",
+ "non_rgb_warning",
+ "_maybe_process_in_chunks",
+]
+
+P = ParamSpec("P")
+
+MAX_VALUES_BY_DTYPE = {
+ np.dtype("uint8"): 255,
+ np.dtype("uint16"): 65535,
+ np.dtype("uint32"): 4294967295,
+ np.dtype("float32"): 1.0,
+}
+
+NPDTYPE_TO_OPENCV_DTYPE = {
+ np.uint8: cv2.CV_8U, # type: ignore[attr-defined]
+ np.uint16: cv2.CV_16U, # type: ignore[attr-defined]
+ np.int32: cv2.CV_32S, # type: ignore[attr-defined]
+ np.float32: cv2.CV_32F, # type: ignore[attr-defined]
+ np.float64: cv2.CV_64F, # type: ignore[attr-defined]
+ np.dtype("uint8"): cv2.CV_8U, # type: ignore[attr-defined]
+ np.dtype("uint16"): cv2.CV_16U, # type: ignore[attr-defined]
+ np.dtype("int32"): cv2.CV_32S, # type: ignore[attr-defined]
+ np.dtype("float32"): cv2.CV_32F, # type: ignore[attr-defined]
+ np.dtype("float64"): cv2.CV_64F, # type: ignore[attr-defined]
+}
+
+
+def read_bgr_image(path):
+ return cv2.imread(path, cv2.IMREAD_COLOR)
+
+
+def read_rgb_image(path):
+ image = cv2.imread(path, cv2.IMREAD_COLOR)
+ return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+
+def clipped(func: Callable[Concatenate[np.ndarray, P], np.ndarray]) -> Callable[Concatenate[np.ndarray, P], np.ndarray]:
+ @wraps(func)
+ def wrapped_function(img: np.ndarray, *args: P.args, **kwargs: P.kwargs) -> np.ndarray:
+ dtype = img.dtype
+ maxval = MAX_VALUES_BY_DTYPE.get(dtype, 1.0)
+ return clip(func(img, *args, **kwargs), dtype, maxval)
+
+ return wrapped_function
+
+
+def clip(img: np.ndarray, dtype: np.dtype, maxval: float) -> np.ndarray:
+ return np.clip(img, 0, maxval).astype(dtype)
+
+
+def get_opencv_dtype_from_numpy(value: Union[np.ndarray, int, np.dtype, object]) -> int:
+ """
+    Return the OpenCV dtype corresponding to a NumPy dtype.
+ :param value: Input dtype of numpy array
+ :return: Corresponding dtype for OpenCV
+ """
+ if isinstance(value, np.ndarray):
+ value = value.dtype
+ return NPDTYPE_TO_OPENCV_DTYPE[value]
+
+
+def angle_2pi_range(
+ func: Callable[Concatenate[KeypointInternalType, P], KeypointInternalType]
+) -> Callable[Concatenate[KeypointInternalType, P], KeypointInternalType]:
+ @wraps(func)
+ def wrapped_function(keypoint: KeypointInternalType, *args: P.args, **kwargs: P.kwargs) -> KeypointInternalType:
+ (x, y, a, s) = func(keypoint, *args, **kwargs)[:4]
+ return (x, y, angle_to_2pi_range(a), s)
+
+ return wrapped_function
+
+
+def preserve_shape(
+ func: Callable[Concatenate[np.ndarray, P], np.ndarray]
+) -> Callable[Concatenate[np.ndarray, P], np.ndarray]:
+ """Preserve shape of the image"""
+
+ @wraps(func)
+ def wrapped_function(img: np.ndarray, *args: P.args, **kwargs: P.kwargs) -> np.ndarray:
+ shape = img.shape
+ result = func(img, *args, **kwargs)
+ result = result.reshape(shape)
+ return result
+
+ return wrapped_function
+
+
+def preserve_channel_dim(
+ func: Callable[Concatenate[np.ndarray, P], np.ndarray]
+) -> Callable[Concatenate[np.ndarray, P], np.ndarray]:
+ """Preserve dummy channel dim."""
+
+ @wraps(func)
+ def wrapped_function(img: np.ndarray, *args: P.args, **kwargs: P.kwargs) -> np.ndarray:
+ shape = img.shape
+ result = func(img, *args, **kwargs)
+ if len(shape) == 3 and shape[-1] == 1 and len(result.shape) == 2:
+ result = np.expand_dims(result, axis=-1)
+ return result
+
+ return wrapped_function
+
+
+def ensure_contiguous(
+ func: Callable[Concatenate[np.ndarray, P], np.ndarray]
+) -> Callable[Concatenate[np.ndarray, P], np.ndarray]:
+ """Ensure that input img is contiguous."""
+
+ @wraps(func)
+ def wrapped_function(img: np.ndarray, *args: P.args, **kwargs: P.kwargs) -> np.ndarray:
+ img = np.require(img, requirements=["C_CONTIGUOUS"])
+ result = func(img, *args, **kwargs)
+ return result
+
+ return wrapped_function
+
+
+def is_rgb_image(image: np.ndarray) -> bool:
+ return len(image.shape) == 3 and image.shape[-1] == 3
+
+
+def is_grayscale_image(image: np.ndarray) -> bool:
+ return (len(image.shape) == 2) or (len(image.shape) == 3 and image.shape[-1] == 1)
+
+
+def is_multispectral_image(image: np.ndarray) -> bool:
+ return len(image.shape) == 3 and image.shape[-1] not in [1, 3]
+
+
+def get_num_channels(image: np.ndarray) -> int:
+ return image.shape[2] if len(image.shape) == 3 else 1
+
+
+def non_rgb_warning(image: np.ndarray) -> None:
+ if not is_rgb_image(image):
+ message = "This transformation expects 3-channel images"
+ if is_grayscale_image(image):
+            message += "\nYou can convert your grayscale image to RGB using cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)"
+ if is_multispectral_image(image): # Any image with a number of channels other than 1 and 3
+ message += "\nThis transformation cannot be applied to multi-spectral images"
+
+ raise ValueError(message)
+
+
+def _maybe_process_in_chunks(
+ process_fn: Callable[Concatenate[np.ndarray, P], np.ndarray], **kwargs
+) -> Callable[[np.ndarray], np.ndarray]:
+ """
+ Wrap OpenCV function to enable processing images with more than 4 channels.
+
+ Limitations:
+        This wrapper requires the image to be the first argument and the rest to be passed as named arguments.
+
+ Args:
+        process_fn: Transform function (e.g. cv2.resize).
+ kwargs: Additional parameters.
+
+    Returns:
+        Callable that applies ``process_fn`` to an image, splitting it into chunks of at most 4 channels when needed.
+
+ """
+
+ @wraps(process_fn)
+ def __process_fn(img: np.ndarray) -> np.ndarray:
+ num_channels = get_num_channels(img)
+ if num_channels > 4:
+ chunks = []
+ for index in range(0, num_channels, 4):
+ if num_channels - index == 2:
+ # Many OpenCV functions cannot work with 2-channel images
+ for i in range(2):
+ chunk = img[:, :, index + i : index + i + 1]
+ chunk = process_fn(chunk, **kwargs)
+ chunk = np.expand_dims(chunk, -1)
+ chunks.append(chunk)
+ else:
+ chunk = img[:, :, index : index + 4]
+ chunk = process_fn(chunk, **kwargs)
+ chunks.append(chunk)
+ img = np.dstack(chunks)
+ else:
+ img = process_fn(img, **kwargs)
+ return img
+
+ return __process_fn
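+
+
+# --- Illustrative usage sketch (not part of the upstream albumentations source) ---
+# Shows why `_maybe_process_in_chunks` exists: cv2.resize only handles up to 4 channels,
+# so a synthetic 6-channel image is resized in 4- and 2-channel chunks and stacked back.
+# (Run with `python -m custom_albumentations.augmentations.utils` so the package imports resolve.)
+if __name__ == "__main__":
+    multispectral = np.random.rand(32, 32, 6).astype(np.float32)  # 6-channel dummy image
+    resize_fn = _maybe_process_in_chunks(cv2.resize, dsize=(64, 64), interpolation=cv2.INTER_LINEAR)
+    resized = resize_fn(multispectral)
+    print(get_num_channels(resized), resized.shape)  # expected: 6 (64, 64, 6)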
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/core/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/core/bbox_utils.py b/comfyui_controlnet_aux/src/custom_albumentations/core/bbox_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f48d25b8c339636a6218e89fcda4a83de95828cb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/core/bbox_utils.py
@@ -0,0 +1,522 @@
+from __future__ import division
+
+from typing import Any, Dict, List, Optional, Sequence, Tuple, TypeVar, cast
+
+import numpy as np
+
+from .transforms_interface import BoxInternalType, BoxType
+from .utils import DataProcessor, Params
+
+__all__ = [
+ "normalize_bbox",
+ "denormalize_bbox",
+ "normalize_bboxes",
+ "denormalize_bboxes",
+ "calculate_bbox_area",
+ "filter_bboxes_by_visibility",
+ "convert_bbox_to_albumentations",
+ "convert_bbox_from_albumentations",
+ "convert_bboxes_to_albumentations",
+ "convert_bboxes_from_albumentations",
+ "check_bbox",
+ "check_bboxes",
+ "filter_bboxes",
+ "union_of_bboxes",
+ "BboxProcessor",
+ "BboxParams",
+]
+
+TBox = TypeVar("TBox", BoxType, BoxInternalType)
+
+
+class BboxParams(Params):
+ """
+ Parameters of bounding boxes
+
+ Args:
+ format (str): format of bounding boxes. Should be 'coco', 'pascal_voc', 'albumentations' or 'yolo'.
+
+ The `coco` format
+ `[x_min, y_min, width, height]`, e.g. [97, 12, 150, 200].
+ The `pascal_voc` format
+ `[x_min, y_min, x_max, y_max]`, e.g. [97, 12, 247, 212].
+ The `albumentations` format
+ is like `pascal_voc`, but normalized,
+ in other words: `[x_min, y_min, x_max, y_max]`, e.g. [0.2, 0.3, 0.4, 0.5].
+ The `yolo` format
+ `[x, y, width, height]`, e.g. [0.1, 0.2, 0.3, 0.4];
+ `x`, `y` - normalized bbox center; `width`, `height` - normalized bbox width and height.
+        label_fields (list): list of fields that are joined with boxes, e.g. labels.
+ Should be same type as boxes.
+ min_area (float): minimum area of a bounding box. All bounding boxes whose
+ visible area in pixels is less than this value will be removed. Default: 0.0.
+ min_visibility (float): minimum fraction of area for a bounding box
+            to remain in the list. Default: 0.0.
+ min_width (float): Minimum width of a bounding box. All bounding boxes whose width is
+ less than this value will be removed. Default: 0.0.
+ min_height (float): Minimum height of a bounding box. All bounding boxes whose height is
+ less than this value will be removed. Default: 0.0.
+ check_each_transform (bool): if `True`, then bboxes will be checked after each dual transform.
+ Default: `True`
+ """
+
+ def __init__(
+ self,
+ format: str,
+ label_fields: Optional[Sequence[str]] = None,
+ min_area: float = 0.0,
+ min_visibility: float = 0.0,
+ min_width: float = 0.0,
+ min_height: float = 0.0,
+ check_each_transform: bool = True,
+ ):
+ super(BboxParams, self).__init__(format, label_fields)
+ self.min_area = min_area
+ self.min_visibility = min_visibility
+ self.min_width = min_width
+ self.min_height = min_height
+ self.check_each_transform = check_each_transform
+
+ def _to_dict(self) -> Dict[str, Any]:
+ data = super(BboxParams, self)._to_dict()
+ data.update(
+ {
+ "min_area": self.min_area,
+ "min_visibility": self.min_visibility,
+ "min_width": self.min_width,
+ "min_height": self.min_height,
+ "check_each_transform": self.check_each_transform,
+ }
+ )
+ return data
+
+ @classmethod
+ def is_serializable(cls) -> bool:
+ return True
+
+ @classmethod
+ def get_class_fullname(cls) -> str:
+ return "BboxParams"
+
+
+class BboxProcessor(DataProcessor):
+ def __init__(self, params: BboxParams, additional_targets: Optional[Dict[str, str]] = None):
+ super().__init__(params, additional_targets)
+
+ @property
+ def default_data_name(self) -> str:
+ return "bboxes"
+
+ def ensure_data_valid(self, data: Dict[str, Any]) -> None:
+ for data_name in self.data_fields:
+ data_exists = data_name in data and len(data[data_name])
+ if data_exists and len(data[data_name][0]) < 5:
+ if self.params.label_fields is None:
+ raise ValueError(
+ "Please specify 'label_fields' in 'bbox_params' or add labels to the end of bbox "
+ "because bboxes must have labels"
+ )
+ if self.params.label_fields:
+ if not all(i in data.keys() for i in self.params.label_fields):
+                raise ValueError("Your 'label_fields' are not valid - they must have the same names as params in dict")
+
+ def filter(self, data: Sequence, rows: int, cols: int) -> List:
+ self.params: BboxParams
+ return filter_bboxes(
+ data,
+ rows,
+ cols,
+ min_area=self.params.min_area,
+ min_visibility=self.params.min_visibility,
+ min_width=self.params.min_width,
+ min_height=self.params.min_height,
+ )
+
+ def check(self, data: Sequence, rows: int, cols: int) -> None:
+ check_bboxes(data)
+
+ def convert_from_albumentations(self, data: Sequence, rows: int, cols: int) -> List[BoxType]:
+ return convert_bboxes_from_albumentations(data, self.params.format, rows, cols, check_validity=True)
+
+ def convert_to_albumentations(self, data: Sequence[BoxType], rows: int, cols: int) -> List[BoxType]:
+ return convert_bboxes_to_albumentations(data, self.params.format, rows, cols, check_validity=True)
+
+
+def normalize_bbox(bbox: TBox, rows: int, cols: int) -> TBox:
+ """Normalize coordinates of a bounding box. Divide x-coordinates by image width and y-coordinates
+ by image height.
+
+ Args:
+ bbox: Denormalized bounding box `(x_min, y_min, x_max, y_max)`.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ Normalized bounding box `(x_min, y_min, x_max, y_max)`.
+
+ Raises:
+ ValueError: If rows or cols is less or equal zero
+
+ """
+
+ if rows <= 0:
+ raise ValueError("Argument rows must be positive integer")
+ if cols <= 0:
+ raise ValueError("Argument cols must be positive integer")
+
+ tail: Tuple[Any, ...]
+ (x_min, y_min, x_max, y_max), tail = bbox[:4], tuple(bbox[4:])
+
+ x_min, x_max = x_min / cols, x_max / cols
+ y_min, y_max = y_min / rows, y_max / rows
+
+ return cast(BoxType, (x_min, y_min, x_max, y_max) + tail) # type: ignore
+
+
+def denormalize_bbox(bbox: TBox, rows: int, cols: int) -> TBox:
+ """Denormalize coordinates of a bounding box. Multiply x-coordinates by image width and y-coordinates
+ by image height. This is an inverse operation for :func:`~albumentations.augmentations.bbox.normalize_bbox`.
+
+ Args:
+ bbox: Normalized bounding box `(x_min, y_min, x_max, y_max)`.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ Denormalized bounding box `(x_min, y_min, x_max, y_max)`.
+
+ Raises:
+ ValueError: If rows or cols is less or equal zero
+
+ """
+ tail: Tuple[Any, ...]
+ (x_min, y_min, x_max, y_max), tail = bbox[:4], tuple(bbox[4:])
+
+ if rows <= 0:
+ raise ValueError("Argument rows must be positive integer")
+ if cols <= 0:
+ raise ValueError("Argument cols must be positive integer")
+
+ x_min, x_max = x_min * cols, x_max * cols
+ y_min, y_max = y_min * rows, y_max * rows
+
+ return cast(BoxType, (x_min, y_min, x_max, y_max) + tail) # type: ignore
+
+
+def normalize_bboxes(bboxes: Sequence[BoxType], rows: int, cols: int) -> List[BoxType]:
+ """Normalize a list of bounding boxes.
+
+ Args:
+ bboxes: Denormalized bounding boxes `[(x_min, y_min, x_max, y_max)]`.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ Normalized bounding boxes `[(x_min, y_min, x_max, y_max)]`.
+
+ """
+ return [normalize_bbox(bbox, rows, cols) for bbox in bboxes]
+
+
+def denormalize_bboxes(bboxes: Sequence[BoxType], rows: int, cols: int) -> List[BoxType]:
+ """Denormalize a list of bounding boxes.
+
+ Args:
+ bboxes: Normalized bounding boxes `[(x_min, y_min, x_max, y_max)]`.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ List: Denormalized bounding boxes `[(x_min, y_min, x_max, y_max)]`.
+
+ """
+ return [denormalize_bbox(bbox, rows, cols) for bbox in bboxes]
+
+
+def calculate_bbox_area(bbox: BoxType, rows: int, cols: int) -> float:
+ """Calculate the area of a bounding box in (fractional) pixels.
+
+ Args:
+ bbox: A bounding box `(x_min, y_min, x_max, y_max)`.
+ rows: Image height.
+ cols: Image width.
+
+ Return:
+ Area in (fractional) pixels of the (denormalized) bounding box.
+
+ """
+ bbox = denormalize_bbox(bbox, rows, cols)
+ x_min, y_min, x_max, y_max = bbox[:4]
+ area = (x_max - x_min) * (y_max - y_min)
+ return area
+
+
+def filter_bboxes_by_visibility(
+ original_shape: Sequence[int],
+ bboxes: Sequence[BoxType],
+ transformed_shape: Sequence[int],
+ transformed_bboxes: Sequence[BoxType],
+ threshold: float = 0.0,
+ min_area: float = 0.0,
+) -> List[BoxType]:
+ """Filter bounding boxes and return only those boxes whose visibility after transformation is above
+    the threshold and whose area in pixels is more than min_area.
+
+ Args:
+ original_shape: Original image shape `(height, width, ...)`.
+ bboxes: Original bounding boxes `[(x_min, y_min, x_max, y_max)]`.
+ transformed_shape: Transformed image shape `(height, width)`.
+ transformed_bboxes: Transformed bounding boxes `[(x_min, y_min, x_max, y_max)]`.
+ threshold: visibility threshold. Should be a value in the range [0.0, 1.0].
+ min_area: Minimal area threshold.
+
+ Returns:
+ Filtered bounding boxes `[(x_min, y_min, x_max, y_max)]`.
+
+ """
+ img_height, img_width = original_shape[:2]
+ transformed_img_height, transformed_img_width = transformed_shape[:2]
+
+ visible_bboxes = []
+ for bbox, transformed_bbox in zip(bboxes, transformed_bboxes):
+ if not all(0.0 <= value <= 1.0 for value in transformed_bbox[:4]):
+ continue
+ bbox_area = calculate_bbox_area(bbox, img_height, img_width)
+ transformed_bbox_area = calculate_bbox_area(transformed_bbox, transformed_img_height, transformed_img_width)
+ if transformed_bbox_area < min_area:
+ continue
+ visibility = transformed_bbox_area / bbox_area
+ if visibility >= threshold:
+ visible_bboxes.append(transformed_bbox)
+ return visible_bboxes
+
+
+def convert_bbox_to_albumentations(
+ bbox: BoxType, source_format: str, rows: int, cols: int, check_validity: bool = False
+) -> BoxType:
+ """Convert a bounding box from a format specified in `source_format` to the format used by albumentations:
+ normalized coordinates of top-left and bottom-right corners of the bounding box in a form of
+ `(x_min, y_min, x_max, y_max)` e.g. `(0.15, 0.27, 0.67, 0.5)`.
+
+ Args:
+ bbox: A bounding box tuple.
+ source_format: format of the bounding box. Should be 'coco', 'pascal_voc', or 'yolo'.
+ check_validity: Check if all boxes are valid boxes.
+ rows: Image height.
+ cols: Image width.
+
+ Returns:
+ tuple: A bounding box `(x_min, y_min, x_max, y_max)`.
+
+ Note:
+ The `coco` format of a bounding box looks like `(x_min, y_min, width, height)`, e.g. (97, 12, 150, 200).
+ The `pascal_voc` format of a bounding box looks like `(x_min, y_min, x_max, y_max)`, e.g. (97, 12, 247, 212).
+ The `yolo` format of a bounding box looks like `(x, y, width, height)`, e.g. (0.3, 0.1, 0.05, 0.07);
+        where `x`, `y` are the coordinates of the center of the box; all values are normalized by the image width and height.
+
+ Raises:
+        ValueError: if `source_format` is not `coco`, `pascal_voc` or `yolo`.
+        ValueError: if, in YOLO format, any coordinate is outside the range (0, 1].
+
+ """
+ if source_format not in {"coco", "pascal_voc", "yolo"}:
+ raise ValueError(
+ f"Unknown source_format {source_format}. Supported formats are: 'coco', 'pascal_voc' and 'yolo'"
+ )
+
+ if source_format == "coco":
+ (x_min, y_min, width, height), tail = bbox[:4], bbox[4:]
+ x_max = x_min + width
+ y_max = y_min + height
+ elif source_format == "yolo":
+ # https://github.com/pjreddie/darknet/blob/f6d861736038da22c9eb0739dca84003c5a5e275/scripts/voc_label.py#L12
+ _bbox = np.array(bbox[:4])
+ if check_validity and np.any((_bbox <= 0) | (_bbox > 1)):
+ raise ValueError("In YOLO format all coordinates must be float and in range (0, 1]")
+
+ (x, y, w, h), tail = bbox[:4], bbox[4:]
+
+ w_half, h_half = w / 2, h / 2
+ x_min = x - w_half
+ y_min = y - h_half
+ x_max = x_min + w
+ y_max = y_min + h
+ else:
+ (x_min, y_min, x_max, y_max), tail = bbox[:4], bbox[4:]
+
+ bbox = (x_min, y_min, x_max, y_max) + tuple(tail) # type: ignore
+
+ if source_format != "yolo":
+ bbox = normalize_bbox(bbox, rows, cols)
+ if check_validity:
+ check_bbox(bbox)
+ return bbox
+
+
+def convert_bbox_from_albumentations(
+ bbox: BoxType, target_format: str, rows: int, cols: int, check_validity: bool = False
+) -> BoxType:
+ """Convert a bounding box from the format used by albumentations to a format, specified in `target_format`.
+
+ Args:
+ bbox: An albumentations bounding box `(x_min, y_min, x_max, y_max)`.
+ target_format: required format of the output bounding box. Should be 'coco', 'pascal_voc' or 'yolo'.
+ rows: Image height.
+ cols: Image width.
+ check_validity: Check if all boxes are valid boxes.
+
+ Returns:
+ tuple: A bounding box.
+
+ Note:
+ The `coco` format of a bounding box looks like `[x_min, y_min, width, height]`, e.g. [97, 12, 150, 200].
+ The `pascal_voc` format of a bounding box looks like `[x_min, y_min, x_max, y_max]`, e.g. [97, 12, 247, 212].
+ The `yolo` format of a bounding box looks like `[x, y, width, height]`, e.g. [0.3, 0.1, 0.05, 0.07].
+
+ Raises:
+ ValueError: if `target_format` is not equal to `coco`, `pascal_voc` or `yolo`.
+
+ """
+ if target_format not in {"coco", "pascal_voc", "yolo"}:
+ raise ValueError(
+ f"Unknown target_format {target_format}. Supported formats are: 'coco', 'pascal_voc' and 'yolo'"
+ )
+ if check_validity:
+ check_bbox(bbox)
+
+ if target_format != "yolo":
+ bbox = denormalize_bbox(bbox, rows, cols)
+ if target_format == "coco":
+ (x_min, y_min, x_max, y_max), tail = bbox[:4], tuple(bbox[4:])
+ width = x_max - x_min
+ height = y_max - y_min
+ bbox = cast(BoxType, (x_min, y_min, width, height) + tail)
+ elif target_format == "yolo":
+ (x_min, y_min, x_max, y_max), tail = bbox[:4], bbox[4:]
+ x = (x_min + x_max) / 2.0
+ y = (y_min + y_max) / 2.0
+ w = x_max - x_min
+ h = y_max - y_min
+ bbox = cast(BoxType, (x, y, w, h) + tail)
+ return bbox
+
+
+def convert_bboxes_to_albumentations(
+ bboxes: Sequence[BoxType], source_format, rows, cols, check_validity=False
+) -> List[BoxType]:
+    """Convert a list of bounding boxes from a format specified in `source_format` to the format used by albumentations"""
+ return [convert_bbox_to_albumentations(bbox, source_format, rows, cols, check_validity) for bbox in bboxes]
+
+
+def convert_bboxes_from_albumentations(
+ bboxes: Sequence[BoxType], target_format: str, rows: int, cols: int, check_validity: bool = False
+) -> List[BoxType]:
+ """Convert a list of bounding boxes from the format used by albumentations to a format, specified
+ in `target_format`.
+
+ Args:
+ bboxes: List of albumentation bounding box `(x_min, y_min, x_max, y_max)`.
+ target_format: required format of the output bounding box. Should be 'coco', 'pascal_voc' or 'yolo'.
+ rows: Image height.
+ cols: Image width.
+ check_validity: Check if all boxes are valid boxes.
+
+ Returns:
+ List of bounding boxes.
+
+ """
+ return [convert_bbox_from_albumentations(bbox, target_format, rows, cols, check_validity) for bbox in bboxes]
+
+
+def check_bbox(bbox: BoxType) -> None:
+    """Check if bbox boundaries are in the range [0, 1] and minimums are less than maximums"""
+ for name, value in zip(["x_min", "y_min", "x_max", "y_max"], bbox[:4]):
+ if not 0 <= value <= 1 and not np.isclose(value, 0) and not np.isclose(value, 1):
+ raise ValueError(f"Expected {name} for bbox {bbox} to be in the range [0.0, 1.0], got {value}.")
+ x_min, y_min, x_max, y_max = bbox[:4]
+ if x_max <= x_min:
+ raise ValueError(f"x_max is less than or equal to x_min for bbox {bbox}.")
+ if y_max <= y_min:
+ raise ValueError(f"y_max is less than or equal to y_min for bbox {bbox}.")
+
+
+def check_bboxes(bboxes: Sequence[BoxType]) -> None:
+    """Check if bboxes boundaries are in the range [0, 1] and minimums are less than maximums"""
+ for bbox in bboxes:
+ check_bbox(bbox)
+
+
+def filter_bboxes(
+ bboxes: Sequence[BoxType],
+ rows: int,
+ cols: int,
+ min_area: float = 0.0,
+ min_visibility: float = 0.0,
+ min_width: float = 0.0,
+ min_height: float = 0.0,
+) -> List[BoxType]:
+    """Remove bounding boxes that either lie outside of the visible area by more than min_visibility
+    or whose area in pixels is under the threshold set by `min_area`. Also crops the boxes to the final image size.
+
+ Args:
+ bboxes: List of albumentation bounding box `(x_min, y_min, x_max, y_max)`.
+ rows: Image height.
+ cols: Image width.
+        min_area: Minimum area of a bounding box. All bounding boxes whose visible area in pixels
+            is less than this value will be removed. Default: 0.0.
+        min_visibility: Minimum fraction of area for a bounding box to remain in the list. Default: 0.0.
+ min_width: Minimum width of a bounding box. All bounding boxes whose width is
+ less than this value will be removed. Default: 0.0.
+ min_height: Minimum height of a bounding box. All bounding boxes whose height is
+ less than this value will be removed. Default: 0.0.
+
+ Returns:
+ List of bounding boxes.
+
+ """
+ resulting_boxes: List[BoxType] = []
+ for bbox in bboxes:
+ # Calculate areas of bounding box before and after clipping.
+ transformed_box_area = calculate_bbox_area(bbox, rows, cols)
+ bbox, tail = cast(BoxType, tuple(np.clip(bbox[:4], 0, 1.0))), tuple(bbox[4:])
+ clipped_box_area = calculate_bbox_area(bbox, rows, cols)
+
+ # Calculate width and height of the clipped bounding box.
+ x_min, y_min, x_max, y_max = denormalize_bbox(bbox, rows, cols)[:4]
+ clipped_width, clipped_height = x_max - x_min, y_max - y_min
+
+ if (
+ clipped_box_area != 0 # to ensure transformed_box_area!=0 and to handle min_area=0 or min_visibility=0
+ and clipped_box_area >= min_area
+ and clipped_box_area / transformed_box_area >= min_visibility
+ and clipped_width >= min_width
+ and clipped_height >= min_height
+ ):
+ resulting_boxes.append(cast(BoxType, bbox + tail))
+ return resulting_boxes
+
+
+def union_of_bboxes(height: int, width: int, bboxes: Sequence[BoxType], erosion_rate: float = 0.0) -> BoxType:
+ """Calculate union of bounding boxes.
+
+ Args:
+        height (int): Height of image or space.
+        width (int): Width of image or space.
+ bboxes (List[tuple]): List like bounding boxes. Format is `[(x_min, y_min, x_max, y_max)]`.
+        erosion_rate (float): How much each bounding box can be shrunk, useful for erosive cropping.
+            Set this in range [0, 1]. 0 will not be erosive at all; 1.0 can make any bbox lose its volume.
+
+ Returns:
+ tuple: A bounding box `(x_min, y_min, x_max, y_max)`.
+
+ """
+ x1, y1 = width, height
+ x2, y2 = 0, 0
+ for bbox in bboxes:
+ x_min, y_min, x_max, y_max = bbox[:4]
+ w, h = x_max - x_min, y_max - y_min
+ lim_x1, lim_y1 = x_min + erosion_rate * w, y_min + erosion_rate * h
+ lim_x2, lim_y2 = x_max - erosion_rate * w, y_max - erosion_rate * h
+ x1, y1 = np.min([x1, lim_x1]), np.min([y1, lim_y1])
+ x2, y2 = np.max([x2, lim_x2]), np.max([y2, lim_y2])
+ return x1, y1, x2, y2
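+
+
+# --- Illustrative usage sketch (not part of the upstream albumentations source) ---
+# Demonstrates the format conversions documented in `BboxParams`: a COCO box
+# `[x_min, y_min, width, height]` is converted to the normalized albumentations format
+# and then to `pascal_voc`. The image size, box and label are arbitrary.
+# (Run with `python -m custom_albumentations.core.bbox_utils` so the relative imports resolve.)
+if __name__ == "__main__":
+    rows, cols = 480, 640  # image height and width
+    coco_box = (97, 12, 150, 200, "dog")  # x_min, y_min, width, height + label tail
+
+    alb_box = convert_bbox_to_albumentations(coco_box, "coco", rows, cols, check_validity=True)
+    voc_box = convert_bbox_from_albumentations(alb_box, "pascal_voc", rows, cols)
+
+    print(alb_box)  # normalized (x_min, y_min, x_max, y_max, "dog")
+    print(voc_box)  # approximately (97.0, 12.0, 247.0, 212.0, "dog")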
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/core/composition.py b/comfyui_controlnet_aux/src/custom_albumentations/core/composition.py
new file mode 100644
index 0000000000000000000000000000000000000000..c12f8d2d02d8b43811f5c62053277a609d1b031e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/core/composition.py
@@ -0,0 +1,552 @@
+from __future__ import division
+
+import random
+import typing
+import warnings
+from collections import defaultdict
+
+import numpy as np
+
+from .. import random_utils
+from .bbox_utils import BboxParams, BboxProcessor
+from .keypoints_utils import KeypointParams, KeypointsProcessor
+from .serialization import (
+ SERIALIZABLE_REGISTRY,
+ Serializable,
+ get_shortest_class_fullname,
+ instantiate_nonserializable,
+)
+from .transforms_interface import BasicTransform
+from .utils import format_args, get_shape
+
+__all__ = [
+ "BaseCompose",
+ "Compose",
+ "SomeOf",
+ "OneOf",
+ "OneOrOther",
+ "BboxParams",
+ "KeypointParams",
+ "ReplayCompose",
+ "Sequential",
+]
+
+
+REPR_INDENT_STEP = 2
+TransformType = typing.Union[BasicTransform, "BaseCompose"]
+TransformsSeqType = typing.Sequence[TransformType]
+
+
+def get_always_apply(transforms: typing.Union["BaseCompose", TransformsSeqType]) -> TransformsSeqType:
+ new_transforms: typing.List[TransformType] = []
+ for transform in transforms: # type: ignore
+ if isinstance(transform, BaseCompose):
+ new_transforms.extend(get_always_apply(transform))
+ elif transform.always_apply:
+ new_transforms.append(transform)
+ return new_transforms
+
+
+class BaseCompose(Serializable):
+ def __init__(self, transforms: TransformsSeqType, p: float):
+ if isinstance(transforms, (BaseCompose, BasicTransform)):
+ warnings.warn(
+                "transforms is a single transform, but a sequence is expected! The transform will be wrapped into a list."
+ )
+ transforms = [transforms]
+
+ self.transforms = transforms
+ self.p = p
+
+ self.replay_mode = False
+ self.applied_in_replay = False
+
+ def __len__(self) -> int:
+ return len(self.transforms)
+
+ def __call__(self, *args, **data) -> typing.Dict[str, typing.Any]:
+ raise NotImplementedError
+
+ def __getitem__(self, item: int) -> TransformType: # type: ignore
+ return self.transforms[item]
+
+ def __repr__(self) -> str:
+ return self.indented_repr()
+
+ def indented_repr(self, indent: int = REPR_INDENT_STEP) -> str:
+ args = {k: v for k, v in self._to_dict().items() if not (k.startswith("__") or k == "transforms")}
+ repr_string = self.__class__.__name__ + "(["
+ for t in self.transforms:
+ repr_string += "\n"
+ if hasattr(t, "indented_repr"):
+ t_repr = t.indented_repr(indent + REPR_INDENT_STEP) # type: ignore
+ else:
+ t_repr = repr(t)
+ repr_string += " " * indent + t_repr + ","
+ repr_string += "\n" + " " * (indent - REPR_INDENT_STEP) + "], {args})".format(args=format_args(args))
+ return repr_string
+
+ @classmethod
+ def get_class_fullname(cls) -> str:
+ return get_shortest_class_fullname(cls)
+
+ @classmethod
+ def is_serializable(cls) -> bool:
+ return True
+
+ def _to_dict(self) -> typing.Dict[str, typing.Any]:
+ return {
+ "__class_fullname__": self.get_class_fullname(),
+ "p": self.p,
+ "transforms": [t._to_dict() for t in self.transforms], # skipcq: PYL-W0212
+ }
+
+ def get_dict_with_id(self) -> typing.Dict[str, typing.Any]:
+ return {
+ "__class_fullname__": self.get_class_fullname(),
+ "id": id(self),
+ "params": None,
+ "transforms": [t.get_dict_with_id() for t in self.transforms],
+ }
+
+ def add_targets(self, additional_targets: typing.Optional[typing.Dict[str, str]]) -> None:
+ if additional_targets:
+ for t in self.transforms:
+ t.add_targets(additional_targets)
+
+ def set_deterministic(self, flag: bool, save_key: str = "replay") -> None:
+ for t in self.transforms:
+ t.set_deterministic(flag, save_key)
+
+
+class Compose(BaseCompose):
+ """Compose transforms and handle all transformations regarding bounding boxes
+
+ Args:
+ transforms (list): list of transformations to compose.
+ bbox_params (BboxParams): Parameters for bounding boxes transforms
+ keypoint_params (KeypointParams): Parameters for keypoints transforms
+ additional_targets (dict): Dict with keys - new target name, values - old target name. ex: {'image2': 'image'}
+ p (float): probability of applying all list of transforms. Default: 1.0.
+        is_check_shapes (bool): If True, shape consistency of image/mask/masks is checked on each call. Pass False
+            to disable this check (do it only if you are sure about your data consistency).
+ """
+
+ def __init__(
+ self,
+ transforms: TransformsSeqType,
+ bbox_params: typing.Optional[typing.Union[dict, "BboxParams"]] = None,
+ keypoint_params: typing.Optional[typing.Union[dict, "KeypointParams"]] = None,
+ additional_targets: typing.Optional[typing.Dict[str, str]] = None,
+ p: float = 1.0,
+ is_check_shapes: bool = True,
+ ):
+ super(Compose, self).__init__(transforms, p)
+
+ self.processors: typing.Dict[str, typing.Union[BboxProcessor, KeypointsProcessor]] = {}
+ if bbox_params:
+ if isinstance(bbox_params, dict):
+ b_params = BboxParams(**bbox_params)
+ elif isinstance(bbox_params, BboxParams):
+ b_params = bbox_params
+ else:
+ raise ValueError("unknown format of bbox_params, please use `dict` or `BboxParams`")
+ self.processors["bboxes"] = BboxProcessor(b_params, additional_targets)
+
+ if keypoint_params:
+ if isinstance(keypoint_params, dict):
+ k_params = KeypointParams(**keypoint_params)
+ elif isinstance(keypoint_params, KeypointParams):
+ k_params = keypoint_params
+ else:
+ raise ValueError("unknown format of keypoint_params, please use `dict` or `KeypointParams`")
+ self.processors["keypoints"] = KeypointsProcessor(k_params, additional_targets)
+
+ if additional_targets is None:
+ additional_targets = {}
+
+ self.additional_targets = additional_targets
+
+ for proc in self.processors.values():
+ proc.ensure_transforms_valid(self.transforms)
+
+ self.add_targets(additional_targets)
+
+ self.is_check_args = True
+ self._disable_check_args_for_transforms(self.transforms)
+
+ self.is_check_shapes = is_check_shapes
+
+ @staticmethod
+ def _disable_check_args_for_transforms(transforms: TransformsSeqType) -> None:
+ for transform in transforms:
+ if isinstance(transform, BaseCompose):
+ Compose._disable_check_args_for_transforms(transform.transforms)
+ if isinstance(transform, Compose):
+ transform._disable_check_args()
+
+ def _disable_check_args(self) -> None:
+ self.is_check_args = False
+
+ def __call__(self, *args, force_apply: bool = False, **data) -> typing.Dict[str, typing.Any]:
+ if args:
+ raise KeyError("You have to pass data to augmentations as named arguments, for example: aug(image=image)")
+ if self.is_check_args:
+ self._check_args(**data)
+ assert isinstance(force_apply, (bool, int)), "force_apply must have bool or int type"
+ need_to_run = force_apply or random.random() < self.p
+ for p in self.processors.values():
+ p.ensure_data_valid(data)
+ transforms = self.transforms if need_to_run else get_always_apply(self.transforms)
+
+ check_each_transform = any(
+ getattr(item.params, "check_each_transform", False) for item in self.processors.values()
+ )
+
+ for p in self.processors.values():
+ p.preprocess(data)
+
+        for idx, t in enumerate(transforms):
+            data = t(**data)
+
+            if check_each_transform:
+                data = self._check_data_post_transform(data)
+        data = Compose._make_targets_contiguous(data)  # ensure the output targets are contiguous arrays
+
+ for p in self.processors.values():
+ p.postprocess(data)
+
+ return data
+
+ def _check_data_post_transform(self, data: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]:
+ rows, cols = get_shape(data["image"])
+
+ for p in self.processors.values():
+ if not getattr(p.params, "check_each_transform", False):
+ continue
+
+ for data_name in p.data_fields:
+ data[data_name] = p.filter(data[data_name], rows, cols)
+ return data
+
+ def _to_dict(self) -> typing.Dict[str, typing.Any]:
+ dictionary = super(Compose, self)._to_dict()
+ bbox_processor = self.processors.get("bboxes")
+ keypoints_processor = self.processors.get("keypoints")
+ dictionary.update(
+ {
+ "bbox_params": bbox_processor.params._to_dict() if bbox_processor else None, # skipcq: PYL-W0212
+ "keypoint_params": keypoints_processor.params._to_dict() # skipcq: PYL-W0212
+ if keypoints_processor
+ else None,
+ "additional_targets": self.additional_targets,
+ "is_check_shapes": self.is_check_shapes,
+ }
+ )
+ return dictionary
+
+ def get_dict_with_id(self) -> typing.Dict[str, typing.Any]:
+ dictionary = super().get_dict_with_id()
+ bbox_processor = self.processors.get("bboxes")
+ keypoints_processor = self.processors.get("keypoints")
+ dictionary.update(
+ {
+ "bbox_params": bbox_processor.params._to_dict() if bbox_processor else None, # skipcq: PYL-W0212
+ "keypoint_params": keypoints_processor.params._to_dict() # skipcq: PYL-W0212
+ if keypoints_processor
+ else None,
+ "additional_targets": self.additional_targets,
+ "params": None,
+ "is_check_shapes": self.is_check_shapes,
+ }
+ )
+ return dictionary
+
+ def _check_args(self, **kwargs) -> None:
+ checked_single = ["image", "mask"]
+ checked_multi = ["masks"]
+ check_bbox_param = ["bboxes"]
+ # ["bboxes", "keypoints"] could be almost any type, no need to check them
+ shapes = []
+ for data_name, data in kwargs.items():
+ internal_data_name = self.additional_targets.get(data_name, data_name)
+ if internal_data_name in checked_single:
+ if not isinstance(data, np.ndarray):
+ raise TypeError("{} must be numpy array type".format(data_name))
+ shapes.append(data.shape[:2])
+ if internal_data_name in checked_multi:
+ if data is not None and len(data):
+ if not isinstance(data[0], np.ndarray):
+ raise TypeError("{} must be list of numpy arrays".format(data_name))
+ shapes.append(data[0].shape[:2])
+ if internal_data_name in check_bbox_param and self.processors.get("bboxes") is None:
+ raise ValueError("bbox_params must be specified for bbox transformations")
+
+ if self.is_check_shapes and shapes and shapes.count(shapes[0]) != len(shapes):
+ raise ValueError(
+ "Height and Width of image, mask or masks should be equal. You can disable shapes check "
+ "by setting a parameter is_check_shapes=False of Compose class (do it only if you are sure "
+ "about your data consistency)."
+ )
+
+ @staticmethod
+ def _make_targets_contiguous(data: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]:
+ result = {}
+ for key, value in data.items():
+ if isinstance(value, np.ndarray):
+ value = np.ascontiguousarray(value)
+ result[key] = value
+ return result
+
+
+class OneOf(BaseCompose):
+ """Select one of transforms to apply. Selected transform will be called with `force_apply=True`.
+    Transform probabilities will be normalized to sum to 1, so in this case they act as weights.
+
+ Args:
+ transforms (list): list of transformations to compose.
+ p (float): probability of applying selected transform. Default: 0.5.
+ """
+
+ def __init__(self, transforms: TransformsSeqType, p: float = 0.5):
+ super(OneOf, self).__init__(transforms, p)
+ transforms_ps = [t.p for t in self.transforms]
+ s = sum(transforms_ps)
+ self.transforms_ps = [t / s for t in transforms_ps]
+
+ def __call__(self, *args, force_apply: bool = False, **data) -> typing.Dict[str, typing.Any]:
+ if self.replay_mode:
+ for t in self.transforms:
+ data = t(**data)
+ return data
+
+ if self.transforms_ps and (force_apply or random.random() < self.p):
+ idx: int = random_utils.choice(len(self.transforms), p=self.transforms_ps)
+ t = self.transforms[idx]
+ data = t(force_apply=True, **data)
+ return data
+
+
+class SomeOf(BaseCompose):
+ """Select N transforms to apply. Selected transforms will be called with `force_apply=True`.
+    Transform probabilities will be normalized to sum to 1, so in this case they act as weights.
+
+ Args:
+ transforms (list): list of transformations to compose.
+ n (int): number of transforms to apply.
+ replace (bool): Whether the sampled transforms are with or without replacement. Default: True.
+ p (float): probability of applying selected transform. Default: 1.
+ """
+
+ def __init__(self, transforms: TransformsSeqType, n: int, replace: bool = True, p: float = 1):
+ super(SomeOf, self).__init__(transforms, p)
+ self.n = n
+ self.replace = replace
+ transforms_ps = [t.p for t in self.transforms]
+ s = sum(transforms_ps)
+ self.transforms_ps = [t / s for t in transforms_ps]
+
+ def __call__(self, *args, force_apply: bool = False, **data) -> typing.Dict[str, typing.Any]:
+ if self.replay_mode:
+ for t in self.transforms:
+ data = t(**data)
+ return data
+
+ if self.transforms_ps and (force_apply or random.random() < self.p):
+ idx = random_utils.choice(len(self.transforms), size=self.n, replace=self.replace, p=self.transforms_ps)
+ for i in idx: # type: ignore
+ t = self.transforms[i]
+ data = t(force_apply=True, **data)
+ return data
+
+ def _to_dict(self) -> typing.Dict[str, typing.Any]:
+ dictionary = super(SomeOf, self)._to_dict()
+ dictionary.update({"n": self.n, "replace": self.replace})
+ return dictionary
+
+
+class OneOrOther(BaseCompose):
+ """Select one or another transform to apply. Selected transform will be called with `force_apply=True`."""
+
+ def __init__(
+ self,
+ first: typing.Optional[TransformType] = None,
+ second: typing.Optional[TransformType] = None,
+ transforms: typing.Optional[TransformsSeqType] = None,
+ p: float = 0.5,
+ ):
+ if transforms is None:
+ if first is None or second is None:
+ raise ValueError("You must set both first and second or set transforms argument.")
+ transforms = [first, second]
+ super(OneOrOther, self).__init__(transforms, p)
+ if len(self.transforms) != 2:
+ warnings.warn("Length of transforms is not equal to 2.")
+
+ def __call__(self, *args, force_apply: bool = False, **data) -> typing.Dict[str, typing.Any]:
+ if self.replay_mode:
+ for t in self.transforms:
+ data = t(**data)
+ return data
+
+ if random.random() < self.p:
+ return self.transforms[0](force_apply=True, **data)
+
+ return self.transforms[-1](force_apply=True, **data)
+
+
+class PerChannel(BaseCompose):
+ """Apply transformations per-channel
+
+ Args:
+ transforms (list): list of transformations to compose.
+ channels (sequence): channels to apply the transform to. Pass None to apply to all.
+ Default: None (apply to all)
+ p (float): probability of applying the transform. Default: 0.5.
+ """
+
+ def __init__(
+ self, transforms: TransformsSeqType, channels: typing.Optional[typing.Sequence[int]] = None, p: float = 0.5
+ ):
+ super(PerChannel, self).__init__(transforms, p)
+ self.channels = channels
+
+ def __call__(self, *args, force_apply: bool = False, **data) -> typing.Dict[str, typing.Any]:
+ if force_apply or random.random() < self.p:
+ image = data["image"]
+
+ # Expand mono images to have a single channel
+ if len(image.shape) == 2:
+ image = np.expand_dims(image, -1)
+
+ if self.channels is None:
+ self.channels = range(image.shape[2])
+
+ for c in self.channels:
+ for t in self.transforms:
+ image[:, :, c] = t(image=image[:, :, c])["image"]
+
+ data["image"] = image
+
+ return data
+
+
+class ReplayCompose(Compose):
+ def __init__(
+ self,
+ transforms: TransformsSeqType,
+ bbox_params: typing.Optional[typing.Union[dict, "BboxParams"]] = None,
+ keypoint_params: typing.Optional[typing.Union[dict, "KeypointParams"]] = None,
+ additional_targets: typing.Optional[typing.Dict[str, str]] = None,
+ p: float = 1.0,
+ is_check_shapes: bool = True,
+ save_key: str = "replay",
+ ):
+ super(ReplayCompose, self).__init__(
+ transforms, bbox_params, keypoint_params, additional_targets, p, is_check_shapes
+ )
+ self.set_deterministic(True, save_key=save_key)
+ self.save_key = save_key
+
+ def __call__(self, *args, force_apply: bool = False, **kwargs) -> typing.Dict[str, typing.Any]:
+ kwargs[self.save_key] = defaultdict(dict)
+ result = super(ReplayCompose, self).__call__(force_apply=force_apply, **kwargs)
+ serialized = self.get_dict_with_id()
+ self.fill_with_params(serialized, result[self.save_key])
+ self.fill_applied(serialized)
+ result[self.save_key] = serialized
+ return result
+
+ @staticmethod
+ def replay(saved_augmentations: typing.Dict[str, typing.Any], **kwargs) -> typing.Dict[str, typing.Any]:
+ augs = ReplayCompose._restore_for_replay(saved_augmentations)
+ return augs(force_apply=True, **kwargs)
+
+ @staticmethod
+ def _restore_for_replay(
+ transform_dict: typing.Dict[str, typing.Any], lambda_transforms: typing.Optional[dict] = None
+ ) -> TransformType:
+ """
+ Args:
+            lambda_transforms (dict): A dictionary that contains lambda transforms, that is,
+                instances of the Lambda class.
+ This dictionary is required when you are restoring a pipeline that contains lambda transforms. Keys
+ in that dictionary should be named same as `name` arguments in respective lambda transforms from
+ a serialized pipeline.
+ """
+ applied = transform_dict["applied"]
+ params = transform_dict["params"]
+ lmbd = instantiate_nonserializable(transform_dict, lambda_transforms)
+ if lmbd:
+ transform = lmbd
+ else:
+ name = transform_dict["__class_fullname__"]
+ args = {k: v for k, v in transform_dict.items() if k not in ["__class_fullname__", "applied", "params"]}
+ cls = SERIALIZABLE_REGISTRY[name]
+ if "transforms" in args:
+ args["transforms"] = [
+ ReplayCompose._restore_for_replay(t, lambda_transforms=lambda_transforms)
+ for t in args["transforms"]
+ ]
+ transform = cls(**args)
+
+ transform = typing.cast(BasicTransform, transform)
+ if isinstance(transform, BasicTransform):
+ transform.params = params
+ transform.replay_mode = True
+ transform.applied_in_replay = applied
+ return transform
+
+ def fill_with_params(self, serialized: dict, all_params: dict) -> None:
+ params = all_params.get(serialized.get("id"))
+ serialized["params"] = params
+ del serialized["id"]
+ for transform in serialized.get("transforms", []):
+ self.fill_with_params(transform, all_params)
+
+ def fill_applied(self, serialized: typing.Dict[str, typing.Any]) -> bool:
+ if "transforms" in serialized:
+ applied = [self.fill_applied(t) for t in serialized["transforms"]]
+ serialized["applied"] = any(applied)
+ else:
+ serialized["applied"] = serialized.get("params") is not None
+ return serialized["applied"]
+
+ def _to_dict(self) -> typing.Dict[str, typing.Any]:
+ dictionary = super(ReplayCompose, self)._to_dict()
+ dictionary.update({"save_key": self.save_key})
+ return dictionary
+
+
+class Sequential(BaseCompose):
+ """Sequentially applies all transforms to targets.
+
+ Note:
+ This transform is not intended to be a replacement for `Compose`. Instead, it should be used inside `Compose`
+ the same way `OneOf` or `OneOrOther` are used. For instance, you can combine `OneOf` with `Sequential` to
+        create an augmentation pipeline that contains multiple sequences of augmentations and applies one randomly
+        chosen sequence to the input data (see the `Example` section for an example definition of such a pipeline).
+
+ Example:
+        >>> import custom_albumentations as A
+ >>> transform = A.Compose([
+ >>> A.OneOf([
+ >>> A.Sequential([
+ >>> A.HorizontalFlip(p=0.5),
+ >>> A.ShiftScaleRotate(p=0.5),
+ >>> ]),
+ >>> A.Sequential([
+ >>> A.VerticalFlip(p=0.5),
+ >>> A.RandomBrightnessContrast(p=0.5),
+ >>> ]),
+ >>> ], p=1)
+ >>> ])
+ """
+
+ def __init__(self, transforms: TransformsSeqType, p: float = 0.5):
+ super().__init__(transforms, p)
+
+ def __call__(self, *args, **data) -> typing.Dict[str, typing.Any]:
+ for t in self.transforms:
+ data = t(**data)
+ return data
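+
+
+# --- Illustrative usage sketch (not part of the upstream albumentations source) ---
+# Shows how `Compose` drives the bbox bookkeeping configured via `BboxParams`: even with an
+# empty transform list, boxes are normalized on preprocess, filtered by `min_area`, and
+# converted back on postprocess. The image, boxes and labels below are synthetic.
+# (Run with `python -m custom_albumentations.core.composition` so the relative imports resolve.)
+if __name__ == "__main__":
+    pipeline = Compose(
+        [],  # no transforms; only the bbox pre/post-processing is exercised
+        bbox_params=BboxParams(format="pascal_voc", label_fields=["labels"], min_area=16),
+    )
+    out = pipeline(
+        image=np.zeros((100, 100, 3), dtype=np.uint8),
+        bboxes=[(10, 10, 50, 50), (2, 2, 3, 3)],  # the second box has area 1, below min_area
+        labels=["cat", "dog"],
+    )
+    print(out["bboxes"], out["labels"])  # the tiny box and its label are filtered out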
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/core/keypoints_utils.py b/comfyui_controlnet_aux/src/custom_albumentations/core/keypoints_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5308b062398a0a0314e26d198bf77f3b9416443
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/core/keypoints_utils.py
@@ -0,0 +1,286 @@
+from __future__ import division
+
+import math
+import typing
+import warnings
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+from .utils import DataProcessor, Params
+
+__all__ = [
+ "angle_to_2pi_range",
+ "check_keypoints",
+ "convert_keypoints_from_albumentations",
+ "convert_keypoints_to_albumentations",
+ "filter_keypoints",
+ "KeypointsProcessor",
+ "KeypointParams",
+]
+
+keypoint_formats = {"xy", "yx", "xya", "xys", "xyas", "xysa"}
+
+
+def angle_to_2pi_range(angle: float) -> float:
+ two_pi = 2 * math.pi
+ return angle % two_pi
+
+
+class KeypointParams(Params):
+ """
+ Parameters of keypoints
+
+ Args:
+ format (str): format of keypoints. Should be 'xy', 'yx', 'xya', 'xys', 'xyas', 'xysa'.
+
+ x - X coordinate,
+
+ y - Y coordinate
+
+ s - Keypoint scale
+
+ a - Keypoint orientation in radians or degrees (depending on KeypointParams.angle_in_degrees)
+        label_fields (list): list of fields that are joined with keypoints, e.g. labels.
+            Should be same type as keypoints.
+        remove_invisible (bool): whether to remove invisible points after the transform
+ angle_in_degrees (bool): angle in degrees or radians in 'xya', 'xyas', 'xysa' keypoints
+ check_each_transform (bool): if `True`, then keypoints will be checked after each dual transform.
+ Default: `True`
+ """
+
+ def __init__(
+ self,
+ format: str, # skipcq: PYL-W0622
+ label_fields: Optional[Sequence[str]] = None,
+ remove_invisible: bool = True,
+ angle_in_degrees: bool = True,
+ check_each_transform: bool = True,
+ ):
+ super(KeypointParams, self).__init__(format, label_fields)
+ self.remove_invisible = remove_invisible
+ self.angle_in_degrees = angle_in_degrees
+ self.check_each_transform = check_each_transform
+
+ def _to_dict(self) -> Dict[str, Any]:
+ data = super(KeypointParams, self)._to_dict()
+ data.update(
+ {
+ "remove_invisible": self.remove_invisible,
+ "angle_in_degrees": self.angle_in_degrees,
+ "check_each_transform": self.check_each_transform,
+ }
+ )
+ return data
+
+ @classmethod
+ def is_serializable(cls) -> bool:
+ return True
+
+ @classmethod
+ def get_class_fullname(cls) -> str:
+ return "KeypointParams"
+
+
+class KeypointsProcessor(DataProcessor):
+ def __init__(self, params: KeypointParams, additional_targets: Optional[Dict[str, str]] = None):
+ super().__init__(params, additional_targets)
+
+ @property
+ def default_data_name(self) -> str:
+ return "keypoints"
+
+ def ensure_data_valid(self, data: Dict[str, Any]) -> None:
+ if self.params.label_fields:
+ if not all(i in data.keys() for i in self.params.label_fields):
+ raise ValueError(
+                    "Your 'label_fields' are not valid - they must have the same names as params in "
+                    "the 'keypoint_params' dict"
+ )
+
+ def ensure_transforms_valid(self, transforms: Sequence[object]) -> None:
+        # IAA-based augmentations support only transformation of xy keypoints.
+        # If the keypoint format is anything other than 'xy', we emit a warning so the user
+        # knows that angle and size will not be modified.
+
+ try:
+ from custom_albumentations.imgaug.transforms import DualIAATransform
+ except ImportError:
+ # imgaug is not installed so we skip imgaug checks.
+ return
+
+ if self.params.format is not None and self.params.format != "xy":
+ for transform in transforms:
+ if isinstance(transform, DualIAATransform):
+ warnings.warn(
+ "{} transformation supports only 'xy' keypoints "
+ "augmentation. You have '{}' keypoints format. Scale "
+ "and angle WILL NOT BE transformed.".format(transform.__class__.__name__, self.params.format)
+ )
+ break
+
+ def filter(self, data: Sequence[Sequence], rows: int, cols: int) -> Sequence[Sequence]:
+ self.params: KeypointParams
+ return filter_keypoints(data, rows, cols, remove_invisible=self.params.remove_invisible)
+
+ def check(self, data: Sequence[Sequence], rows: int, cols: int) -> None:
+ check_keypoints(data, rows, cols)
+
+ def convert_from_albumentations(self, data: Sequence[Sequence], rows: int, cols: int) -> List[Tuple]:
+ params = self.params
+ return convert_keypoints_from_albumentations(
+ data,
+ params.format,
+ rows,
+ cols,
+ check_validity=params.remove_invisible,
+ angle_in_degrees=params.angle_in_degrees,
+ )
+
+ def convert_to_albumentations(self, data: Sequence[Sequence], rows: int, cols: int) -> List[Tuple]:
+ params = self.params
+ return convert_keypoints_to_albumentations(
+ data,
+ params.format,
+ rows,
+ cols,
+ check_validity=params.remove_invisible,
+ angle_in_degrees=params.angle_in_degrees,
+ )
+
+
+def check_keypoint(kp: Sequence, rows: int, cols: int) -> None:
+    """Check that keypoint x and y coordinates lie inside the image and that the angle is in [0, 2 * pi)."""
+ for name, value, size in zip(["x", "y"], kp[:2], [cols, rows]):
+ if not 0 <= value < size:
+ raise ValueError(
+                "Expected {name} for keypoint {kp} "
+                "to be in the range [0, {size}), got {value}.".format(kp=kp, name=name, value=value, size=size)
+ )
+
+ angle = kp[2]
+ if not (0 <= angle < 2 * math.pi):
+ raise ValueError("Keypoint angle must be in range [0, 2 * PI). Got: {angle}".format(angle=angle))
+
+
+def check_keypoints(keypoints: Sequence[Sequence], rows: int, cols: int) -> None:
+    """Check that all keypoints lie inside the image boundaries"""
+ for kp in keypoints:
+ check_keypoint(kp, rows, cols)
+
+
+def filter_keypoints(keypoints: Sequence[Sequence], rows: int, cols: int, remove_invisible: bool) -> Sequence[Sequence]:
+ if not remove_invisible:
+ return keypoints
+
+ resulting_keypoints = []
+ for kp in keypoints:
+ x, y = kp[:2]
+ if x < 0 or x >= cols:
+ continue
+ if y < 0 or y >= rows:
+ continue
+ resulting_keypoints.append(kp)
+ return resulting_keypoints
+
+
+def convert_keypoint_to_albumentations(
+ keypoint: Sequence,
+ source_format: str,
+ rows: int,
+ cols: int,
+ check_validity: bool = False,
+ angle_in_degrees: bool = True,
+) -> Tuple:
+ if source_format not in keypoint_formats:
+        raise ValueError("Unknown source_format {}. Supported formats are: {}".format(source_format, keypoint_formats))
+
+ if source_format == "xy":
+ (x, y), tail = keypoint[:2], tuple(keypoint[2:])
+ a, s = 0.0, 0.0
+ elif source_format == "yx":
+ (y, x), tail = keypoint[:2], tuple(keypoint[2:])
+ a, s = 0.0, 0.0
+ elif source_format == "xya":
+ (x, y, a), tail = keypoint[:3], tuple(keypoint[3:])
+ s = 0.0
+ elif source_format == "xys":
+ (x, y, s), tail = keypoint[:3], tuple(keypoint[3:])
+ a = 0.0
+ elif source_format == "xyas":
+ (x, y, a, s), tail = keypoint[:4], tuple(keypoint[4:])
+ elif source_format == "xysa":
+ (x, y, s, a), tail = keypoint[:4], tuple(keypoint[4:])
+ else:
+ raise ValueError(f"Unsupported source format. Got {source_format}")
+
+ if angle_in_degrees:
+ a = math.radians(a)
+
+ keypoint = (x, y, angle_to_2pi_range(a), s) + tail
+ if check_validity:
+ check_keypoint(keypoint, rows, cols)
+ return keypoint
+
+
+def convert_keypoint_from_albumentations(
+ keypoint: Sequence,
+ target_format: str,
+ rows: int,
+ cols: int,
+ check_validity: bool = False,
+ angle_in_degrees: bool = True,
+) -> Tuple:
+ if target_format not in keypoint_formats:
+ raise ValueError("Unknown target_format {}. Supported formats are: {}".format(target_format, keypoint_formats))
+
+ (x, y, angle, scale), tail = keypoint[:4], tuple(keypoint[4:])
+ angle = angle_to_2pi_range(angle)
+ if check_validity:
+ check_keypoint((x, y, angle, scale), rows, cols)
+ if angle_in_degrees:
+ angle = math.degrees(angle)
+
+ kp: Tuple
+ if target_format == "xy":
+ kp = (x, y)
+ elif target_format == "yx":
+ kp = (y, x)
+ elif target_format == "xya":
+ kp = (x, y, angle)
+ elif target_format == "xys":
+ kp = (x, y, scale)
+ elif target_format == "xyas":
+ kp = (x, y, angle, scale)
+ elif target_format == "xysa":
+ kp = (x, y, scale, angle)
+ else:
+ raise ValueError(f"Invalid target format. Got: {target_format}")
+
+ return kp + tail
+
+
+def convert_keypoints_to_albumentations(
+ keypoints: Sequence[Sequence],
+ source_format: str,
+ rows: int,
+ cols: int,
+ check_validity: bool = False,
+ angle_in_degrees: bool = True,
+) -> List[Tuple]:
+ return [
+ convert_keypoint_to_albumentations(kp, source_format, rows, cols, check_validity, angle_in_degrees)
+ for kp in keypoints
+ ]
+
+
+def convert_keypoints_from_albumentations(
+ keypoints: Sequence[Sequence],
+ target_format: str,
+ rows: int,
+ cols: int,
+ check_validity: bool = False,
+ angle_in_degrees: bool = True,
+) -> List[Tuple]:
+ return [
+ convert_keypoint_from_albumentations(kp, target_format, rows, cols, check_validity, angle_in_degrees)
+ for kp in keypoints
+ ]
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/core/serialization.py b/comfyui_controlnet_aux/src/custom_albumentations/core/serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e7a127e8805d11a93297b2d75442f48d0221347
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/core/serialization.py
@@ -0,0 +1,247 @@
+from __future__ import absolute_import
+
+import json
+import typing
+import warnings
+from abc import ABC, ABCMeta, abstractmethod
+from typing import IO, Any, Callable, Dict, Optional, Tuple, Type, Union
+
+try:
+ import yaml
+
+ yaml_available = True
+except ImportError:
+ yaml_available = False
+
+
+from custom_albumentations import __version__
+
+__all__ = ["to_dict", "from_dict", "save", "load"]
+
+
+SERIALIZABLE_REGISTRY: Dict[str, "SerializableMeta"] = {}
+NON_SERIALIZABLE_REGISTRY: Dict[str, "SerializableMeta"] = {}
+
+
+def shorten_class_name(class_fullname: str) -> str:
+ splitted = class_fullname.split(".")
+ if len(splitted) == 1:
+ return class_fullname
+ top_module, *_, class_name = splitted
+ if top_module == "albumentations":
+ return class_name
+ return class_fullname
+
+
+def get_shortest_class_fullname(cls: Type) -> str:
+ class_fullname = "{cls.__module__}.{cls.__name__}".format(cls=cls)
+ return shorten_class_name(class_fullname)
+
+
+class SerializableMeta(ABCMeta):
+ """
+ A metaclass that is used to register classes in `SERIALIZABLE_REGISTRY` or `NON_SERIALIZABLE_REGISTRY`
+    so they can be found later while deserializing a transformation pipeline by their full class names.
+ """
+
+ def __new__(mcs, name: str, bases: Tuple[type, ...], *args, **kwargs) -> "SerializableMeta":
+ cls_obj = super().__new__(mcs, name, bases, *args, **kwargs)
+ if name != "Serializable" and ABC not in bases:
+ if cls_obj.is_serializable():
+ SERIALIZABLE_REGISTRY[cls_obj.get_class_fullname()] = cls_obj
+ else:
+ NON_SERIALIZABLE_REGISTRY[cls_obj.get_class_fullname()] = cls_obj
+ return cls_obj
+
+ @classmethod
+ def is_serializable(mcs) -> bool:
+ return False
+
+ @classmethod
+ def get_class_fullname(mcs) -> str:
+ return get_shortest_class_fullname(mcs)
+
+ @classmethod
+ def _to_dict(mcs) -> Dict[str, Any]:
+ return {}
+
+
+class Serializable(metaclass=SerializableMeta):
+ @classmethod
+ @abstractmethod
+ def is_serializable(cls) -> bool:
+ raise NotImplementedError
+
+ @classmethod
+ @abstractmethod
+ def get_class_fullname(cls) -> str:
+ raise NotImplementedError
+
+ @abstractmethod
+ def _to_dict(self) -> Dict[str, Any]:
+ raise NotImplementedError
+
+ def to_dict(self, on_not_implemented_error: str = "raise") -> Dict[str, Any]:
+ """
+ Take a transform pipeline and convert it to a serializable representation that uses only standard
+ python data types: dictionaries, lists, strings, integers, and floats.
+
+ Args:
+            on_not_implemented_error (str): `raise` or `warn`. If the transform doesn't implement
+                the serialization methods and this argument equals 'raise', then `NotImplementedError`
+                is raised. If it equals 'warn', the `NotImplementedError` will be ignored but no
+                transform parameters will be serialized.
+ """
+ if on_not_implemented_error not in {"raise", "warn"}:
+ raise ValueError(
+ "Unknown on_not_implemented_error value: {}. Supported values are: 'raise' and 'warn'".format(
+ on_not_implemented_error
+ )
+ )
+ try:
+ transform_dict = self._to_dict()
+ except NotImplementedError as e:
+ if on_not_implemented_error == "raise":
+ raise e
+
+ transform_dict = {}
+ warnings.warn(
+ "Got NotImplementedError while trying to serialize {obj}. Object arguments are not preserved. "
+ "Implement either '{cls_name}.get_transform_init_args_names' or '{cls_name}.get_transform_init_args' "
+ "method to make the transform serializable".format(obj=self, cls_name=self.__class__.__name__)
+ )
+ return {"__version__": __version__, "transform": transform_dict}
+
+
+def to_dict(transform: Serializable, on_not_implemented_error: str = "raise") -> Dict[str, Any]:
+ """
+ Take a transform pipeline and convert it to a serializable representation that uses only standard
+ python data types: dictionaries, lists, strings, integers, and floats.
+
+ Args:
+ transform: A transform that should be serialized. If the transform doesn't implement the `to_dict`
+            method and `on_not_implemented_error` equals 'raise', then `NotImplementedError` is raised.
+            If `on_not_implemented_error` equals 'warn', then the `NotImplementedError` will be ignored
+            but no transform parameters will be serialized.
+ on_not_implemented_error (str): `raise` or `warn`.
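+
+    Example:
+        A minimal sketch (assumes `HorizontalFlip` is exported by this package, as in upstream
+        Albumentations):
+
+            serialized = to_dict(HorizontalFlip(p=0.5))
+            # {"__version__": ..., "transform": {"__class_fullname__": ..., "always_apply": False, "p": 0.5}}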
+ """
+ return transform.to_dict(on_not_implemented_error)
+
+
+def instantiate_nonserializable(
+ transform: Dict[str, Any], nonserializable: Optional[Dict[str, Any]] = None
+) -> Optional[Serializable]:
+ if transform.get("__class_fullname__") in NON_SERIALIZABLE_REGISTRY:
+ name = transform["__name__"]
+ if nonserializable is None:
+            raise ValueError(
+                "To deserialize a non-serializable transform with name {name} you need to pass a dict with "
+                "this transform as the `nonserializable` argument".format(name=name)
+            )
+ result_transform = nonserializable.get(name)
+        if result_transform is None:
+            raise ValueError(
+                "Non-serializable transform with name {name} was not found in `nonserializable`".format(name=name)
+ )
+ return result_transform
+ return None
+
+
+def from_dict(
+ transform_dict: Dict[str, Any],
+ nonserializable: Optional[Dict[str, Any]] = None,
+ lambda_transforms: Union[Optional[Dict[str, Any]], str] = "deprecated",
+) -> Optional[Serializable]:
+ """
+ Args:
+ transform_dict (dict): A dictionary with serialized transform pipeline.
+ nonserializable (dict): A dictionary that contains non-serializable transforms.
+ This dictionary is required when you are restoring a pipeline that contains non-serializable transforms.
+            Keys in that dictionary should be named the same as the `name` arguments in the respective
+            transforms from a serialized pipeline.
+        lambda_transforms (dict): Deprecated. Use 'nonserializable' instead.
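+
+    Example:
+        A minimal round-trip sketch (assumes `HorizontalFlip` is exported by this package, as in
+        upstream Albumentations):
+
+            serialized = to_dict(HorizontalFlip(p=0.5))
+            restored = from_dict(serialized)  # an equivalent HorizontalFlip instance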
+ """
+ if lambda_transforms != "deprecated":
+ warnings.warn("lambda_transforms argument is deprecated, please use 'nonserializable'", DeprecationWarning)
+ nonserializable = typing.cast(Optional[Dict[str, Any]], lambda_transforms)
+
+ register_additional_transforms()
+ transform = transform_dict["transform"]
+ lmbd = instantiate_nonserializable(transform, nonserializable)
+ if lmbd:
+ return lmbd
+ name = transform["__class_fullname__"]
+ args = {k: v for k, v in transform.items() if k != "__class_fullname__"}
+ cls = SERIALIZABLE_REGISTRY[shorten_class_name(name)]
+ if "transforms" in args:
+ args["transforms"] = [from_dict({"transform": t}, nonserializable=nonserializable) for t in args["transforms"]]
+ return cls(**args)
+
+
+def check_data_format(data_format: str) -> None:
+ if data_format not in {"json", "yaml"}:
+ raise ValueError("Unknown data_format {}. Supported formats are: 'json' and 'yaml'".format(data_format))
+
+
+def save(
+ transform: Serializable, filepath: str, data_format: str = "json", on_not_implemented_error: str = "raise"
+) -> None:
+ """
+ Take a transform pipeline, serialize it and save a serialized version to a file
+ using either json or yaml format.
+
+ Args:
+ transform (obj): Transform to serialize.
+ filepath (str): Filepath to write to.
+        data_format (str): Serialization format. Should be either 'json' or 'yaml'.
+ on_not_implemented_error (str): Parameter that describes what to do if a transform doesn't implement
+ the `to_dict` method. If 'raise' then `NotImplementedError` is raised, if `warn` then the exception will be
+ ignored and no transform arguments will be saved.
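+
+    Example:
+        A minimal sketch (the filepath is hypothetical; assumes `HorizontalFlip` is exported by
+        this package):
+
+            save(HorizontalFlip(p=0.5), "/tmp/transform.json", data_format="json")
+            restored = load("/tmp/transform.json", data_format="json")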
+ """
+    check_data_format(data_format)
+    if data_format == "yaml" and not yaml_available:
+        raise ValueError("You need to install PyYAML to save a pipeline in yaml format")
+ transform_dict = transform.to_dict(on_not_implemented_error=on_not_implemented_error)
+ dump_fn = json.dump if data_format == "json" else yaml.safe_dump
+ with open(filepath, "w") as f:
+ dump_fn(transform_dict, f) # type: ignore
+
+
+def load(
+ filepath: str,
+ data_format: str = "json",
+ nonserializable: Optional[Dict[str, Any]] = None,
+ lambda_transforms: Union[Optional[Dict[str, Any]], str] = "deprecated",
+) -> object:
+ """
+ Load a serialized pipeline from a json or yaml file and construct a transform pipeline.
+
+ Args:
+ filepath (str): Filepath to read from.
+        data_format (str): Serialization format. Should be either 'json' or 'yaml'.
+ nonserializable (dict): A dictionary that contains non-serializable transforms.
+ This dictionary is required when you are restoring a pipeline that contains non-serializable transforms.
+            Keys in that dictionary should be named the same as the `name` arguments in the respective
+            transforms from a serialized pipeline.
+        lambda_transforms (dict): Deprecated. Use 'nonserializable' instead.
+ """
+ if lambda_transforms != "deprecated":
+ warnings.warn("lambda_transforms argument is deprecated, please use 'nonserializable'", DeprecationWarning)
+ nonserializable = typing.cast(Optional[Dict[str, Any]], lambda_transforms)
+
+    check_data_format(data_format)
+    if data_format == "yaml" and not yaml_available:
+        raise ValueError("You need to install PyYAML to load a pipeline from yaml format")
+ load_fn = json.load if data_format == "json" else yaml.safe_load
+ with open(filepath) as f:
+ transform_dict = load_fn(f) # type: ignore
+
+ return from_dict(transform_dict, nonserializable=nonserializable)
+
+
+def register_additional_transforms() -> None:
+ """
+ Register transforms that are not imported directly into the `albumentations` module.
+ """
+ try:
+ # This import will result in ImportError if `torch` is not installed
+ import custom_albumentations.pytorch
+ except ImportError:
+ pass
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/core/transforms_interface.py b/comfyui_controlnet_aux/src/custom_albumentations/core/transforms_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5441b6041e4a70e0d5d0e8be378075d013c1943
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/core/transforms_interface.py
@@ -0,0 +1,293 @@
+from __future__ import absolute_import
+
+import random
+from copy import deepcopy
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
+from warnings import warn
+
+import cv2
+import numpy as np
+
+from .serialization import Serializable, get_shortest_class_fullname
+from .utils import format_args
+
+__all__ = [
+ "to_tuple",
+ "BasicTransform",
+ "DualTransform",
+ "ImageOnlyTransform",
+ "NoOp",
+ "BoxType",
+ "KeypointType",
+ "ImageColorType",
+ "ScaleFloatType",
+ "ScaleIntType",
+]
+
+NumType = Union[int, float, np.ndarray]
+BoxInternalType = Tuple[float, float, float, float]
+BoxType = Union[BoxInternalType, Tuple[float, float, float, float, Any]]
+KeypointInternalType = Tuple[float, float, float, float]
+KeypointType = Union[KeypointInternalType, Tuple[float, float, float, float, Any]]
+ImageColorType = Union[float, Sequence[float]]
+
+ScaleFloatType = Union[float, Tuple[float, float]]
+ScaleIntType = Union[int, Tuple[int, int]]
+
+FillValueType = Optional[Union[int, float, Sequence[int], Sequence[float]]]
+
+
+def to_tuple(param, low=None, bias=None):
+    """Convert an input argument to a min-max tuple.
+
+    Args:
+        param (scalar, tuple or list of 2 elements): Input value.
+            If the value is a scalar, the result is (-value, value), shifted by `bias` if it is given.
+            If the value is a tuple or list, it must contain exactly 2 elements and is returned as a
+            tuple, with `bias` added to each element if it is given.
+        low: Optional second element of the resulting tuple when `param` is a scalar; the pair is sorted.
+        bias: An offset added to each element. Mutually exclusive with `low`.
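+
+    Example (the results follow from the rules above):
+        to_tuple(10)              # -> (-10, 10)
+        to_tuple(10, low=2)       # -> (2, 10)
+        to_tuple((1, 3), bias=1)  # -> (2, 4)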
+ """
+ if low is not None and bias is not None:
+ raise ValueError("Arguments low and bias are mutually exclusive")
+
+ if param is None:
+ return param
+
+ if isinstance(param, (int, float)):
+ if low is None:
+ param = -param, +param
+ else:
+ param = (low, param) if low < param else (param, low)
+ elif isinstance(param, Sequence):
+ if len(param) != 2:
+ raise ValueError("to_tuple expects 1 or 2 values")
+ param = tuple(param)
+ else:
+ raise ValueError("Argument param must be either scalar (int, float) or tuple")
+
+ if bias is not None:
+ return tuple(bias + x for x in param)
+
+ return tuple(param)
+
+
+class BasicTransform(Serializable):
+ call_backup = None
+ interpolation: Any
+ fill_value: Any
+ mask_fill_value: Any
+
+ def __init__(self, always_apply: bool = False, p: float = 0.5):
+ self.p = p
+ self.always_apply = always_apply
+ self._additional_targets: Dict[str, str] = {}
+
+ # replay mode params
+ self.deterministic = False
+ self.save_key = "replay"
+ self.params: Dict[Any, Any] = {}
+ self.replay_mode = False
+ self.applied_in_replay = False
+
+ def __call__(self, *args, force_apply: bool = False, **kwargs) -> Dict[str, Any]:
+ if args:
+ raise KeyError("You have to pass data to augmentations as named arguments, for example: aug(image=image)")
+ if self.replay_mode:
+ if self.applied_in_replay:
+ return self.apply_with_params(self.params, **kwargs)
+
+ return kwargs
+
+ if (random.random() < self.p) or self.always_apply or force_apply:
+ params = self.get_params()
+
+ if self.targets_as_params:
+ assert all(key in kwargs for key in self.targets_as_params), "{} requires {}".format(
+ self.__class__.__name__, self.targets_as_params
+ )
+ targets_as_params = {k: kwargs[k] for k in self.targets_as_params}
+ params_dependent_on_targets = self.get_params_dependent_on_targets(targets_as_params)
+ params.update(params_dependent_on_targets)
+ if self.deterministic:
+ if self.targets_as_params:
+ warn(
+ self.get_class_fullname() + " could work incorrectly in ReplayMode for other input data"
+                        " because its params depend on targets."
+ )
+ kwargs[self.save_key][id(self)] = deepcopy(params)
+ return self.apply_with_params(params, **kwargs)
+
+ return kwargs
+
+ def apply_with_params(self, params: Dict[str, Any], **kwargs) -> Dict[str, Any]: # skipcq: PYL-W0613
+ if params is None:
+ return kwargs
+ params = self.update_params(params, **kwargs)
+ res = {}
+ for key, arg in kwargs.items():
+ if arg is not None:
+ target_function = self._get_target_function(key)
+ target_dependencies = {k: kwargs[k] for k in self.target_dependence.get(key, [])}
+ res[key] = target_function(arg, **dict(params, **target_dependencies))
+ else:
+ res[key] = None
+ return res
+
+ def set_deterministic(self, flag: bool, save_key: str = "replay") -> "BasicTransform":
+ assert save_key != "params", "params save_key is reserved"
+ self.deterministic = flag
+ self.save_key = save_key
+ return self
+
+ def __repr__(self) -> str:
+ state = self.get_base_init_args()
+ state.update(self.get_transform_init_args())
+ return "{name}({args})".format(name=self.__class__.__name__, args=format_args(state))
+
+ def _get_target_function(self, key: str) -> Callable:
+ transform_key = key
+ if key in self._additional_targets:
+ transform_key = self._additional_targets.get(key, key)
+
+ target_function = self.targets.get(transform_key, lambda x, **p: x)
+ return target_function
+
+ def apply(self, img: np.ndarray, **params) -> np.ndarray:
+ raise NotImplementedError
+
+ def get_params(self) -> Dict:
+ return {}
+
+ @property
+ def targets(self) -> Dict[str, Callable]:
+ # you must specify targets in subclass
+ # for example: ('image', 'mask')
+ # ('image', 'boxes')
+ raise NotImplementedError
+
+ def update_params(self, params: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+ if hasattr(self, "interpolation"):
+ params["interpolation"] = self.interpolation
+ if hasattr(self, "fill_value"):
+ params["fill_value"] = self.fill_value
+ if hasattr(self, "mask_fill_value"):
+ params["mask_fill_value"] = self.mask_fill_value
+ params.update({"cols": kwargs["image"].shape[1], "rows": kwargs["image"].shape[0]})
+ return params
+
+ @property
+ def target_dependence(self) -> Dict:
+ return {}
+
+ def add_targets(self, additional_targets: Dict[str, str]):
+        """Add targets to transform them the same way as one of the existing targets,
+        e.g. {'target_image': 'image'} or {'obj1_mask': 'mask', 'obj2_mask': 'mask'}.
+        Note that you must still pass at least one object with the key 'image'.
+
+        Args:
+            additional_targets (dict): keys - new target name, values - old target name. ex: {'image2': 'image'}
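+
+        Example (sketch; assumes `HorizontalFlip` from this package, as in upstream Albumentations):
+            aug = HorizontalFlip(p=1)
+            aug.add_targets({'image2': 'image'})
+            out = aug(image=image, image2=other_image)  # both arrays are flipped with the same parameters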
+ """
+ self._additional_targets = additional_targets
+
+ @property
+ def targets_as_params(self) -> List[str]:
+ return []
+
+ def get_params_dependent_on_targets(self, params: Dict[str, Any]) -> Dict[str, Any]:
+ raise NotImplementedError(
+ "Method get_params_dependent_on_targets is not implemented in class " + self.__class__.__name__
+ )
+
+ @classmethod
+ def get_class_fullname(cls) -> str:
+ return get_shortest_class_fullname(cls)
+
+ @classmethod
+ def is_serializable(cls):
+ return True
+
+ def get_transform_init_args_names(self) -> Tuple[str, ...]:
+ raise NotImplementedError(
+ "Class {name} is not serializable because the `get_transform_init_args_names` method is not "
+ "implemented".format(name=self.get_class_fullname())
+ )
+
+ def get_base_init_args(self) -> Dict[str, Any]:
+ return {"always_apply": self.always_apply, "p": self.p}
+
+ def get_transform_init_args(self) -> Dict[str, Any]:
+ return {k: getattr(self, k) for k in self.get_transform_init_args_names()}
+
+ def _to_dict(self) -> Dict[str, Any]:
+ state = {"__class_fullname__": self.get_class_fullname()}
+ state.update(self.get_base_init_args())
+ state.update(self.get_transform_init_args())
+ return state
+
+ def get_dict_with_id(self) -> Dict[str, Any]:
+ d = self._to_dict()
+ d["id"] = id(self)
+ return d
+
+
+class DualTransform(BasicTransform):
+    """Transform applied to the image and to all of its spatial targets (mask, masks, bboxes, keypoints)."""
+
+ @property
+ def targets(self) -> Dict[str, Callable]:
+ return {
+ "image": self.apply,
+ "mask": self.apply_to_mask,
+ "masks": self.apply_to_masks,
+ "bboxes": self.apply_to_bboxes,
+ "keypoints": self.apply_to_keypoints,
+ }
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ raise NotImplementedError("Method apply_to_bbox is not implemented in class " + self.__class__.__name__)
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, **params) -> KeypointInternalType:
+ raise NotImplementedError("Method apply_to_keypoint is not implemented in class " + self.__class__.__name__)
+
+ def apply_to_bboxes(self, bboxes: Sequence[BoxType], **params) -> List[BoxType]:
+ return [self.apply_to_bbox(tuple(bbox[:4]), **params) + tuple(bbox[4:]) for bbox in bboxes] # type: ignore
+
+ def apply_to_keypoints(self, keypoints: Sequence[KeypointType], **params) -> List[KeypointType]:
+ return [ # type: ignore
+ self.apply_to_keypoint(tuple(keypoint[:4]), **params) + tuple(keypoint[4:]) # type: ignore
+ for keypoint in keypoints
+ ]
+
+ def apply_to_mask(self, img: np.ndarray, **params) -> np.ndarray:
+ return self.apply(img, **{k: cv2.INTER_NEAREST if k == "interpolation" else v for k, v in params.items()})
+
+ def apply_to_masks(self, masks: Sequence[np.ndarray], **params) -> List[np.ndarray]:
+ return [self.apply_to_mask(mask, **params) for mask in masks]
+
+
+class ImageOnlyTransform(BasicTransform):
+ """Transform applied to image only."""
+
+ @property
+ def targets(self) -> Dict[str, Callable]:
+ return {"image": self.apply}
+
+
+class NoOp(DualTransform):
+ """Does nothing"""
+
+ def apply_to_keypoint(self, keypoint: KeypointInternalType, **params) -> KeypointInternalType:
+ return keypoint
+
+ def apply_to_bbox(self, bbox: BoxInternalType, **params) -> BoxInternalType:
+ return bbox
+
+ def apply(self, img: np.ndarray, **params) -> np.ndarray:
+ return img
+
+ def apply_to_mask(self, img: np.ndarray, **params) -> np.ndarray:
+ return img
+
+ def get_transform_init_args_names(self) -> Tuple:
+ return ()
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/core/utils.py b/comfyui_controlnet_aux/src/custom_albumentations/core/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..29c41ae5d41cf08b9825c67687511b8aab7ea1bb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/core/utils.py
@@ -0,0 +1,137 @@
+from __future__ import absolute_import
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Sequence, Tuple
+
+import numpy as np
+
+from .serialization import Serializable
+
+
+def get_shape(img: Any) -> Tuple[int, int]:
+ if isinstance(img, np.ndarray):
+ rows, cols = img.shape[:2]
+ return rows, cols
+
+ try:
+ import torch
+
+ if torch.is_tensor(img):
+ rows, cols = img.shape[-2:]
+ return rows, cols
+ except ImportError:
+ pass
+
+ raise RuntimeError(
+ f"Albumentations supports only numpy.ndarray and torch.Tensor data type for image. Got: {type(img)}"
+ )
+
+
+def format_args(args_dict: Dict):
+ formatted_args = []
+ for k, v in args_dict.items():
+ if isinstance(v, str):
+ v = f"'{v}'"
+ formatted_args.append(f"{k}={v}")
+ return ", ".join(formatted_args)
+
+
+class Params(Serializable, ABC):
+ def __init__(self, format: str, label_fields: Optional[Sequence[str]] = None):
+ self.format = format
+ self.label_fields = label_fields
+
+ def _to_dict(self) -> Dict[str, Any]:
+ return {"format": self.format, "label_fields": self.label_fields}
+
+
+class DataProcessor(ABC):
+ def __init__(self, params: Params, additional_targets: Optional[Dict[str, str]] = None):
+ self.params = params
+ self.data_fields = [self.default_data_name]
+ if additional_targets is not None:
+ for k, v in additional_targets.items():
+ if v == self.default_data_name:
+ self.data_fields.append(k)
+
+ @property
+ @abstractmethod
+ def default_data_name(self) -> str:
+ raise NotImplementedError
+
+ def ensure_data_valid(self, data: Dict[str, Any]) -> None:
+ pass
+
+ def ensure_transforms_valid(self, transforms: Sequence[object]) -> None:
+ pass
+
+ def postprocess(self, data: Dict[str, Any]) -> Dict[str, Any]:
+ rows, cols = get_shape(data["image"])
+
+ for data_name in self.data_fields:
+ data[data_name] = self.filter(data[data_name], rows, cols)
+ data[data_name] = self.check_and_convert(data[data_name], rows, cols, direction="from")
+
+ data = self.remove_label_fields_from_data(data)
+ return data
+
+ def preprocess(self, data: Dict[str, Any]) -> None:
+ data = self.add_label_fields_to_data(data)
+
+ rows, cols = data["image"].shape[:2]
+ for data_name in self.data_fields:
+ data[data_name] = self.check_and_convert(data[data_name], rows, cols, direction="to")
+
+ def check_and_convert(self, data: Sequence, rows: int, cols: int, direction: str = "to") -> Sequence:
+ if self.params.format == "albumentations":
+ self.check(data, rows, cols)
+ return data
+
+ if direction == "to":
+ return self.convert_to_albumentations(data, rows, cols)
+ elif direction == "from":
+ return self.convert_from_albumentations(data, rows, cols)
+ else:
+ raise ValueError(f"Invalid direction. Must be `to` or `from`. Got `{direction}`")
+
+ @abstractmethod
+ def filter(self, data: Sequence, rows: int, cols: int) -> Sequence:
+ pass
+
+ @abstractmethod
+ def check(self, data: Sequence, rows: int, cols: int) -> None:
+ pass
+
+ @abstractmethod
+ def convert_to_albumentations(self, data: Sequence, rows: int, cols: int) -> Sequence:
+ pass
+
+ @abstractmethod
+ def convert_from_albumentations(self, data: Sequence, rows: int, cols: int) -> Sequence:
+ pass
+
+ def add_label_fields_to_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+ if self.params.label_fields is None:
+ return data
+ for data_name in self.data_fields:
+ for field in self.params.label_fields:
+ assert len(data[data_name]) == len(data[field])
+ data_with_added_field = []
+ for d, field_value in zip(data[data_name], data[field]):
+ data_with_added_field.append(list(d) + [field_value])
+ data[data_name] = data_with_added_field
+ return data
+
+ def remove_label_fields_from_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+ if self.params.label_fields is None:
+ return data
+ for data_name in self.data_fields:
+ label_fields_len = len(self.params.label_fields)
+ for idx, field in enumerate(self.params.label_fields):
+ field_values = []
+ for bbox in data[data_name]:
+ field_values.append(bbox[-label_fields_len + idx])
+ data[field] = field_values
+ if label_fields_len:
+ data[data_name] = [d[:-label_fields_len] for d in data[data_name]]
+ return data
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/imgaug/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/imgaug/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/imgaug/stubs.py b/comfyui_controlnet_aux/src/custom_albumentations/imgaug/stubs.py
new file mode 100644
index 0000000000000000000000000000000000000000..276a9e48a5ba39f579ea4da4ab463e0b054ac8e4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/imgaug/stubs.py
@@ -0,0 +1,77 @@
+__all__ = [
+ "IAAEmboss",
+ "IAASuperpixels",
+ "IAASharpen",
+ "IAAAdditiveGaussianNoise",
+ "IAACropAndPad",
+ "IAAFliplr",
+ "IAAFlipud",
+ "IAAAffine",
+ "IAAPiecewiseAffine",
+ "IAAPerspective",
+]
+
+
+class IAAStub:
+ def __init__(self, *args, **kwargs):
+ cls_name = self.__class__.__name__
+ doc_link = "https://albumentations.ai/docs/api_reference/augmentations" + self.doc_link
+ raise RuntimeError(
+ f"You are trying to use a deprecated augmentation '{cls_name}' which depends on the imgaug library, "
+ f"but imgaug is not installed.\n\n"
+ "There are two options to fix this error:\n"
+ "1. [Recommended]. Switch to the Albumentations' implementation of the augmentation with the same API: "
+ f"{self.alternative} - {doc_link}\n"
+ "2. Install a version of Albumentations that contains imgaug by running "
+ "'pip install -U albumentations[imgaug]'."
+ )
+
+
+class IAACropAndPad(IAAStub):
+ alternative = "CropAndPad"
+ doc_link = "/crops/transforms/#albumentations.augmentations.crops.transforms.CropAndPad"
+
+
+class IAAFliplr(IAAStub):
+ alternative = "HorizontalFlip"
+ doc_link = "/transforms/#albumentations.augmentations.transforms.HorizontalFlip"
+
+
+class IAAFlipud(IAAStub):
+ alternative = "VerticalFlip"
+ doc_link = "/transforms/#albumentations.augmentations.transforms.VerticalFlip"
+
+
+class IAAEmboss(IAAStub):
+ alternative = "Emboss"
+ doc_link = "/transforms/#albumentations.augmentations.transforms.Emboss"
+
+
+class IAASuperpixels(IAAStub):
+ alternative = "Superpixels"
+ doc_link = "/transforms/#albumentations.augmentations.transforms.Superpixels"
+
+
+class IAASharpen(IAAStub):
+ alternative = "Sharpen"
+ doc_link = "/transforms/#albumentations.augmentations.transforms.Sharpen"
+
+
+class IAAAdditiveGaussianNoise(IAAStub):
+ alternative = "GaussNoise"
+ doc_link = "/transforms/#albumentations.augmentations.transforms.GaussNoise"
+
+
+class IAAPiecewiseAffine(IAAStub):
+ alternative = "PiecewiseAffine"
+ doc_link = "/geometric/transforms/#albumentations.augmentations.geometric.transforms.PiecewiseAffine"
+
+
+class IAAAffine(IAAStub):
+ alternative = "Affine"
+ doc_link = "/geometric/transforms/#albumentations.augmentations.geometric.transforms.Affine"
+
+
+class IAAPerspective(IAAStub):
+ alternative = "Perspective"
+ doc_link = "/geometric/transforms/#albumentations.augmentations.geometric.transforms.Perspective"
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/imgaug/transforms.py b/comfyui_controlnet_aux/src/custom_albumentations/imgaug/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f9f26bc40994cd58c29e3467e96e377a39f5a4e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/imgaug/transforms.py
@@ -0,0 +1,391 @@
+try:
+ import imgaug as ia
+except ImportError as e:
+ raise ImportError(
+ "You are trying to import an augmentation that depends on the imgaug library, but imgaug is not installed. To "
+ "install a version of Albumentations that contains imgaug please run 'pip install -U albumentations[imgaug]'"
+ ) from e
+
+try:
+ from imgaug import augmenters as iaa
+except ImportError:
+ import imgaug.imgaug.augmenters as iaa
+
+import warnings
+
+from custom_albumentations.core.bbox_utils import (
+ convert_bboxes_from_albumentations,
+ convert_bboxes_to_albumentations,
+)
+from custom_albumentations.core.keypoints_utils import (
+ convert_keypoints_from_albumentations,
+ convert_keypoints_to_albumentations,
+)
+
+from ..augmentations import Perspective
+from ..core.transforms_interface import (
+ BasicTransform,
+ DualTransform,
+ ImageOnlyTransform,
+ to_tuple,
+)
+
+__all__ = [
+ "BasicIAATransform",
+ "DualIAATransform",
+ "ImageOnlyIAATransform",
+ "IAAEmboss",
+ "IAASuperpixels",
+ "IAASharpen",
+ "IAAAdditiveGaussianNoise",
+ "IAACropAndPad",
+ "IAAFliplr",
+ "IAAFlipud",
+ "IAAAffine",
+ "IAAPiecewiseAffine",
+ "IAAPerspective",
+]
+
+
+class BasicIAATransform(BasicTransform):
+ def __init__(self, always_apply=False, p=0.5):
+ super(BasicIAATransform, self).__init__(always_apply, p)
+
+ @property
+ def processor(self):
+ return iaa.Noop()
+
+ def update_params(self, params, **kwargs):
+ params = super(BasicIAATransform, self).update_params(params, **kwargs)
+ params["deterministic_processor"] = self.processor.to_deterministic()
+ return params
+
+ def apply(self, img, deterministic_processor=None, **params):
+ return deterministic_processor.augment_image(img)
+
+
+class DualIAATransform(DualTransform, BasicIAATransform):
+ def apply_to_bboxes(self, bboxes, deterministic_processor=None, rows=0, cols=0, **params):
+ if len(bboxes) > 0:
+ bboxes = convert_bboxes_from_albumentations(bboxes, "pascal_voc", rows=rows, cols=cols)
+
+ bboxes_t = ia.BoundingBoxesOnImage([ia.BoundingBox(*bbox[:4]) for bbox in bboxes], (rows, cols))
+ bboxes_t = deterministic_processor.augment_bounding_boxes([bboxes_t])[0].bounding_boxes
+ bboxes_t = [
+ [bbox.x1, bbox.y1, bbox.x2, bbox.y2] + list(bbox_orig[4:])
+ for (bbox, bbox_orig) in zip(bboxes_t, bboxes)
+ ]
+
+ bboxes = convert_bboxes_to_albumentations(bboxes_t, "pascal_voc", rows=rows, cols=cols)
+ return bboxes
+
+    def apply_to_keypoints(self, keypoints, deterministic_processor=None, rows=0, cols=0, **params):
+        """Apply the transformation to keypoints.
+
+        Notes:
+            Since IAA supports only xy keypoints, scale and orientation will remain unchanged.
+        TODO:
+            Emit a warning message if child classes of DualIAATransform are instantiated
+            inside Compose with a keypoints format other than 'xy'.
+        """
+ if len(keypoints) > 0:
+ keypoints = convert_keypoints_from_albumentations(keypoints, "xy", rows=rows, cols=cols)
+ keypoints_t = ia.KeypointsOnImage([ia.Keypoint(*kp[:2]) for kp in keypoints], (rows, cols))
+ keypoints_t = deterministic_processor.augment_keypoints([keypoints_t])[0].keypoints
+
+ bboxes_t = [[kp.x, kp.y] + list(kp_orig[2:]) for (kp, kp_orig) in zip(keypoints_t, keypoints)]
+
+ keypoints = convert_keypoints_to_albumentations(bboxes_t, "xy", rows=rows, cols=cols)
+ return keypoints
+
+
+class ImageOnlyIAATransform(ImageOnlyTransform, BasicIAATransform):
+ pass
+
+
+class IAACropAndPad(DualIAATransform):
+ """This augmentation is deprecated. Please use CropAndPad instead."""
+
+ def __init__(self, px=None, percent=None, pad_mode="constant", pad_cval=0, keep_size=True, always_apply=False, p=1):
+ super(IAACropAndPad, self).__init__(always_apply, p)
+ self.px = px
+ self.percent = percent
+ self.pad_mode = pad_mode
+ self.pad_cval = pad_cval
+ self.keep_size = keep_size
+ warnings.warn("IAACropAndPad is deprecated. Please use CropAndPad instead", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.CropAndPad(self.px, self.percent, self.pad_mode, self.pad_cval, self.keep_size)
+
+ def get_transform_init_args_names(self):
+ return ("px", "percent", "pad_mode", "pad_cval", "keep_size")
+
+
+class IAAFliplr(DualIAATransform):
+ """This augmentation is deprecated. Please use HorizontalFlip instead."""
+
+ def __init__(self, always_apply=False, p=0.5):
+ super().__init__(always_apply, p)
+ warnings.warn("IAAFliplr is deprecated. Please use HorizontalFlip instead.", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.Fliplr(1)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class IAAFlipud(DualIAATransform):
+ """This augmentation is deprecated. Please use VerticalFlip instead."""
+
+ def __init__(self, always_apply=False, p=0.5):
+ super().__init__(always_apply, p)
+ warnings.warn("IAAFlipud is deprecated. Please use VerticalFlip instead.", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.Flipud(1)
+
+ def get_transform_init_args_names(self):
+ return ()
+
+
+class IAAEmboss(ImageOnlyIAATransform):
+    """Emboss the input image and overlay the result with the original image.
+ This augmentation is deprecated. Please use Emboss instead.
+
+ Args:
+ alpha ((float, float)): range to choose the visibility of the embossed image. At 0, only the original image is
+            visible, at 1.0 only its embossed version is visible. Default: (0.2, 0.5).
+ strength ((float, float)): strength range of the embossing. Default: (0.2, 0.7).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+ """
+
+ def __init__(self, alpha=(0.2, 0.5), strength=(0.2, 0.7), always_apply=False, p=0.5):
+ super(IAAEmboss, self).__init__(always_apply, p)
+ self.alpha = to_tuple(alpha, 0.0)
+ self.strength = to_tuple(strength, 0.0)
+ warnings.warn("This augmentation is deprecated. Please use Emboss instead", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.Emboss(self.alpha, self.strength)
+
+ def get_transform_init_args_names(self):
+ return ("alpha", "strength")
+
+
+class IAASuperpixels(ImageOnlyIAATransform):
+ """Completely or partially transform the input image to its superpixel representation. Uses skimage's version
+ of the SLIC algorithm. May be slow.
+
+ This augmentation is deprecated. Please use Superpixels instead.
+
+ Args:
+ p_replace (float): defines the probability of any superpixel area being replaced by the superpixel, i.e. by
+ the average pixel color within its area. Default: 0.1.
+ n_segments (int): target number of superpixels to generate. Default: 100.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+ """
+
+ def __init__(self, p_replace=0.1, n_segments=100, always_apply=False, p=0.5):
+ super(IAASuperpixels, self).__init__(always_apply, p)
+ self.p_replace = p_replace
+ self.n_segments = n_segments
+ warnings.warn("IAASuperpixels is deprecated. Please use Superpixels instead.", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.Superpixels(p_replace=self.p_replace, n_segments=self.n_segments)
+
+ def get_transform_init_args_names(self):
+ return ("p_replace", "n_segments")
+
+
+class IAASharpen(ImageOnlyIAATransform):
+    """Sharpen the input image and overlay the result with the original image.
+ This augmentation is deprecated. Please use Sharpen instead
+ Args:
+ alpha ((float, float)): range to choose the visibility of the sharpened image. At 0, only the original image is
+ visible, at 1.0 only its sharpened version is visible. Default: (0.2, 0.5).
+ lightness ((float, float)): range to choose the lightness of the sharpened image. Default: (0.5, 1.0).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+ """
+
+ def __init__(self, alpha=(0.2, 0.5), lightness=(0.5, 1.0), always_apply=False, p=0.5):
+ super(IAASharpen, self).__init__(always_apply, p)
+ self.alpha = to_tuple(alpha, 0)
+ self.lightness = to_tuple(lightness, 0)
+ warnings.warn("IAASharpen is deprecated. Please use Sharpen instead", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.Sharpen(self.alpha, self.lightness)
+
+ def get_transform_init_args_names(self):
+ return ("alpha", "lightness")
+
+
+class IAAAdditiveGaussianNoise(ImageOnlyIAATransform):
+ """Add gaussian noise to the input image.
+
+ This augmentation is deprecated. Please use GaussNoise instead.
+
+ Args:
+ loc (int): mean of the normal distribution that generates the noise. Default: 0.
+ scale ((float, float)): standard deviation of the normal distribution that generates the noise.
+ Default: (0.01 * 255, 0.05 * 255).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image
+ """
+
+ def __init__(self, loc=0, scale=(0.01 * 255, 0.05 * 255), per_channel=False, always_apply=False, p=0.5):
+ super(IAAAdditiveGaussianNoise, self).__init__(always_apply, p)
+ self.loc = loc
+ self.scale = to_tuple(scale, 0.0)
+ self.per_channel = per_channel
+ warnings.warn("IAAAdditiveGaussianNoise is deprecated. Please use GaussNoise instead", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.AdditiveGaussianNoise(self.loc, self.scale, self.per_channel)
+
+ def get_transform_init_args_names(self):
+ return ("loc", "scale", "per_channel")
+
+
+class IAAPiecewiseAffine(DualIAATransform):
+    """Place a regular grid of points on the input and randomly move the neighbourhood of these points around
+ via affine transformations.
+
+ This augmentation is deprecated. Please use PiecewiseAffine instead.
+
+    Note: This class introduces interpolation artifacts to the mask if it has values other than {0, 1}
+
+ Args:
+        scale ((float, float)): factor range that determines how far each point is moved. Default: (0.03, 0.05).
+ nb_rows (int): number of rows of points that the regular grid should have. Default: 4.
+ nb_cols (int): number of columns of points that the regular grid should have. Default: 4.
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask
+ """
+
+ def __init__(
+ self, scale=(0.03, 0.05), nb_rows=4, nb_cols=4, order=1, cval=0, mode="constant", always_apply=False, p=0.5
+ ):
+ super(IAAPiecewiseAffine, self).__init__(always_apply, p)
+ self.scale = to_tuple(scale, 0.0)
+ self.nb_rows = nb_rows
+ self.nb_cols = nb_cols
+ self.order = order
+ self.cval = cval
+ self.mode = mode
+        warnings.warn("IAAPiecewiseAffine is deprecated. Please use PiecewiseAffine instead", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.PiecewiseAffine(self.scale, self.nb_rows, self.nb_cols, self.order, self.cval, self.mode)
+
+ def get_transform_init_args_names(self):
+ return ("scale", "nb_rows", "nb_cols", "order", "cval", "mode")
+
+
+class IAAAffine(DualIAATransform):
+    """Apply affine transformations to the input: scale, translate (in percent or pixels), rotate and shear.
+
+ This augmentation is deprecated. Please use Affine instead.
+
+    Note: This class introduces interpolation artifacts to the mask if it has values other than {0, 1}
+
+ Args:
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask
+ """
+
+ def __init__(
+ self,
+ scale=1.0,
+ translate_percent=None,
+ translate_px=None,
+ rotate=0.0,
+ shear=0.0,
+ order=1,
+ cval=0,
+ mode="reflect",
+ always_apply=False,
+ p=0.5,
+ ):
+ super(IAAAffine, self).__init__(always_apply, p)
+ self.scale = to_tuple(scale, 1.0)
+ self.translate_percent = to_tuple(translate_percent, 0)
+ self.translate_px = to_tuple(translate_px, 0)
+ self.rotate = to_tuple(rotate)
+ self.shear = to_tuple(shear)
+ self.order = order
+ self.cval = cval
+ self.mode = mode
+        warnings.warn("IAAAffine is deprecated. Please use Affine instead", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.Affine(
+ self.scale,
+ self.translate_percent,
+ self.translate_px,
+ self.rotate,
+ self.shear,
+ self.order,
+ self.cval,
+ self.mode,
+ )
+
+ def get_transform_init_args_names(self):
+ return ("scale", "translate_percent", "translate_px", "rotate", "shear", "order", "cval", "mode")
+
+
+class IAAPerspective(Perspective):
+ """Perform a random four point perspective transform of the input.
+ This augmentation is deprecated. Please use Perspective instead.
+
+    Note: This class introduces interpolation artifacts to the mask if it has values other than {0, 1}
+
+ Args:
+        scale ((float, float)): standard deviation of the normal distributions. These are used to sample
+ the random distances of the subimage's corners from the full image's corners. Default: (0.05, 0.1).
+ p (float): probability of applying the transform. Default: 0.5.
+
+ Targets:
+ image, mask
+ """
+
+ def __init__(self, scale=(0.05, 0.1), keep_size=True, always_apply=False, p=0.5):
+        super(IAAPerspective, self).__init__(always_apply=always_apply, p=p)
+ self.scale = to_tuple(scale, 1.0)
+ self.keep_size = keep_size
+        warnings.warn("IAAPerspective is deprecated. Please use Perspective instead", FutureWarning)
+
+ @property
+ def processor(self):
+ return iaa.PerspectiveTransform(self.scale, keep_size=self.keep_size)
+
+ def get_transform_init_args_names(self):
+ return ("scale", "keep_size")
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/pytorch/__init__.py b/comfyui_controlnet_aux/src/custom_albumentations/pytorch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e004a56bfe2015d5345fb46fa3441014ecbe1f4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/pytorch/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import absolute_import
+
+from .transforms import *
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/pytorch/functional.py b/comfyui_controlnet_aux/src/custom_albumentations/pytorch/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..e91a7046aefbd764c69af4d1de98ec3c7d260359
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/pytorch/functional.py
@@ -0,0 +1,31 @@
+from __future__ import division
+
+import numpy as np
+import torch
+import torchvision.transforms.functional as F
+
+
+def img_to_tensor(im, normalize=None):
+ tensor = torch.from_numpy(np.moveaxis(im / (255.0 if im.dtype == np.uint8 else 1), -1, 0).astype(np.float32))
+ if normalize is not None:
+ return F.normalize(tensor, **normalize)
+ return tensor
+
+
+def mask_to_tensor(mask, num_classes, sigmoid):
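+    """Convert a segmentation mask (numpy array) to a torch tensor.
+
+    For ``num_classes > 1`` with ``sigmoid=False`` the mask is collapsed into a LongTensor of class
+    indices (an HWC mask uses the channel index as the class, an HW mask is binarised at 127).
+    Otherwise the mask is scaled to [0, 1] when its dtype is uint8 and returned as a float tensor
+    (CHW for the multi-class sigmoid case, a single leading channel for ``num_classes == 1``).
+    """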
+ if num_classes > 1:
+ if not sigmoid:
+ # softmax
+ long_mask = np.zeros((mask.shape[:2]), dtype=np.int64)
+ if len(mask.shape) == 3:
+ for c in range(mask.shape[2]):
+ long_mask[mask[..., c] > 0] = c
+ else:
+ long_mask[mask > 127] = 1
+ long_mask[mask == 0] = 0
+ mask = long_mask
+ else:
+ mask = np.moveaxis(mask / (255.0 if mask.dtype == np.uint8 else 1), -1, 0).astype(np.float32)
+ else:
+ mask = np.expand_dims(mask / (255.0 if mask.dtype == np.uint8 else 1), 0).astype(np.float32)
+ return torch.from_numpy(mask)
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/pytorch/transforms.py b/comfyui_controlnet_aux/src/custom_albumentations/pytorch/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..90913963af1189cd0782360bb5fa34cf1aef5715
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/pytorch/transforms.py
@@ -0,0 +1,104 @@
+from __future__ import absolute_import
+
+import warnings
+
+import numpy as np
+import torch
+from torchvision.transforms import functional as F
+
+from ..core.transforms_interface import BasicTransform
+
+__all__ = ["ToTensorV2"]
+
+
+def img_to_tensor(im, normalize=None):
+ tensor = torch.from_numpy(np.moveaxis(im / (255.0 if im.dtype == np.uint8 else 1), -1, 0).astype(np.float32))
+ if normalize is not None:
+ return F.normalize(tensor, **normalize)
+ return tensor
+
+
+def mask_to_tensor(mask, num_classes, sigmoid):
+ if num_classes > 1:
+ if not sigmoid:
+ # softmax
+ long_mask = np.zeros((mask.shape[:2]), dtype=np.int64)
+ if len(mask.shape) == 3:
+ for c in range(mask.shape[2]):
+ long_mask[mask[..., c] > 0] = c
+ else:
+ long_mask[mask > 127] = 1
+ long_mask[mask == 0] = 0
+ mask = long_mask
+ else:
+ mask = np.moveaxis(mask / (255.0 if mask.dtype == np.uint8 else 1), -1, 0).astype(np.float32)
+ else:
+ mask = np.expand_dims(mask / (255.0 if mask.dtype == np.uint8 else 1), 0).astype(np.float32)
+ return torch.from_numpy(mask)
+
+
+class ToTensor(BasicTransform):
+    """Convert image and mask to `torch.Tensor` and divide by 255 if the image or mask is of `uint8` type.
+    This transform is now removed from custom_albumentations. If you need it, downgrade the library to version 0.5.2.
+
+ Args:
+ num_classes (int): only for segmentation
+ sigmoid (bool, optional): only for segmentation, transform mask to LongTensor or not.
+ normalize (dict, optional): dict with keys [mean, std] to pass it into torchvision.normalize
+
+ """
+
+ def __init__(self, num_classes=1, sigmoid=True, normalize=None):
+ raise RuntimeError(
+ "`ToTensor` is obsolete and it was removed from custom_albumentations. Please use `ToTensorV2` instead - "
+ "https://albumentations.ai/docs/api_reference/pytorch/transforms/"
+ "#albumentations.pytorch.transforms.ToTensorV2. "
+ "\n\nIf you need `ToTensor` downgrade Albumentations to version 0.5.2."
+ )
+
+
+class ToTensorV2(BasicTransform):
+ """Convert image and mask to `torch.Tensor`. The numpy `HWC` image is converted to pytorch `CHW` tensor.
+ If the image is in `HW` format (grayscale image), it will be converted to pytorch `HW` tensor.
+ This is a simplified and improved version of the old `ToTensor`
+ transform (`ToTensor` was deprecated, and now it is not present in Albumentations. You should use `ToTensorV2`
+ instead).
+
+ Args:
+ transpose_mask (bool): If True and an input mask has three dimensions, this transform will transpose dimensions
+ so the shape `[height, width, num_channels]` becomes `[num_channels, height, width]`. The latter format is a
+ standard format for PyTorch Tensors. Default: False.
+ always_apply (bool): Indicates whether this transformation should be always applied. Default: True.
+ p (float): Probability of applying the transform. Default: 1.0.
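+
+    Example:
+        A minimal sketch (assumes `Compose` from this package, as in upstream Albumentations):
+
+            transform = Compose([ToTensorV2(transpose_mask=True)])
+            out = transform(image=image, mask=mask)  # out["image"] is a CHW torch.Tensor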
+ """
+
+ def __init__(self, transpose_mask=False, always_apply=True, p=1.0):
+ super(ToTensorV2, self).__init__(always_apply=always_apply, p=p)
+ self.transpose_mask = transpose_mask
+
+ @property
+ def targets(self):
+ return {"image": self.apply, "mask": self.apply_to_mask, "masks": self.apply_to_masks}
+
+ def apply(self, img, **params): # skipcq: PYL-W0613
+ if len(img.shape) not in [2, 3]:
+ raise ValueError("Albumentations only supports images in HW or HWC format")
+
+ if len(img.shape) == 2:
+ img = np.expand_dims(img, 2)
+
+ return torch.from_numpy(img.transpose(2, 0, 1))
+
+ def apply_to_mask(self, mask, **params): # skipcq: PYL-W0613
+ if self.transpose_mask and mask.ndim == 3:
+ mask = mask.transpose(2, 0, 1)
+ return torch.from_numpy(mask)
+
+ def apply_to_masks(self, masks, **params):
+ return [self.apply_to_mask(mask, **params) for mask in masks]
+
+ def get_transform_init_args_names(self):
+ return ("transpose_mask",)
+
+ def get_params_dependent_on_targets(self, params):
+ return {}
diff --git a/comfyui_controlnet_aux/src/custom_albumentations/random_utils.py b/comfyui_controlnet_aux/src/custom_albumentations/random_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b4d52c8a6a39cfcc070dd27e1647e774bf9e782
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_albumentations/random_utils.py
@@ -0,0 +1,96 @@
+# Use `Any` as the return type to avoid mypy problems with Union data types,
+# because numpy can return either a single number or an ndarray
+
+import random as py_random
+from typing import Any, Optional, Sequence, Type, Union
+
+import numpy as np
+
+from .core.transforms_interface import NumType
+
+IntNumType = Union[int, np.ndarray]
+Size = Union[int, Sequence[int]]
+
+
+def get_random_state() -> np.random.RandomState:
+ return np.random.RandomState(py_random.randint(0, (1 << 32) - 1))
+
+
+def uniform(
+ low: NumType = 0.0,
+ high: NumType = 1.0,
+ size: Optional[Size] = None,
+ random_state: Optional[np.random.RandomState] = None,
+) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.uniform(low, high, size)
+
+
+def rand(d0: NumType, d1: NumType, *more, random_state: Optional[np.random.RandomState] = None, **kwargs) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.rand(d0, d1, *more, **kwargs) # type: ignore
+
+
+def randn(d0: NumType, d1: NumType, *more, random_state: Optional[np.random.RandomState] = None, **kwargs) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.randn(d0, d1, *more, **kwargs) # type: ignore
+
+
+def normal(
+ loc: NumType = 0.0,
+ scale: NumType = 1.0,
+ size: Optional[Size] = None,
+ random_state: Optional[np.random.RandomState] = None,
+) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.normal(loc, scale, size)
+
+
+def poisson(
+ lam: NumType = 1.0, size: Optional[Size] = None, random_state: Optional[np.random.RandomState] = None
+) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.poisson(lam, size)
+
+
+def permutation(
+ x: Union[int, Sequence[float], np.ndarray], random_state: Optional[np.random.RandomState] = None
+) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.permutation(x)
+
+
+def randint(
+ low: IntNumType,
+ high: Optional[IntNumType] = None,
+ size: Optional[Size] = None,
+ dtype: Type = np.int32,
+ random_state: Optional[np.random.RandomState] = None,
+) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.randint(low, high, size, dtype)
+
+
+def random(size: Optional[NumType] = None, random_state: Optional[np.random.RandomState] = None) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.random(size) # type: ignore
+
+
+def choice(
+ a: NumType,
+ size: Optional[Size] = None,
+ replace: bool = True,
+ p: Optional[Union[Sequence[float], np.ndarray]] = None,
+ random_state: Optional[np.random.RandomState] = None,
+) -> Any:
+ if random_state is None:
+ random_state = get_random_state()
+ return random_state.choice(a, size, replace, p) # type: ignore
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e7a7f594ef441479257c788e4c0d6e08657fc8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/__init__.py
@@ -0,0 +1 @@
+#Dummy file ensuring this package will be recognized
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/__pycache__/__init__.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b02e08e82688c4ace312faad097f026d1bf96ce
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/__pycache__/util.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/__pycache__/util.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ca7ad28142ad21b2f94b3b328bee948a1556720
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/__pycache__/util.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fa9e2075ec3d1ee2520f513b7d8126c6213290f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/__init__.py
@@ -0,0 +1,66 @@
+from .network import UNet
+from .util import seg2img
+import torch
+import os
+import cv2
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate, custom_hf_download, BDS_MODEL_NAME
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from einops import rearrange
+from .anime_segmentation import AnimeSegmentation
+import numpy as np
+
+class AnimeFaceSegmentor:
+ def __init__(self, model, seg_model):
+ self.model = model
+ self.seg_model = seg_model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=BDS_MODEL_NAME, filename="UNet.pth", seg_filename="isnetis.ckpt"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder="Annotators")
+ seg_model_path = custom_hf_download("skytnt/anime-seg", seg_filename)
+
+ model = UNet()
+ ckpt = torch.load(model_path, map_location="cpu")
+ model.load_state_dict(ckpt)
+ model.eval()
+
+ seg_model = AnimeSegmentation(seg_model_path)
+ seg_model.net.eval()
+ return cls(model, seg_model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.seg_model.net.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", remove_background=True, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ with torch.no_grad():
+ if remove_background:
+ print(input_image.shape)
+                mask, input_image = self.seg_model(input_image, 0) #pass s=0: the image is already at detect resolution, so don't resize it again inside the model
+ image_feed = torch.from_numpy(input_image).float().to(self.device)
+ image_feed = rearrange(image_feed, 'h w c -> 1 c h w')
+ image_feed = image_feed / 255
+ seg = self.model(image_feed).squeeze(dim=0)
+ result = seg2img(seg.cpu().detach().numpy())
+
+ detected_map = HWC3(result)
+ detected_map = remove_pad(detected_map)
+ if remove_background:
+ mask = remove_pad(mask)
+ H, W, C = detected_map.shape
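+            # append the background mask as a fourth (alpha) channel of the detected map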
+            tmp = np.zeros([H, W, C + 1], dtype=np.uint8)
+ tmp[:,:,:C] = detected_map
+ tmp[:,:,3:] = mask
+ detected_map = tmp
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map[..., :3])
+
+ return detected_map
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/anime_segmentation.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/anime_segmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..874fba63d26bd897e008e1ec29af1131bb6d69aa
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/anime_segmentation.py
@@ -0,0 +1,58 @@
+#https://github.com/SkyTNT/anime-segmentation/tree/main
+#Only adapt isnet_is (https://huggingface.co/skytnt/anime-seg/blob/main/isnetis.ckpt)
+import torch.nn as nn
+import torch
+from .isnet import ISNetDIS
+import numpy as np
+import cv2
+from comfy.model_management import get_torch_device
+DEVICE = get_torch_device()
+
+class AnimeSegmentation:
+ def __init__(self, ckpt_path):
+        super().__init__()
+ sd = torch.load(ckpt_path, map_location="cpu")
+ self.net = ISNetDIS()
+ #gt_encoder isn't used during inference
+ self.net.load_state_dict({k.replace("net.", ''):v for k, v in sd.items() if k.startswith("net.")})
+ self.net = self.net.to(DEVICE)
+ self.net.eval()
+
+ def get_mask(self, input_img, s=640):
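+        # s is the square working resolution for the net; s == 0 skips resizing and runs on the input as-is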
+ input_img = (input_img / 255).astype(np.float32)
+ if s == 0:
+ img_input = np.transpose(input_img, (2, 0, 1))
+ img_input = img_input[np.newaxis, :]
+ tmpImg = torch.from_numpy(img_input).float().to(DEVICE)
+ with torch.no_grad():
+ pred = self.net(tmpImg)[0][0].sigmoid() #https://github.com/SkyTNT/anime-segmentation/blob/main/train.py#L92C20-L92C47
+ pred = pred.cpu().numpy()[0]
+ pred = np.transpose(pred, (1, 2, 0))
+ #pred = pred[:, :, np.newaxis]
+ return pred
+
+ h, w = h0, w0 = input_img.shape[:-1]
+ h, w = (s, int(s * w / h)) if h > w else (int(s * h / w), s)
+ ph, pw = s - h, s - w
+ img_input = np.zeros([s, s, 3], dtype=np.float32)
+ img_input[ph // 2:ph // 2 + h, pw // 2:pw // 2 + w] = cv2.resize(input_img, (w, h))
+ img_input = np.transpose(img_input, (2, 0, 1))
+ img_input = img_input[np.newaxis, :]
+ tmpImg = torch.from_numpy(img_input).float().to(DEVICE)
+ with torch.no_grad():
+ pred = self.net(tmpImg)[0][0].sigmoid() #https://github.com/SkyTNT/anime-segmentation/blob/main/train.py#L92C20-L92C47
+ pred = pred.cpu().numpy()[0]
+ pred = np.transpose(pred, (1, 2, 0))
+ pred = pred[ph // 2:ph // 2 + h, pw // 2:pw // 2 + w]
+ #pred = cv2.resize(pred, (w0, h0))[:, :, np.newaxis]
+ pred = cv2.resize(pred, (w0, h0))
+ return pred
+
+ def __call__(self, np_img, img_size):
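+        # returns the 8-bit foreground mask and the input composited onto a white background with that mask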
+ mask = self.get_mask(np_img, int(img_size))
+ np_img = (mask * np_img + 255 * (1 - mask)).astype(np.uint8)
+ mask = (mask * 255).astype(np.uint8)
+ #np_img = np.concatenate([np_img, mask], axis=2, dtype=np.uint8)
+ #mask = mask.repeat(3, axis=2)
+ return mask, np_img
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/isnet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/isnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0aecabbcc186403d3a221d2d40efa652d230d658
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/isnet.py
@@ -0,0 +1,619 @@
+# Codes are borrowed from
+# https://github.com/xuebinqin/DIS/blob/main/IS-Net/models/isnet.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import models
+
+bce_loss = nn.BCEWithLogitsLoss(reduction="mean")
+
+
+def muti_loss_fusion(preds, target):
+ loss0 = 0.0
+ loss = 0.0
+
+ for i in range(0, len(preds)):
+ if preds[i].shape[2] != target.shape[2] or preds[i].shape[3] != target.shape[3]:
+ tmp_target = F.interpolate(
+ target, size=preds[i].size()[2:], mode="bilinear", align_corners=True
+ )
+ loss = loss + bce_loss(preds[i], tmp_target)
+ else:
+ loss = loss + bce_loss(preds[i], target)
+ if i == 0:
+ loss0 = loss
+ return loss0, loss
+
+
+fea_loss = nn.MSELoss(reduction="mean")
+kl_loss = nn.KLDivLoss(reduction="mean")
+l1_loss = nn.L1Loss(reduction="mean")
+smooth_l1_loss = nn.SmoothL1Loss(reduction="mean")
+
+
+def muti_loss_fusion_kl(preds, target, dfs, fs, mode="MSE"):
+ loss0 = 0.0
+ loss = 0.0
+
+ for i in range(0, len(preds)):
+ if preds[i].shape[2] != target.shape[2] or preds[i].shape[3] != target.shape[3]:
+ tmp_target = F.interpolate(
+ target, size=preds[i].size()[2:], mode="bilinear", align_corners=True
+ )
+ loss = loss + bce_loss(preds[i], tmp_target)
+ else:
+ loss = loss + bce_loss(preds[i], target)
+ if i == 0:
+ loss0 = loss
+
+ for i in range(0, len(dfs)):
+ df = dfs[i]
+ fs_i = fs[i]
+ if mode == "MSE":
+ loss = loss + fea_loss(
+ df, fs_i
+ ) ### add the mse loss of features as additional constraints
+ elif mode == "KL":
+ loss = loss + kl_loss(F.log_softmax(df, dim=1), F.softmax(fs_i, dim=1))
+ elif mode == "MAE":
+ loss = loss + l1_loss(df, fs_i)
+ elif mode == "SmoothL1":
+ loss = loss + smooth_l1_loss(df, fs_i)
+
+ return loss0, loss
+
+
+class REBNCONV(nn.Module):
+ def __init__(self, in_ch=3, out_ch=3, dirate=1, stride=1):
+ super(REBNCONV, self).__init__()
+
+ self.conv_s1 = nn.Conv2d(
+ in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate, stride=stride
+ )
+ self.bn_s1 = nn.BatchNorm2d(out_ch)
+ self.relu_s1 = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ hx = x
+ xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))
+
+ return xout
+
+
+## upsample tensor 'src' to have the same spatial size with tensor 'tar'
+def _upsample_like(src, tar):
+ src = F.interpolate(src, size=tar.shape[2:], mode="bilinear", align_corners=False)
+
+ return src
+
+
+### RSU-7 ###
+class RSU7(nn.Module):
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3, img_size=512):
+ super(RSU7, self).__init__()
+
+ self.in_ch = in_ch
+ self.mid_ch = mid_ch
+ self.out_ch = out_ch
+
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) ## 1 -> 1/2
+
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+ self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+ self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+
+ hx = x
+ hxin = self.rebnconvin(hx)
+
+ hx1 = self.rebnconv1(hxin)
+ hx = self.pool1(hx1)
+
+ hx2 = self.rebnconv2(hx)
+ hx = self.pool2(hx2)
+
+ hx3 = self.rebnconv3(hx)
+ hx = self.pool3(hx3)
+
+ hx4 = self.rebnconv4(hx)
+ hx = self.pool4(hx4)
+
+ hx5 = self.rebnconv5(hx)
+ hx = self.pool5(hx5)
+
+ hx6 = self.rebnconv6(hx)
+
+ hx7 = self.rebnconv7(hx6)
+
+ hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1))
+ hx6dup = _upsample_like(hx6d, hx5)
+
+ hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1))
+ hx5dup = _upsample_like(hx5d, hx4)
+
+ hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
+ hx4dup = _upsample_like(hx4d, hx3)
+
+ hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
+ hx3dup = _upsample_like(hx3d, hx2)
+
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+ hx2dup = _upsample_like(hx2d, hx1)
+
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+ return hx1d + hxin
+
+
+### RSU-6 ###
+class RSU6(nn.Module):
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+ super(RSU6, self).__init__()
+
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+ self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+ self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+ def forward(self, x):
+ hx = x
+
+ hxin = self.rebnconvin(hx)
+
+ hx1 = self.rebnconv1(hxin)
+ hx = self.pool1(hx1)
+
+ hx2 = self.rebnconv2(hx)
+ hx = self.pool2(hx2)
+
+ hx3 = self.rebnconv3(hx)
+ hx = self.pool3(hx3)
+
+ hx4 = self.rebnconv4(hx)
+ hx = self.pool4(hx4)
+
+ hx5 = self.rebnconv5(hx)
+
+ hx6 = self.rebnconv6(hx5)
+
+ hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1))
+ hx5dup = _upsample_like(hx5d, hx4)
+
+ hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
+ hx4dup = _upsample_like(hx4d, hx3)
+
+ hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
+ hx3dup = _upsample_like(hx3d, hx2)
+
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+ hx2dup = _upsample_like(hx2d, hx1)
+
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+ return hx1d + hxin
+
+
+### RSU-5 ###
+class RSU5(nn.Module):
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+ super(RSU5, self).__init__()
+
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+ self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+ self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+ def forward(self, x):
+ hx = x
+
+ hxin = self.rebnconvin(hx)
+
+ hx1 = self.rebnconv1(hxin)
+ hx = self.pool1(hx1)
+
+ hx2 = self.rebnconv2(hx)
+ hx = self.pool2(hx2)
+
+ hx3 = self.rebnconv3(hx)
+ hx = self.pool3(hx3)
+
+ hx4 = self.rebnconv4(hx)
+
+ hx5 = self.rebnconv5(hx4)
+
+ hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1))
+ hx4dup = _upsample_like(hx4d, hx3)
+
+ hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
+ hx3dup = _upsample_like(hx3d, hx2)
+
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+ hx2dup = _upsample_like(hx2d, hx1)
+
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+ return hx1d + hxin
+
+
+### RSU-4 ###
+class RSU4(nn.Module):
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+ super(RSU4, self).__init__()
+
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+ self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+ self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+ def forward(self, x):
+ hx = x
+
+ hxin = self.rebnconvin(hx)
+
+ hx1 = self.rebnconv1(hxin)
+ hx = self.pool1(hx1)
+
+ hx2 = self.rebnconv2(hx)
+ hx = self.pool2(hx2)
+
+ hx3 = self.rebnconv3(hx)
+
+ hx4 = self.rebnconv4(hx3)
+
+ hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
+ hx3dup = _upsample_like(hx3d, hx2)
+
+ hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+ hx2dup = _upsample_like(hx2d, hx1)
+
+ hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+ return hx1d + hxin
+
+
+### RSU-4F ###
+class RSU4F(nn.Module):
+ def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+ super(RSU4F, self).__init__()
+
+ self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+ self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+ self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2)
+ self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4)
+
+ self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8)
+
+ self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4)
+ self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2)
+ self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+ def forward(self, x):
+ hx = x
+
+ hxin = self.rebnconvin(hx)
+
+ hx1 = self.rebnconv1(hxin)
+ hx2 = self.rebnconv2(hx1)
+ hx3 = self.rebnconv3(hx2)
+
+ hx4 = self.rebnconv4(hx3)
+
+ hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
+ hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1))
+ hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1))
+
+ return hx1d + hxin
+
+
+class myrebnconv(nn.Module):
+ def __init__(
+ self,
+ in_ch=3,
+ out_ch=1,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ dilation=1,
+ groups=1,
+ ):
+ super(myrebnconv, self).__init__()
+
+ self.conv = nn.Conv2d(
+ in_ch,
+ out_ch,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ )
+ self.bn = nn.BatchNorm2d(out_ch)
+ self.rl = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ return self.rl(self.bn(self.conv(x)))
+
+
+class ISNetGTEncoder(nn.Module):
+ def __init__(self, in_ch=1, out_ch=1):
+ super(ISNetGTEncoder, self).__init__()
+
+ self.conv_in = myrebnconv(
+ in_ch, 16, 3, stride=2, padding=1
+ ) # nn.Conv2d(in_ch,64,3,stride=2,padding=1)
+
+ self.stage1 = RSU7(16, 16, 64)
+ self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage2 = RSU6(64, 16, 64)
+ self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage3 = RSU5(64, 32, 128)
+ self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage4 = RSU4(128, 32, 256)
+ self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage5 = RSU4F(256, 64, 512)
+ self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage6 = RSU4F(512, 64, 512)
+
+ self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
+ self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
+ self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
+ self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
+ self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
+ self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
+
+ @staticmethod
+ def compute_loss(args):
+ preds, targets = args
+ return muti_loss_fusion(preds, targets)
+
+ def forward(self, x):
+ hx = x
+
+ hxin = self.conv_in(hx)
+ # hx = self.pool_in(hxin)
+
+ # stage 1
+ hx1 = self.stage1(hxin)
+ hx = self.pool12(hx1)
+
+ # stage 2
+ hx2 = self.stage2(hx)
+ hx = self.pool23(hx2)
+
+ # stage 3
+ hx3 = self.stage3(hx)
+ hx = self.pool34(hx3)
+
+ # stage 4
+ hx4 = self.stage4(hx)
+ hx = self.pool45(hx4)
+
+ # stage 5
+ hx5 = self.stage5(hx)
+ hx = self.pool56(hx5)
+
+ # stage 6
+ hx6 = self.stage6(hx)
+
+ # side output
+ d1 = self.side1(hx1)
+ d1 = _upsample_like(d1, x)
+
+ d2 = self.side2(hx2)
+ d2 = _upsample_like(d2, x)
+
+ d3 = self.side3(hx3)
+ d3 = _upsample_like(d3, x)
+
+ d4 = self.side4(hx4)
+ d4 = _upsample_like(d4, x)
+
+ d5 = self.side5(hx5)
+ d5 = _upsample_like(d5, x)
+
+ d6 = self.side6(hx6)
+ d6 = _upsample_like(d6, x)
+
+ # d0 = self.outconv(torch.cat((d1,d2,d3,d4,d5,d6),1))
+
+ # return [torch.sigmoid(d1), torch.sigmoid(d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(d5), torch.sigmoid(d6)], [hx1, hx2, hx3, hx4, hx5, hx6]
+ return [d1, d2, d3, d4, d5, d6], [hx1, hx2, hx3, hx4, hx5, hx6]
+
+
+class ISNetDIS(nn.Module):
+ def __init__(self, in_ch=3, out_ch=1):
+ super(ISNetDIS, self).__init__()
+
+ self.conv_in = nn.Conv2d(in_ch, 64, 3, stride=2, padding=1)
+ self.pool_in = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage1 = RSU7(64, 32, 64)
+ self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage2 = RSU6(64, 32, 128)
+ self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage3 = RSU5(128, 64, 256)
+ self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage4 = RSU4(256, 128, 512)
+ self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage5 = RSU4F(512, 256, 512)
+ self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+ self.stage6 = RSU4F(512, 256, 512)
+
+ # decoder
+ self.stage5d = RSU4F(1024, 256, 512)
+ self.stage4d = RSU4(1024, 128, 256)
+ self.stage3d = RSU5(512, 64, 128)
+ self.stage2d = RSU6(256, 32, 64)
+ self.stage1d = RSU7(128, 16, 64)
+
+ self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
+ self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
+ self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
+ self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
+ self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
+ self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
+
+ # self.outconv = nn.Conv2d(6*out_ch,out_ch,1)
+
+ @staticmethod
+ def compute_loss_kl(preds, targets, dfs, fs, mode="MSE"):
+ return muti_loss_fusion_kl(preds, targets, dfs, fs, mode=mode)
+
+ @staticmethod
+ def compute_loss(args):
+ if len(args) == 3:
+ ds, dfs, labels = args
+ return muti_loss_fusion(ds, labels)
+ else:
+ ds, dfs, labels, fs = args
+ return muti_loss_fusion_kl(ds, labels, dfs, fs, mode="MSE")
+
+ def forward(self, x):
+ hx = x
+
+ hxin = self.conv_in(hx)
+ hx = self.pool_in(hxin)
+
+ # stage 1
+ hx1 = self.stage1(hxin)
+ hx = self.pool12(hx1)
+
+ # stage 2
+ hx2 = self.stage2(hx)
+ hx = self.pool23(hx2)
+
+ # stage 3
+ hx3 = self.stage3(hx)
+ hx = self.pool34(hx3)
+
+ # stage 4
+ hx4 = self.stage4(hx)
+ hx = self.pool45(hx4)
+
+ # stage 5
+ hx5 = self.stage5(hx)
+ hx = self.pool56(hx5)
+
+ # stage 6
+ hx6 = self.stage6(hx)
+ hx6up = _upsample_like(hx6, hx5)
+
+ # -------------------- decoder --------------------
+ hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
+ hx5dup = _upsample_like(hx5d, hx4)
+
+ hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
+ hx4dup = _upsample_like(hx4d, hx3)
+
+ hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
+ hx3dup = _upsample_like(hx3d, hx2)
+
+ hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
+ hx2dup = _upsample_like(hx2d, hx1)
+
+ hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))
+
+ # side output
+ d1 = self.side1(hx1d)
+ d1 = _upsample_like(d1, x)
+
+ d2 = self.side2(hx2d)
+ d2 = _upsample_like(d2, x)
+
+ d3 = self.side3(hx3d)
+ d3 = _upsample_like(d3, x)
+
+ d4 = self.side4(hx4d)
+ d4 = _upsample_like(d4, x)
+
+ d5 = self.side5(hx5d)
+ d5 = _upsample_like(d5, x)
+
+ d6 = self.side6(hx6)
+ d6 = _upsample_like(d6, x)
+
+ # d0 = self.outconv(torch.cat((d1,d2,d3,d4,d5,d6),1))
+
+ # return [torch.sigmoid(d1), torch.sigmoid(d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(d5), torch.sigmoid(d6)], [hx1d, hx2d, hx3d, hx4d, hx5d, hx6]
+ return [d1, d2, d3, d4, d5, d6], [hx1d, hx2d, hx3d, hx4d, hx5d, hx6]
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/network.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/network.py
new file mode 100644
index 0000000000000000000000000000000000000000..58254da4f2ed003f5984784c9c9a45318fc4a19d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/network.py
@@ -0,0 +1,100 @@
+#https://github.com/siyeong0/Anime-Face-Segmentation/blob/main/network.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+
+from custom_controlnet_aux.util import custom_torch_download
+
+class UNet(nn.Module):
+ def __init__(self):
+ super(UNet, self).__init__()
+ self.NUM_SEG_CLASSES = 7 # Background, hair, face, eye, mouth, skin, clothes
+
+ mobilenet_v2 = torchvision.models.mobilenet_v2(pretrained=False)
+ mobilenet_v2.load_state_dict(torch.load(custom_torch_download(filename="mobilenet_v2-b0353104.pth")), strict=True)
+ mob_blocks = mobilenet_v2.features
+
+ # Encoder
+ self.en_block0 = nn.Sequential( # in_ch=3 out_ch=16
+ mob_blocks[0],
+ mob_blocks[1]
+ )
+ self.en_block1 = nn.Sequential( # in_ch=16 out_ch=24
+ mob_blocks[2],
+ mob_blocks[3],
+ )
+ self.en_block2 = nn.Sequential( # in_ch=24 out_ch=32
+ mob_blocks[4],
+ mob_blocks[5],
+ mob_blocks[6],
+ )
+ self.en_block3 = nn.Sequential( # in_ch=32 out_ch=96
+ mob_blocks[7],
+ mob_blocks[8],
+ mob_blocks[9],
+ mob_blocks[10],
+ mob_blocks[11],
+ mob_blocks[12],
+ mob_blocks[13],
+ )
+ self.en_block4 = nn.Sequential( # in_ch=96 out_ch=160
+ mob_blocks[14],
+ mob_blocks[15],
+ mob_blocks[16],
+ )
+
+ # Decoder
+ self.de_block4 = nn.Sequential( # in_ch=160 out_ch=96
+ nn.UpsamplingNearest2d(scale_factor=2),
+ nn.Conv2d(160, 96, kernel_size=3, padding=1),
+ nn.InstanceNorm2d(96),
+ nn.LeakyReLU(0.1),
+ nn.Dropout(p=0.2)
+ )
+ self.de_block3 = nn.Sequential( # in_ch=96x2 out_ch=32
+ nn.UpsamplingNearest2d(scale_factor=2),
+ nn.Conv2d(96*2, 32, kernel_size=3, padding=1),
+ nn.InstanceNorm2d(32),
+ nn.LeakyReLU(0.1),
+ nn.Dropout(p=0.2)
+ )
+ self.de_block2 = nn.Sequential( # in_ch=32x2 out_ch=24
+ nn.UpsamplingNearest2d(scale_factor=2),
+ nn.Conv2d(32*2, 24, kernel_size=3, padding=1),
+ nn.InstanceNorm2d(24),
+ nn.LeakyReLU(0.1),
+ nn.Dropout(p=0.2)
+ )
+ self.de_block1 = nn.Sequential( # in_ch=24x2 out_ch=16
+ nn.UpsamplingNearest2d(scale_factor=2),
+ nn.Conv2d(24*2, 16, kernel_size=3, padding=1),
+ nn.InstanceNorm2d(16),
+ nn.LeakyReLU(0.1),
+ nn.Dropout(p=0.2)
+ )
+
+ self.de_block0 = nn.Sequential( # in_ch=16x2 out_ch=7
+ nn.UpsamplingNearest2d(scale_factor=2),
+ nn.Conv2d(16*2, self.NUM_SEG_CLASSES, kernel_size=3, padding=1),
+ nn.Softmax2d()
+ )
+
+ def forward(self, x):
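+        # U-Net style decoding: each decoder stage is concatenated with the matching encoder feature map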
+ e0 = self.en_block0(x)
+ e1 = self.en_block1(e0)
+ e2 = self.en_block2(e1)
+ e3 = self.en_block3(e2)
+ e4 = self.en_block4(e3)
+
+ d4 = self.de_block4(e4)
+ c4 = torch.cat((d4,e3),1)
+ d3 = self.de_block3(c4)
+ c3 = torch.cat((d3,e2),1)
+ d2 = self.de_block2(c3)
+        c2 = torch.cat((d2,e1),1)
+ d1 = self.de_block1(c2)
+ c1 = torch.cat((d1,e0),1)
+ y = self.de_block0(c1)
+
+ return y
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecbc8b8828e55d7234bf89365368995d4ae5b26b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/anime_face_segment/util.py
@@ -0,0 +1,40 @@
+#https://github.com/siyeong0/Anime-Face-Segmentation/blob/main/util.py
+#The color palette is changed according to https://github.com/Mikubill/sd-webui-controlnet/blob/91f67ddcc7bc47537a6285864abfc12590f46c3f/annotator/anime_face_segment/__init__.py
+import cv2 as cv
+import glob
+import numpy as np
+import os
+
+"""
+COLOR_BACKGROUND = (0,255,255)
+COLOR_HAIR = (255,0,0)
+COLOR_EYE = (0,0,255)
+COLOR_MOUTH = (255,255,255)
+COLOR_FACE = (0,255,0)
+COLOR_SKIN = (255,255,0)
+COLOR_CLOTHES = (255,0,255)
+"""
+COLOR_BACKGROUND = (255,255,0)
+COLOR_HAIR = (0,0,255)
+COLOR_EYE = (255,0,0)
+COLOR_MOUTH = (255,255,255)
+COLOR_FACE = (0,255,0)
+COLOR_SKIN = (0,255,255)
+COLOR_CLOTHES = (255,0,255)
+PALETTE = [COLOR_BACKGROUND,COLOR_HAIR,COLOR_EYE,COLOR_MOUTH,COLOR_FACE,COLOR_SKIN,COLOR_CLOTHES]
+
+def img2seg(path):
+ src = cv.imread(path)
+ src = src.reshape(-1, 3)
+ seg_list = []
+ for color in PALETTE:
+ seg_list.append(np.where(np.all(src==color, axis=1), 1.0, 0.0))
+ dst = np.stack(seg_list,axis=1).reshape(512,512,7)
+
+ return dst.astype(np.float32)
+
+def seg2img(src):
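+    # src holds (7, H, W) class scores; take the argmax class per pixel and map it to its palette color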
+ src = np.moveaxis(src,0,2)
+    dst = [[PALETTE[np.argmax(val)] for val in buf] for buf in src]
+
+ return np.array(dst).astype(np.uint8)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/binary/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/binary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba14aef66eb2c949fad96a5247069f941c5fa399
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/binary/__init__.py
@@ -0,0 +1,38 @@
+import warnings
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad
+
+class BinaryDetector:
+ def __call__(self, input_image=None, bin_threshold=0, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ if "img" in kwargs:
+ warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
+ input_image = kwargs.pop("img")
+
+ if input_image is None:
+ raise ValueError("input_image must be defined.")
+
+ if not isinstance(input_image, np.ndarray):
+ input_image = np.array(input_image, dtype=np.uint8)
+ output_type = output_type or "pil"
+ else:
+ output_type = output_type or "np"
+
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ img_gray = cv2.cvtColor(detected_map, cv2.COLOR_RGB2GRAY)
+ if bin_threshold == 0 or bin_threshold == 255:
+ # Otsu's threshold
+ otsu_threshold, img_bin = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+ print("Otsu threshold:", otsu_threshold)
+ else:
+ _, img_bin = cv2.threshold(img_gray, bin_threshold, 255, cv2.THRESH_BINARY_INV)
+
+ detected_map = cv2.cvtColor(img_bin, cv2.COLOR_GRAY2RGB)
+ detected_map = HWC3(remove_pad(255 - detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/canny/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/canny/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..17477aa13d03399e1af86855a0baf47625ddd579
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/canny/__init__.py
@@ -0,0 +1,17 @@
+import warnings
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import resize_image_with_pad, common_input_validate, HWC3
+
+class CannyDetector:
+ def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ detected_map = cv2.Canny(detected_map, low_threshold, high_threshold)
+ detected_map = HWC3(remove_pad(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
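+
+# A minimal usage sketch (illustrative only, not part of the upstream module; "input.png" is a placeholder path):
+#   canny = CannyDetector()
+#   edges = canny(Image.open("input.png"), low_threshold=100, high_threshold=200, output_type="pil")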
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/color/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/color/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a89e025397696f6b2317ddd3801765ccad958a4c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/color/__init__.py
@@ -0,0 +1,37 @@
+import cv2
+import warnings
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, safer_memory, common_input_validate
+
+def cv2_resize_shortest_edge(image, size):
+ h, w = image.shape[:2]
+ if h < w:
+ new_h = size
+ new_w = int(round(w / h * size))
+ else:
+ new_w = size
+ new_h = int(round(h / w * size))
+ resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
+ return resized_image
+
+def apply_color(img, res=512):
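+    # downscale to a 1/64-resolution grid, then nearest-neighbor upscale so every 64x64 block becomes one flat color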
+ img = cv2_resize_shortest_edge(img, res)
+ h, w = img.shape[:2]
+
+ input_img_color = cv2.resize(img, (w//64, h//64), interpolation=cv2.INTER_CUBIC)
+ input_img_color = cv2.resize(input_img_color, (w, h), interpolation=cv2.INTER_NEAREST)
+ return input_img_color
+
+#Color hint in the style of the Color T2I-Adapter: the grid is based on multiples of 64 and the resize methods are fixed.
+class ColorDetector:
+ def __call__(self, input_image=None, detect_resolution=512, output_type=None, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image = HWC3(input_image)
+ detected_map = HWC3(apply_color(input_image, detect_resolution))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/densepose/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/densepose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c94ba9a7124a979cc3582a42b1ee40710ffd2af1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/densepose/__init__.py
@@ -0,0 +1,66 @@
+import torchvision # Fix issue Unknown builtin op: torchvision::nms
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate, custom_hf_download, DENSEPOSE_MODEL_NAME
+from .densepose import DensePoseMaskedColormapResultsVisualizer, _extract_i_from_iuvarr, densepose_chart_predictor_output_to_result_with_confidences
+
+N_PART_LABELS = 24
+
+class DenseposeDetector:
+ def __init__(self, model):
+ self.dense_pose_estimation = model
+ self.device = "cpu"
+ self.result_visualizer = DensePoseMaskedColormapResultsVisualizer(
+ alpha=1,
+ data_extractor=_extract_i_from_iuvarr,
+ segm_extractor=_extract_i_from_iuvarr,
+ val_scale = 255.0 / N_PART_LABELS
+ )
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=DENSEPOSE_MODEL_NAME, filename="densepose_r50_fpn_dl.torchscript"):
+ torchscript_model_path = custom_hf_download(pretrained_model_or_path, filename)
+ densepose = torch.jit.load(torchscript_model_path, map_location="cpu")
+ return cls(densepose)
+
+ def to(self, device):
+ self.dense_pose_estimation.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", cmap="viridis", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ H, W = input_image.shape[:2]
+
+ hint_image_canvas = np.zeros([H, W], dtype=np.uint8)
+ hint_image_canvas = np.tile(hint_image_canvas[:, :, np.newaxis], [1, 1, 3])
+
+ input_image = rearrange(torch.from_numpy(input_image).to(self.device), 'h w c -> c h w')
+
+        pred_boxes, coarse_segm, fine_segm, u, v = self.dense_pose_estimation(input_image)
+
+        extractor = densepose_chart_predictor_output_to_result_with_confidences
+        densepose_results = [extractor(pred_boxes[i:i+1], coarse_segm[i:i+1], fine_segm[i:i+1], u[i:i+1], v[i:i+1]) for i in range(len(pred_boxes))]
+
+ if cmap=="viridis":
+ self.result_visualizer.mask_visualizer.cmap = cv2.COLORMAP_VIRIDIS
+ hint_image = self.result_visualizer.visualize(hint_image_canvas, densepose_results)
+ hint_image = cv2.cvtColor(hint_image, cv2.COLOR_BGR2RGB)
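+            # fill the untouched black background with viridis' darkest color, RGB (68, 1, 84)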
+ hint_image[:, :, 0][hint_image[:, :, 0] == 0] = 68
+ hint_image[:, :, 1][hint_image[:, :, 1] == 0] = 1
+ hint_image[:, :, 2][hint_image[:, :, 2] == 0] = 84
+ else:
+ self.result_visualizer.mask_visualizer.cmap = cv2.COLORMAP_PARULA
+ hint_image = self.result_visualizer.visualize(hint_image_canvas, densepose_results)
+ hint_image = cv2.cvtColor(hint_image, cv2.COLOR_BGR2RGB)
+
+ detected_map = remove_pad(HWC3(hint_image))
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+ return detected_map
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/densepose/densepose.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/densepose/densepose.py
new file mode 100644
index 0000000000000000000000000000000000000000..64244f9c9b6bfd614ef0fa742aeeef3b7a599495
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/densepose/densepose.py
@@ -0,0 +1,347 @@
+from typing import Tuple
+import math
+import numpy as np
+from enum import IntEnum
+from typing import List, Tuple, Union
+import torch
+from torch.nn import functional as F
+import logging
+import cv2
+
+Image = np.ndarray
+Boxes = torch.Tensor
+ImageSizeType = Tuple[int, int]
+_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray]
+IntTupleBox = Tuple[int, int, int, int]
+
+class BoxMode(IntEnum):
+ """
+ Enum of different ways to represent a box.
+ """
+
+ XYXY_ABS = 0
+ """
+ (x0, y0, x1, y1) in absolute floating points coordinates.
+ The coordinates in range [0, width or height].
+ """
+ XYWH_ABS = 1
+ """
+ (x0, y0, w, h) in absolute floating points coordinates.
+ """
+ XYXY_REL = 2
+ """
+ Not yet supported!
+ (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image.
+ """
+ XYWH_REL = 3
+ """
+ Not yet supported!
+ (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image.
+ """
+ XYWHA_ABS = 4
+ """
+ (xc, yc, w, h, a) in absolute floating points coordinates.
+ (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw.
+ """
+
+ @staticmethod
+ def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType:
+ """
+ Args:
+ box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5
+ from_mode, to_mode (BoxMode)
+
+ Returns:
+ The converted box of the same type.
+ """
+ if from_mode == to_mode:
+ return box
+
+ original_type = type(box)
+ is_numpy = isinstance(box, np.ndarray)
+ single_box = isinstance(box, (list, tuple))
+ if single_box:
+ assert len(box) == 4 or len(box) == 5, (
+ "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor,"
+ " where k == 4 or 5"
+ )
+ arr = torch.tensor(box)[None, :]
+ else:
+ # avoid modifying the input box
+ if is_numpy:
+ arr = torch.from_numpy(np.asarray(box)).clone()
+ else:
+ arr = box.clone()
+
+ assert to_mode not in [BoxMode.XYXY_REL, BoxMode.XYWH_REL] and from_mode not in [
+ BoxMode.XYXY_REL,
+ BoxMode.XYWH_REL,
+ ], "Relative mode not yet supported!"
+
+ if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS:
+ assert (
+ arr.shape[-1] == 5
+ ), "The last dimension of input shape must be 5 for XYWHA format"
+ original_dtype = arr.dtype
+ arr = arr.double()
+
+ w = arr[:, 2]
+ h = arr[:, 3]
+ a = arr[:, 4]
+ c = torch.abs(torch.cos(a * math.pi / 180.0))
+ s = torch.abs(torch.sin(a * math.pi / 180.0))
+ # This basically computes the horizontal bounding rectangle of the rotated box
+ new_w = c * w + s * h
+ new_h = c * h + s * w
+
+ # convert center to top-left corner
+ arr[:, 0] -= new_w / 2.0
+ arr[:, 1] -= new_h / 2.0
+ # bottom-right corner
+ arr[:, 2] = arr[:, 0] + new_w
+ arr[:, 3] = arr[:, 1] + new_h
+
+ arr = arr[:, :4].to(dtype=original_dtype)
+ elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS:
+ original_dtype = arr.dtype
+ arr = arr.double()
+ arr[:, 0] += arr[:, 2] / 2.0
+ arr[:, 1] += arr[:, 3] / 2.0
+ angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype)
+ arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype)
+ else:
+ if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS:
+ arr[:, 2] += arr[:, 0]
+ arr[:, 3] += arr[:, 1]
+ elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS:
+ arr[:, 2] -= arr[:, 0]
+ arr[:, 3] -= arr[:, 1]
+ else:
+ raise NotImplementedError(
+ "Conversion from BoxMode {} to {} is not supported yet".format(
+ from_mode, to_mode
+ )
+ )
+
+ if single_box:
+ return original_type(arr.flatten().tolist())
+ if is_numpy:
+ return arr.numpy()
+ else:
+ return arr
+
+class MatrixVisualizer:
+ """
+ Base visualizer for matrix data
+ """
+
+ def __init__(
+ self,
+ inplace=True,
+ cmap=cv2.COLORMAP_PARULA,
+ val_scale=1.0,
+ alpha=0.7,
+ interp_method_matrix=cv2.INTER_LINEAR,
+ interp_method_mask=cv2.INTER_NEAREST,
+ ):
+ self.inplace = inplace
+ self.cmap = cmap
+ self.val_scale = val_scale
+ self.alpha = alpha
+ self.interp_method_matrix = interp_method_matrix
+ self.interp_method_mask = interp_method_mask
+
+ def visualize(self, image_bgr, mask, matrix, bbox_xywh):
+ self._check_image(image_bgr)
+ self._check_mask_matrix(mask, matrix)
+ if self.inplace:
+ image_target_bgr = image_bgr
+ else:
+ image_target_bgr = image_bgr * 0
+ x, y, w, h = [int(v) for v in bbox_xywh]
+ if w <= 0 or h <= 0:
+ return image_bgr
+ mask, matrix = self._resize(mask, matrix, w, h)
+ mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
+ matrix_scaled = matrix.astype(np.float32) * self.val_scale
+ _EPSILON = 1e-6
+ if np.any(matrix_scaled > 255 + _EPSILON):
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
+ )
+ matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
+ matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
+ matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
+ image_target_bgr[y : y + h, x : x + w, :] = (
+ image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
+ )
+ return image_target_bgr.astype(np.uint8)
+
+ def _resize(self, mask, matrix, w, h):
+        if (w != mask.shape[1]) or (h != mask.shape[0]):
+            mask = cv2.resize(mask, (w, h), interpolation=self.interp_method_mask)
+        if (w != matrix.shape[1]) or (h != matrix.shape[0]):
+            matrix = cv2.resize(matrix, (w, h), interpolation=self.interp_method_matrix)
+ return mask, matrix
+
+ def _check_image(self, image_rgb):
+ assert len(image_rgb.shape) == 3
+ assert image_rgb.shape[2] == 3
+ assert image_rgb.dtype == np.uint8
+
+ def _check_mask_matrix(self, mask, matrix):
+ assert len(matrix.shape) == 2
+ assert len(mask.shape) == 2
+ assert mask.dtype == np.uint8
+
+class DensePoseResultsVisualizer:
+ def visualize(
+ self,
+ image_bgr: Image,
+ results,
+ ) -> Image:
+ context = self.create_visualization_context(image_bgr)
+ for i, result in enumerate(results):
+ boxes_xywh, labels, uv = result
+ iuv_array = torch.cat(
+ (labels[None].type(torch.float32), uv * 255.0)
+ ).type(torch.uint8)
+ self.visualize_iuv_arr(context, iuv_array.cpu().numpy(), boxes_xywh)
+ image_bgr = self.context_to_image_bgr(context)
+ return image_bgr
+
+ def create_visualization_context(self, image_bgr: Image):
+ return image_bgr
+
+ def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
+ pass
+
+ def context_to_image_bgr(self, context):
+ return context
+
+ def get_image_bgr_from_context(self, context):
+ return context
+
+class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer):
+ def __init__(
+ self,
+ data_extractor,
+ segm_extractor,
+ inplace=True,
+ cmap=cv2.COLORMAP_PARULA,
+ alpha=0.7,
+ val_scale=1.0,
+ **kwargs,
+ ):
+ self.mask_visualizer = MatrixVisualizer(
+ inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
+ )
+ self.data_extractor = data_extractor
+ self.segm_extractor = segm_extractor
+
+ def context_to_image_bgr(self, context):
+ return context
+
+ def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
+ image_bgr = self.get_image_bgr_from_context(context)
+ matrix = self.data_extractor(iuv_arr)
+ segm = self.segm_extractor(iuv_arr)
+ mask = np.zeros(matrix.shape, dtype=np.uint8)
+ mask[segm > 0] = 1
+ image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
+
+
+def _extract_i_from_iuvarr(iuv_arr):
+ return iuv_arr[0, :, :]
+
+
+def _extract_u_from_iuvarr(iuv_arr):
+ return iuv_arr[1, :, :]
+
+
+def _extract_v_from_iuvarr(iuv_arr):
+ return iuv_arr[2, :, :]
+
+def make_int_box(box: torch.Tensor) -> IntTupleBox:
+ int_box = [0, 0, 0, 0]
+ int_box[0], int_box[1], int_box[2], int_box[3] = tuple(box.long().tolist())
+ return int_box[0], int_box[1], int_box[2], int_box[3]
+
+def densepose_chart_predictor_output_to_result_with_confidences(
+ boxes: Boxes,
+ coarse_segm,
+ fine_segm,
+ u, v
+
+):
+ boxes_xyxy_abs = boxes.clone()
+ boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ box_xywh = make_int_box(boxes_xywh_abs[0])
+
+ labels = resample_fine_and_coarse_segm_tensors_to_bbox(fine_segm, coarse_segm, box_xywh).squeeze(0)
+ uv = resample_uv_tensors_to_bbox(u, v, labels, box_xywh)
+ confidences = []
+ return box_xywh, labels, uv
+
+def resample_fine_and_coarse_segm_tensors_to_bbox(
+ fine_segm: torch.Tensor, coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox
+):
+ """
+ Resample fine and coarse segmentation tensors to the given
+ bounding box and derive labels for each pixel of the bounding box
+
+ Args:
+ fine_segm: float tensor of shape [1, C, Hout, Wout]
+ coarse_segm: float tensor of shape [1, K, Hout, Wout]
+ box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
+ corner coordinates, width (W) and height (H)
+ Return:
+ Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
+ """
+ x, y, w, h = box_xywh_abs
+ w = max(int(w), 1)
+ h = max(int(h), 1)
+ # coarse segmentation
+ coarse_segm_bbox = F.interpolate(
+ coarse_segm,
+ (h, w),
+ mode="bilinear",
+ align_corners=False,
+ ).argmax(dim=1)
+ # combined coarse and fine segmentation
+ labels = (
+ F.interpolate(fine_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+ * (coarse_segm_bbox > 0).long()
+ )
+ return labels
+
+def resample_uv_tensors_to_bbox(
+ u: torch.Tensor,
+ v: torch.Tensor,
+ labels: torch.Tensor,
+ box_xywh_abs: IntTupleBox,
+) -> torch.Tensor:
+ """
+ Resamples U and V coordinate estimates for the given bounding box
+
+ Args:
+ u (tensor [1, C, H, W] of float): U coordinates
+ v (tensor [1, C, H, W] of float): V coordinates
+ labels (tensor [H, W] of long): labels obtained by resampling segmentation
+ outputs for the given bounding box
+ box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
+ Return:
+ Resampled U and V coordinates - a tensor [2, H, W] of float
+ """
+ x, y, w, h = box_xywh_abs
+ w = max(int(w), 1)
+ h = max(int(h), 1)
+ u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False)
+ v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False)
+ uv = torch.zeros([2, h, w], dtype=torch.float32, device=u.device)
+ for part_id in range(1, u_bbox.size(1)):
+ uv[0][labels == part_id] = u_bbox[0, part_id][labels == part_id]
+ uv[1][labels == part_id] = v_bbox[0, part_id][labels == part_id]
+ return uv
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3b3bc6f983502e26cea8d72537a0e7f262d2874
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/__init__.py
@@ -0,0 +1,71 @@
+import numpy as np
+import torch
+from einops import repeat
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, DEPTH_ANYTHING_MODEL_NAME
+from custom_controlnet_aux.depth_anything.depth_anything.dpt import DPT_DINOv2
+from custom_controlnet_aux.depth_anything.depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+from torchvision.transforms import Compose
+import cv2
+import torch.nn.functional as F
+
+transform = Compose([
+ Resize(
+ width=518,
+ height=518,
+ resize_target=False,
+ keep_aspect_ratio=True,
+ ensure_multiple_of=14,
+ resize_method='lower_bound',
+ image_interpolation_method=cv2.INTER_CUBIC,
+ ),
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+ PrepareForNet(),
+])
+
+#https://huggingface.co/LiheYoung/depth_anything_vitl14/raw/main/config.json
+DPT_CONFIGS = {
+ "depth_anything_vitl14.pth": {"encoder": "vitl", "features": 256, "out_channels": [256, 512, 1024, 1024], "use_bn": False, "use_clstoken": False},
+ "depth_anything_vitb14.pth": {"encoder": "vitb", "features": 128, "out_channels": [96, 192, 384, 768], "use_bn": False, "use_clstoken": False},
+ "depth_anything_vits14.pth": {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384], "use_bn": False, "use_clstoken": False}
+}
+
+class DepthAnythingDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=DEPTH_ANYTHING_MODEL_NAME, filename="depth_anything_vitl14.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder="checkpoints", repo_type="space")
+ model = DPT_DINOv2(**DPT_CONFIGS[filename], localhub=True)
+ model.load_state_dict(torch.load(model_path, map_location="cpu"))
+ model.eval()
+
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ t, remove_pad = resize_image_with_pad(np.zeros_like(input_image), detect_resolution, upscale_method)
+ t = remove_pad(t)
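+        # the dummy resize above only recovers the padded target height/width; the image itself is preprocessed by transform below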
+
+ h, w = t.shape[:2]
+ h, w = int(h), int(w)
+ image = transform({'image': input_image / 255.})['image']
+ image = torch.from_numpy(image).unsqueeze(0).to(self.device)
+
+ with torch.no_grad():
+ depth = self.model(image)
+ depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
+ depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+
+ detected_map = repeat(depth, "h w -> h w 3").cpu().numpy().astype(np.uint8)
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/blocks.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..a758aa2984515abf440c304ba005f01f391480b9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/blocks.py
@@ -0,0 +1,153 @@
+import torch.nn as nn
+
+
+def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+ scratch = nn.Module()
+
+ out_shape1 = out_shape
+ out_shape2 = out_shape
+ out_shape3 = out_shape
+ if len(in_shape) >= 4:
+ out_shape4 = out_shape
+
+ if expand:
+ out_shape1 = out_shape
+ out_shape2 = out_shape*2
+ out_shape3 = out_shape*4
+ if len(in_shape) >= 4:
+ out_shape4 = out_shape*8
+
+ scratch.layer1_rn = nn.Conv2d(
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+ scratch.layer2_rn = nn.Conv2d(
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+ scratch.layer3_rn = nn.Conv2d(
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+ if len(in_shape) >= 4:
+ scratch.layer4_rn = nn.Conv2d(
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+
+ return scratch
+
+
+class ResidualConvUnit(nn.Module):
+ """Residual convolution module.
+ """
+
+ def __init__(self, features, activation, bn):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.bn = bn
+
+ self.groups=1
+
+ self.conv1 = nn.Conv2d(
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+ )
+
+ self.conv2 = nn.Conv2d(
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+ )
+
+ if self.bn==True:
+ self.bn1 = nn.BatchNorm2d(features)
+ self.bn2 = nn.BatchNorm2d(features)
+
+ self.activation = activation
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ """Forward pass.
+
+ Args:
+ x (tensor): input
+
+ Returns:
+ tensor: output
+ """
+
+ out = self.activation(x)
+ out = self.conv1(out)
+ if self.bn==True:
+ out = self.bn1(out)
+
+ out = self.activation(out)
+ out = self.conv2(out)
+ if self.bn==True:
+ out = self.bn2(out)
+
+ if self.groups > 1:
+ out = self.conv_merge(out)
+
+ return self.skip_add.add(out, x)
+
+
+class FeatureFusionBlock(nn.Module):
+ """Feature fusion block.
+ """
+
+ def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock, self).__init__()
+
+ self.deconv = deconv
+ self.align_corners = align_corners
+
+ self.groups=1
+
+ self.expand = expand
+ out_features = features
+ if self.expand==True:
+ out_features = features//2
+
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ self.size=size
+
+ def forward(self, *xs, size=None):
+ """Forward pass.
+
+ Returns:
+ tensor: output
+ """
+ output = xs[0]
+
+ if len(xs) == 2:
+ res = self.resConfUnit1(xs[1])
+ output = self.skip_add.add(output, res)
+
+ output = self.resConfUnit2(output)
+
+ if (size is None) and (self.size is None):
+ modifier = {"scale_factor": 2}
+ elif size is None:
+ modifier = {"size": self.size}
+ else:
+ modifier = {"size": size}
+
+ output = nn.functional.interpolate(
+ output, **modifier, mode="bilinear", align_corners=self.align_corners
+ )
+
+ output = self.out_conv(output)
+
+ return output
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/dpt.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..4669eec785c66697f253de44b26ed77865d80399
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/dpt.py
@@ -0,0 +1,171 @@
+import torch
+import torch.nn as nn
+
+from .blocks import FeatureFusionBlock, _make_scratch
+import torch.nn.functional as F
+from custom_controlnet_aux.util import TORCHHUB_PATH
+
+
+def _make_fusion_block(features, use_bn, size = None):
+ return FeatureFusionBlock(
+ features,
+ nn.ReLU(False),
+ deconv=False,
+ bn=use_bn,
+ expand=False,
+ align_corners=True,
+ size=size,
+ )
+
+
+class DPTHead(nn.Module):
+ def __init__(self, nclass, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False):
+ super(DPTHead, self).__init__()
+
+ self.nclass = nclass
+ self.use_clstoken = use_clstoken
+
+ self.projects = nn.ModuleList([
+ nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_channel,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ) for out_channel in out_channels
+ ])
+
+ self.resize_layers = nn.ModuleList([
+ nn.ConvTranspose2d(
+ in_channels=out_channels[0],
+ out_channels=out_channels[0],
+ kernel_size=4,
+ stride=4,
+ padding=0),
+ nn.ConvTranspose2d(
+ in_channels=out_channels[1],
+ out_channels=out_channels[1],
+ kernel_size=2,
+ stride=2,
+ padding=0),
+ nn.Identity(),
+ nn.Conv2d(
+ in_channels=out_channels[3],
+ out_channels=out_channels[3],
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ ])
+
+ if use_clstoken:
+ self.readout_projects = nn.ModuleList()
+ for _ in range(len(self.projects)):
+ self.readout_projects.append(
+ nn.Sequential(
+ nn.Linear(2 * in_channels, in_channels),
+ nn.GELU()))
+
+ self.scratch = _make_scratch(
+ out_channels,
+ features,
+ groups=1,
+ expand=False,
+ )
+
+ self.scratch.stem_transpose = None
+
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
+
+ head_features_1 = features
+ head_features_2 = 32
+
+ if nclass > 1:
+ self.scratch.output_conv = nn.Sequential(
+ nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1),
+ nn.ReLU(True),
+ nn.Conv2d(head_features_1, nclass, kernel_size=1, stride=1, padding=0),
+ )
+ else:
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
+
+ self.scratch.output_conv2 = nn.Sequential(
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
+ nn.ReLU(True),
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
+ nn.ReLU(True),
+ nn.Identity(),
+ )
+
+ def forward(self, out_features, patch_h, patch_w):
+ out = []
+ for i, x in enumerate(out_features):
+ if self.use_clstoken:
+ x, cls_token = x[0], x[1]
+ readout = cls_token.unsqueeze(1).expand_as(x)
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
+ else:
+ x = x[0]
+
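+            # reshape ViT tokens back into a 2D feature map: (B, N, C) -> (B, C, patch_h, patch_w)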
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
+
+ x = self.projects[i](x)
+ x = self.resize_layers[i](x)
+
+ out.append(x)
+
+ layer_1, layer_2, layer_3, layer_4 = out
+
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
+
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+
+ out = self.scratch.output_conv1(path_1)
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
+ out = self.scratch.output_conv2(out)
+
+ return out
+
+
+class DPT_DINOv2(nn.Module):
+ def __init__(self, encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024], use_bn=False, use_clstoken=False, localhub=True):
+ super(DPT_DINOv2, self).__init__()
+
+ assert encoder in ['vits', 'vitb', 'vitl']
+
+        # if the Internet connection is unstable, load DINOv2 from the bundled local torch hub copy instead
+ if localhub:
+ self.pretrained = torch.hub.load(TORCHHUB_PATH / 'facebookresearch_dinov2_main', 'dinov2_{:}14'.format(encoder), source='local', pretrained=False)
+ else:
+ self.pretrained = torch.hub.load('facebookresearch/dinov2', 'dinov2_{:}14'.format(encoder), )
+
+ dim = self.pretrained.blocks[0].attn.qkv.in_features
+
+ self.depth_head = DPTHead(1, dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
+
+ def forward(self, x):
+ h, w = x.shape[-2:]
+
+ features = self.pretrained.get_intermediate_layers(x, 4, return_class_token=True)
+
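+        # DINOv2 uses 14x14 patches, so the token grid is (h // 14) x (w // 14)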
+ patch_h, patch_w = h // 14, w // 14
+
+ depth = self.depth_head(features, patch_h, patch_w)
+ depth = F.interpolate(depth, size=(h, w), mode="bilinear", align_corners=True)
+ depth = F.relu(depth)
+
+ return depth.squeeze(1)
+
+
+if __name__ == '__main__':
+ depth_anything = DPT_DINOv2()
+ depth_anything.load_state_dict(torch.load('checkpoints/depth_anything_dinov2_vitl14.pth'))
+
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/util/transform.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/util/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..95ce7bee651eb9675dba38fb2643e7ba69fcab0a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/depth_anything/util/transform.py
@@ -0,0 +1,248 @@
+import random
+from PIL import Image, ImageOps, ImageFilter
+import torch
+from torchvision import transforms
+import torch.nn.functional as F
+
+import numpy as np
+import cv2
+import math
+
+
+def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
+ """Rezise the sample to ensure the given size. Keeps aspect ratio.
+
+ Args:
+ sample (dict): sample
+ size (tuple): image size
+
+ Returns:
+ tuple: the new shape; the unchanged sample dict is returned instead if it already meets the minimum size
+ """
+ shape = list(sample["disparity"].shape)
+
+ if shape[0] >= size[0] and shape[1] >= size[1]:
+ return sample
+
+ scale = [0, 0]
+ scale[0] = size[0] / shape[0]
+ scale[1] = size[1] / shape[1]
+
+ scale = max(scale)
+
+ shape[0] = math.ceil(scale * shape[0])
+ shape[1] = math.ceil(scale * shape[1])
+
+ # resize
+ sample["image"] = cv2.resize(
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
+ )
+
+ sample["disparity"] = cv2.resize(
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
+ )
+ sample["mask"] = cv2.resize(
+ sample["mask"].astype(np.float32),
+ tuple(shape[::-1]),
+ interpolation=cv2.INTER_NEAREST,
+ )
+ sample["mask"] = sample["mask"].astype(bool)
+
+ return tuple(shape)
+
+
+class Resize(object):
+ """Resize sample to given size (width, height).
+ """
+
+ def __init__(
+ self,
+ width,
+ height,
+ resize_target=True,
+ keep_aspect_ratio=False,
+ ensure_multiple_of=1,
+ resize_method="lower_bound",
+ image_interpolation_method=cv2.INTER_AREA,
+ ):
+ """Init.
+
+ Args:
+ width (int): desired output width
+ height (int): desired output height
+ resize_target (bool, optional):
+ True: Resize the full sample (image, mask, target).
+ False: Resize image only.
+ Defaults to True.
+ keep_aspect_ratio (bool, optional):
+ True: Keep the aspect ratio of the input sample.
+ Output sample might not have the given width and height, and
+ resize behaviour depends on the parameter 'resize_method'.
+ Defaults to False.
+ ensure_multiple_of (int, optional):
+ Output width and height is constrained to be multiple of this parameter.
+ Defaults to 1.
+ resize_method (str, optional):
+ "lower_bound": Output will be at least as large as the given size.
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
+ Defaults to "lower_bound".
+ """
+ self.__width = width
+ self.__height = height
+
+ self.__resize_target = resize_target
+ self.__keep_aspect_ratio = keep_aspect_ratio
+ self.__multiple_of = ensure_multiple_of
+ self.__resize_method = resize_method
+ self.__image_interpolation_method = image_interpolation_method
+
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ if max_val is not None and y > max_val:
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ if y < min_val:
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ return y
+
+ def get_size(self, width, height):
+ # determine new height and width
+ scale_height = self.__height / height
+ scale_width = self.__width / width
+
+ if self.__keep_aspect_ratio:
+ if self.__resize_method == "lower_bound":
+ # scale such that output size is lower bound
+ if scale_width > scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "upper_bound":
+ # scale such that output size is upper bound
+ if scale_width < scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "minimal":
+ # scale as little as possible
+ if abs(1 - scale_width) < abs(1 - scale_height):
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ else:
+ raise ValueError(
+ f"resize_method {self.__resize_method} not implemented"
+ )
+
+ if self.__resize_method == "lower_bound":
+ new_height = self.constrain_to_multiple_of(
+ scale_height * height, min_val=self.__height
+ )
+ new_width = self.constrain_to_multiple_of(
+ scale_width * width, min_val=self.__width
+ )
+ elif self.__resize_method == "upper_bound":
+ new_height = self.constrain_to_multiple_of(
+ scale_height * height, max_val=self.__height
+ )
+ new_width = self.constrain_to_multiple_of(
+ scale_width * width, max_val=self.__width
+ )
+ elif self.__resize_method == "minimal":
+ new_height = self.constrain_to_multiple_of(scale_height * height)
+ new_width = self.constrain_to_multiple_of(scale_width * width)
+ else:
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
+
+ return (new_width, new_height)
+
+ def __call__(self, sample):
+ width, height = self.get_size(
+ sample["image"].shape[1], sample["image"].shape[0]
+ )
+
+ # resize sample
+ sample["image"] = cv2.resize(
+ sample["image"],
+ (width, height),
+ interpolation=self.__image_interpolation_method,
+ )
+
+ if self.__resize_target:
+ if "disparity" in sample:
+ sample["disparity"] = cv2.resize(
+ sample["disparity"],
+ (width, height),
+ interpolation=cv2.INTER_NEAREST,
+ )
+
+ if "depth" in sample:
+ sample["depth"] = cv2.resize(
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
+ )
+
+ if "semseg_mask" in sample:
+ # sample["semseg_mask"] = cv2.resize(
+ # sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST
+ # )
+ sample["semseg_mask"] = F.interpolate(torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...], (height, width), mode='nearest').numpy()[0, 0]
+
+ if "mask" in sample:
+ sample["mask"] = cv2.resize(
+ sample["mask"].astype(np.float32),
+ (width, height),
+ interpolation=cv2.INTER_NEAREST,
+ )
+ # sample["mask"] = sample["mask"].astype(bool)
+
+ # print(sample['image'].shape, sample['depth'].shape)
+ return sample
+
+
+class NormalizeImage(object):
+ """Normlize image by given mean and std.
+ """
+
+ def __init__(self, mean, std):
+ self.__mean = mean
+ self.__std = std
+
+ def __call__(self, sample):
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
+
+ return sample
+
+
+class PrepareForNet(object):
+ """Prepare sample for usage as network input.
+ """
+
+ def __init__(self):
+ pass
+
+ def __call__(self, sample):
+ image = np.transpose(sample["image"], (2, 0, 1))
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
+
+ if "mask" in sample:
+ sample["mask"] = sample["mask"].astype(np.float32)
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
+
+ if "depth" in sample:
+ depth = sample["depth"].astype(np.float32)
+ sample["depth"] = np.ascontiguousarray(depth)
+
+ if "semseg_mask" in sample:
+ sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32)
+ sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"])
+
+ return sample
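+
+
+# Illustrative usage sketch (an assumption added here, not upstream code): the
+# three transforms above are typically composed and applied to a dict sample
+# holding an HxWx3 float image scaled to [0, 1]. The 518x518 target size, the
+# multiple-of-14 constraint and the ImageNet mean/std below are assumptions
+# matching a Depth Anything-style preprocessing pipeline.
+#
+# transform = transforms.Compose([
+#     Resize(518, 518, resize_target=False, keep_aspect_ratio=True,
+#            ensure_multiple_of=14, resize_method="lower_bound",
+#            image_interpolation_method=cv2.INTER_CUBIC),
+#     NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+#     PrepareForNet(),
+# ])
+# sample = transform({"image": image})
+# net_input = torch.from_numpy(sample["image"]).unsqueeze(0)  # (1, 3, H', W')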
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/README.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..407ad6c68f98e146f870e77590cadcfed16b2c29
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/README.md
@@ -0,0 +1,3 @@
+# Local PyTorch Hub
+
+This directory is for loading the DINOv2 encoder locally in case of no Internet connection.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/CODE_OF_CONDUCT.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad756270428cffe88b95f0b1f7d8aaf40dddb9db
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/CODE_OF_CONDUCT.md
@@ -0,0 +1,80 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+This Code of Conduct also applies outside the project spaces when there is a
+reasonable belief that an individual's behavior may have a negative impact on
+the project or its community.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd4bb72c671ea5f74cef97beb3707747f28655a6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# Contributing to DINOv2
+We want to make contributing to this project as easy and transparent as
+possible.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+1. Fork the repo and create your branch from `main`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Meta's open source projects.
+
+Complete your CLA here:
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## License
+By contributing to DINOv2, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..6808d9ae496f4a546dffaea9acf3f8b8896136e4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/LICENSE
@@ -0,0 +1,400 @@
+
+Attribution-NonCommercial 4.0 International
+
+=======================================================================
+
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of
+Creative Commons public licenses does not create a lawyer-client or
+other relationship. Creative Commons makes its licenses and related
+information available on an "as-is" basis. Creative Commons gives no
+warranties regarding its licenses, any material licensed under their
+terms and conditions, or any related information. Creative Commons
+disclaims all liability for damages resulting from their use to the
+fullest extent possible.
+
+Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share
+original works of authorship and other material subject to copyright
+and certain other rights specified in the public license below. The
+following considerations are for informational purposes only, are not
+exhaustive, and do not form part of our licenses.
+
+ Considerations for licensors: Our public licenses are
+ intended for use by those authorized to give the public
+ permission to use material in ways otherwise restricted by
+ copyright and certain other rights. Our licenses are
+ irrevocable. Licensors should read and understand the terms
+ and conditions of the license they choose before applying it.
+ Licensors should also secure all rights necessary before
+ applying our licenses so that the public can reuse the
+ material as expected. Licensors should clearly mark any
+ material not subject to the license. This includes other CC-
+ licensed material, or material used under an exception or
+ limitation to copyright. More considerations for licensors:
+ wiki.creativecommons.org/Considerations_for_licensors
+
+ Considerations for the public: By using one of our public
+ licenses, a licensor grants the public permission to use the
+ licensed material under specified terms and conditions. If
+ the licensor's permission is not necessary for any reason--for
+ example, because of any applicable exception or limitation to
+ copyright--then that use is not regulated by the license. Our
+ licenses grant only permissions under copyright and certain
+ other rights that a licensor has authority to grant. Use of
+ the licensed material may still be restricted for other
+ reasons, including because others have copyright or other
+ rights in the material. A licensor may make special requests,
+ such as asking that all changes be marked or described.
+ Although not required by our licenses, you are encouraged to
+ respect those requests where reasonable. More_considerations
+ for the public:
+ wiki.creativecommons.org/Considerations_for_licensees
+
+=======================================================================
+
+Creative Commons Attribution-NonCommercial 4.0 International Public
+License
+
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial 4.0 International Public License ("Public
+License"). To the extent this Public License may be interpreted as a
+contract, You are granted the Licensed Rights in consideration of Your
+acceptance of these terms and conditions, and the Licensor grants You
+such rights in consideration of benefits the Licensor receives from
+making the Licensed Material available under these terms and
+conditions.
+
+Section 1 -- Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material
+ and in which the Licensed Material is translated, altered,
+ arranged, transformed, or otherwise modified in a manner requiring
+ permission under the Copyright and Similar Rights held by the
+ Licensor. For purposes of this Public License, where the Licensed
+ Material is a musical work, performance, or sound recording,
+ Adapted Material is always produced where the Licensed Material is
+ synched in timed relation with a moving image.
+
+ b. Adapter's License means the license You apply to Your Copyright
+ and Similar Rights in Your contributions to Adapted Material in
+ accordance with the terms and conditions of this Public License.
+
+ c. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+ d. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright
+ Treaty adopted on December 20, 1996, and/or similar international
+ agreements.
+
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
+ any other exception or limitation to Copyright and Similar Rights
+ that applies to Your use of the Licensed Material.
+
+ f. Licensed Material means the artistic or literary work, database,
+ or other material to which the Licensor applied this Public
+ License.
+
+ g. Licensed Rights means the rights granted to You subject to the
+ terms and conditions of this Public License, which are limited to
+ all Copyright and Similar Rights that apply to Your use of the
+ Licensed Material and that the Licensor has authority to license.
+
+ h. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+
+ i. NonCommercial means not primarily intended for or directed towards
+ commercial advantage or monetary compensation. For purposes of
+ this Public License, the exchange of the Licensed Material for
+ other material subject to Copyright and Similar Rights by digital
+ file-sharing or similar means is NonCommercial provided there is
+ no payment of monetary compensation in connection with the
+ exchange.
+
+ j. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the
+ public may access the material from a place and at a time
+ individually chosen by them.
+
+ k. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially
+ equivalent rights anywhere in the world.
+
+ l. You means the individual or entity exercising the Licensed Rights
+ under this Public License. Your has a corresponding meaning.
+
+Section 2 -- Scope.
+
+ a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ a. reproduce and Share the Licensed Material, in whole or
+ in part, for NonCommercial purposes only; and
+
+ b. produce, reproduce, and Share Adapted Material for
+ NonCommercial purposes only.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+
+ b. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+
+ b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+Section 3 -- License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+ a. Attribution.
+
+ 1. If You Share the Licensed Material (including in modified
+ form), You must:
+
+ a. retain the following if it is supplied by the Licensor
+ with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if
+ designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of
+ warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the
+ extent reasonably practicable;
+
+ b. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+
+ c. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required
+ information.
+
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+
+ 4. If You Share Adapted Material You produce, the Adapter's
+ License You apply must not prevent recipients of the Adapted
+ Material from complying with this Public License.
+
+Section 4 -- Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+ to extract, reuse, reproduce, and Share all or a substantial
+ portion of the contents of the database for NonCommercial purposes
+ only;
+
+ b. if You include all or a substantial portion of the database
+ contents in a database in which You have Sui Generis Database
+ Rights, then the database in which You have Sui Generis Database
+ Rights (but not its individual contents) is Adapted Material; and
+
+ c. You must comply with the conditions in Section 3(a) if You Share
+ all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+ c. The disclaimer of warranties and limitation of liability provided
+ above shall be interpreted in a manner that, to the extent
+ possible, most closely approximates an absolute disclaimer and
+ waiver of all liability.
+
+Section 6 -- Term and Termination.
+
+ a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply with
+ this Public License, then Your rights under this Public License
+ terminate automatically.
+
+ b. Where Your right to use the Licensed Material has terminated under
+ Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided
+ it is cured within 30 days of Your discovery of the
+ violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations
+ of this Public License.
+
+ c. For the avoidance of doubt, the Licensor may also offer the
+ Licensed Material under separate terms or conditions or stop
+ distributing the Licensed Material at any time; however, doing so
+ will not terminate this Public License.
+
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+ License.
+
+Section 7 -- Other Terms and Conditions.
+
+ a. The Licensor shall not be bound by any additional or different
+ terms or conditions communicated by You unless expressly agreed.
+
+ b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+
+Section 8 -- Interpretation.
+
+ a. For the avoidance of doubt, this Public License does not, and
+ shall not be interpreted to, reduce, limit, restrict, or impose
+ conditions on any use of the Licensed Material that could lawfully
+ be made without permission under this Public License.
+
+ b. To the extent possible, if any provision of this Public License is
+ deemed unenforceable, it shall be automatically reformed to the
+ minimum extent necessary to make it enforceable. If the provision
+ cannot be reformed, it shall be severed from this Public License
+ without affecting the enforceability of the remaining terms and
+ conditions.
+
+ c. No term or condition of this Public License will be waived and no
+ failure to comply consented to unless expressly agreed to by the
+ Licensor.
+
+ d. Nothing in this Public License constitutes or may be interpreted
+ as a limitation upon, or waiver of, any privileges and immunities
+ that apply to the Licensor or You, including from the legal
+ processes of any jurisdiction or authority.
+
+=======================================================================
+
+Creative Commons is not a party to its public
+licenses. Notwithstanding, Creative Commons may elect to apply one of
+its public licenses to material it publishes and in those instances
+will be considered the “Licensor.” The text of the Creative Commons
+public licenses is dedicated to the public domain under the CC0 Public
+Domain Dedication. Except for the limited purpose of indicating that
+material is shared under a Creative Commons public license or as
+otherwise permitted by the Creative Commons policies published at
+creativecommons.org/policies, Creative Commons does not authorize the
+use of the trademark "Creative Commons" or any other trademark or logo
+of Creative Commons without its prior written consent including,
+without limitation, in connection with any unauthorized modifications
+to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For
+the avoidance of doubt, this paragraph does not form part of the
+public licenses.
+
+Creative Commons may be contacted at creativecommons.org.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/MODEL_CARD.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/MODEL_CARD.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a4094cb34494c0386cafb22f3c07d45b40f6bca
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/MODEL_CARD.md
@@ -0,0 +1,201 @@
+# Model Card for DINOv2-S/B/L/g
+
+These are Vision Transformer models trained following the method described in the paper:
+"DINOv2: Learning Robust Visual Features without Supervision"
+
+We provide 4 models: 1 ViT-g trained from scratch, and 3 ViT-S/B/L models distilled from the ViT-g.
+
+## Model Details
+The model takes an image as input and returns a class token and patch tokens.
+
+The embedding dimension is:
+- 384 for ViT-S.
+- 768 for ViT-B.
+- 1024 for ViT-L.
+- 1536 for ViT-g.
+
+The models follow a Transformer architecture, with a patch size of 14.
+
+For a 224x224 image, this results in 1 class token + 256 patch tokens.
+
+The models can accept larger images provided the image shapes are multiples of the patch size (14).
+If this condition is not met, the model will crop the input to the closest smaller multiple of the patch size.
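+
+As a minimal illustration of this constraint (an added sketch, not part of the upstream model card; the helper name is hypothetical), the number of tokens produced for a given input size can be computed as follows:
+
+```python
+def dinov2_token_count(height: int, width: int, patch_size: int = 14) -> int:
+    # Crop to the closest smaller multiple of the patch size, as described above.
+    h = (height // patch_size) * patch_size
+    w = (width // patch_size) * patch_size
+    # 1 class token plus one token per patch_size x patch_size patch.
+    return 1 + (h // patch_size) * (w // patch_size)
+
+
+assert dinov2_token_count(224, 224) == 1 + 256
+```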
+
+### Model Description
+
+- **Developed by:** Meta AI
+- **Model type:** Vision Transformer
+- **License:** CC-BY-NC
+
+- **Repository:** https://github.com/facebookresearch/dinov2
+- **Paper:** https://arxiv.org/abs/2304.07193
+- **Demo:** https://dinov2.metademolab.com/
+
+## Uses
+
+The models are vision backbones providing multi-purpose features for downstream tasks.
+
+### Direct Use
+
+The models can be used without fine-tuning, with downstream classifiers as simple as linear layers, to obtain competitive results (see the sketch after this list):
+- on depth estimation, semantic segmentation, using linear layers.
+- on image classification, using k-NN classifiers on the class token.
+- on image classification, with logistic regression classifiers applied on the class token.
+- on image classification, with a linear layer applied on the class token and the average of the patch tokens.
+- on image retrieval using nearest neighbors.
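+
+A minimal linear-probe sketch follows (an illustration under stated assumptions: the backbone's forward pass is assumed to return the class-token embedding, whose dimension is 384 for ViT-S, and the 1000-class linear head is untrained and purely hypothetical):
+
+```python
+import torch
+import torch.nn as nn
+
+# Load the ViT-S/14 backbone as shown in "How to Get Started with the Model".
+backbone = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
+backbone.eval()
+
+# Linear classifier on top of the class token (its weights would be trained separately).
+head = nn.Linear(384, 1000)
+
+x = torch.randn(1, 3, 224, 224)  # dummy, already-normalized input
+with torch.no_grad():
+    feats = backbone(x)  # assumed shape: (1, 384) class-token embedding
+logits = head(feats)     # shape: (1, 1000)
+```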
+
+### Downstream Use
+
+It is technically possible to perform fine-tuning on the models, for small gains (we measured +2% on ImageNet-1k classification).
+We recommend keeping this as a very last step and only when necessary, as the features already provide good performance out-of-the-box.
+
+## Bias, Risks, and Limitations
+
+Despite improvements thanks to the training method not using annotations, we still observe significant biases in our models toward rich households from Western countries.
+
+### Recommendations
+
+We expect fine-tuning will increase the biases in the features produced by the model as they will be tuned to the fine-tuning labels.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+```python
+import torch
+dinov2_vits14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
+dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
+dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
+dinov2_vitg14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')
+```
+
+## Training Details
+
+### Training Data
+
+- **Training data:** LVD-142M (see paper)
+- **Training regime:** fp16 using PyTorch-FSDP mixed-precision.
+
+### Training Procedure
+
+- **Training objective:**
+ - DINO self-distillation loss with multi-crop
+ - iBOT masked-image modeling loss
+ - KoLeo regularization on [CLS] tokens
+- **Architectures:**
+ - ViT-S (21M params): Patch size 14, embedding dimension 384, 6 heads, MLP FFN
+ - ViT-B (86M params): Patch size 14, embedding dimension 768, 12 heads, MLP FFN
+ - ViT-L (0.3B params): Patch size 14, embedding dimension 1024, 16 heads, MLP FFN
+ - ViT-g (1.1B params): Patch size 14, embedding dimension 1536, 24 heads, SwiGLU FFN
+- **Distillation:**
+ - Distillation follows the standard DINOv2 pretraining procedure, except the teacher is a pretrained ViT-g, frozen.
+
+## Evaluation
+
+We refer users to the associated paper for the evaluation protocols.
+
+| model | ImageNet-1k classif. (acc), k-NN | ImageNet-1k classif. (acc), linear | ImageNet-1k classif. V2 (acc), linear | NYU-Depth v2 depth (RMSE), linear 4 layers | SUN-RGBD depth (RMSE), NYU-D transfer | ADE20k segm. (mAP), multiscale | iNaturalist 2018 classif. (acc), linear | Oxford-H retrieval (mAP), nearest neighbor |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| ViT-S/14 | 79.0% | 81.1% | 70.8% | 0.417 | 0.431 | 47.2 | 69.5% | 43.2 |
+| ViT-B/14 | 82.1% | 84.5% | 74.9% | 0.362 | 0.400 | 51.3 | 76.3% | 49.5 |
+| ViT-L/14 | 83.5% | 86.3% | 77.6% | 0.333 | 0.396 | 53.1 | 79.8% | 54.0 |
+| ViT-g/14 | 83.5% | 86.5% | 78.4% | 0.298 | 0.362 | 53.0 | 81.6% | 52.3 |
+
+## Environmental Impact
+
+- **Hardware Type:** Nvidia A100
+- **Hours used:** 22,000 for ViT-g, 4,500 for ViT-S distillation, 5,300 for ViT-B distillation, 8,000 for ViT-L distillation
+- **Cloud Provider:** Private infra
+- **Compute Region:** USA
+- **Carbon Emitted:** 7t CO2eq
+
+#### Hardware
+
+Nvidia A100 GPUs
+
+#### Software
+
+PyTorch 2.0,
+xFormers 0.0.18
+
+**BibTeX**
+
+```
+@misc{oquab2023dinov2,
+ title={DINOv2: Learning Robust Visual Features without Supervision},
+ author={Oquab, Maxime and Darcet, Timothée and Moutakanni, Theo and Vo, Huy and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and Howes, Russell and Huang, Po-Yao and Xu, Hu and Sharma, Vasu and Li, Shang-Wen and Galuba, Wojciech and Rabbat, Mike and Assran, Mido and Ballas, Nicolas and Synnaeve, Gabriel and Misra, Ishan and Jegou, Herve and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr},
+ journal={arXiv:2304.07193},
+ year={2023}
+}
+```
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/README.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cb83cc92314291e77dc8c72e7d91809c88baf056
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/README.md
@@ -0,0 +1,277 @@
+# DINOv2: Learning Robust Visual Features without Supervision
+
+**[Meta AI Research, FAIR](https://ai.facebook.com/research/)**
+
+Maxime Oquab,
+Timothée Darcet,
+Théo Moutakanni,
+Huy V. Vo,
+Marc Szafraniec,
+Vasil Khalidov,
+Patrick Labatut,
+Armand Joulin,
+Piotr Bojanowski
+
+[[`Paper`](https://arxiv.org/abs/2304.07193)] [[`Blog`](https://ai.facebook.com/blog/dino-v2-computer-vision-self-supervised-learning/)] [[`Demo`](https://dinov2.metademolab.com)] [[`BibTeX`](#citing-dinov2)]
+
+PyTorch implementation and pretrained models for DINOv2. For details, see the paper: **[DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)**.
+
+DINOv2 models produce high-performance visual features that can be directly employed with classifiers as simple as linear layers on a variety of computer vision tasks; these visual features are robust and perform well across domains without any requirement for fine-tuning. The models were pretrained on a dataset of 142 M images without using any labels or annotations.
+
+https://github.com/facebookresearch/dinov2/assets/60359573/f168823e-7922-415a-b429-578badf5c356
+
+*Visualization of the first three principal components of the patch features of all frames, mapped to RGB values.*
+
+## Pretrained models
+
+
+| model | # of params | ImageNet k-NN | ImageNet linear | download |
+| --- | --- | --- | --- | --- |
+| ViT-S/14 distilled | 21 M | 79.0% | 81.1% | backbone only |
+| ViT-B/14 distilled | 86 M | 82.1% | 84.5% | backbone only |
+| ViT-L/14 distilled | 300 M | 83.5% | 86.3% | backbone only |
+| ViT-g/14 | 1,100 M | 83.5% | 86.5% | backbone only |
+
+### Pretrained models via PyTorch Hub
+
+Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install PyTorch (the only required dependency for loading the model). Installing PyTorch with CUDA support is strongly recommended.
+
+A corresponding [model card](MODEL_CARD.md) is included in the repository.
+
+```python
+import torch
+
+dinov2_vits14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
+dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
+dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
+dinov2_vitg14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')
+```
+
+## Installation
+
+The training and evaluation code requires PyTorch 2.0 and [xFormers](https://github.com/facebookresearch/xformers) 0.0.18 as well as a number of other 3rd party packages. Note that the code has only been tested with the specified versions and also expects a Linux environment. To set up all the required dependencies for training and evaluation, please follow the instructions below:
+
+*[conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html)* **(Recommended)** - Clone the repository and then create and activate a `dinov2` conda environment using the provided environment definition:
+
+```shell
+conda env create -f conda.yaml
+conda activate dinov2
+```
+
+*[pip](https://pip.pypa.io/en/stable/getting-started/)* - Clone the repository and then use the provided `requirements.txt` to install the dependencies:
+
+```shell
+pip install -r requirements.txt
+```
+
+## Data preparation
+
+### ImageNet-1k
+
+The root directory of the dataset should hold the following contents:
+
+- `/test/ILSVRC2012_test_00000001.JPEG`
+- `/test/[..]`
+- `/test/ILSVRC2012_test_00100000.JPEG`
+- `/train/n01440764/n01440764_10026.JPEG`
+- `/train/[...]`
+- `/train/n15075141/n15075141_9993.JPEG`
+- `/val/n01440764/ILSVRC2012_val_00000293.JPEG`
+- `/val/[...]`
+- `/val/n15075141/ILSVRC2012_val_00049174.JPEG`
+- `/labels.txt`
+
+The provided dataset implementation expects a few additional metadata files to be present under the extra directory:
+
+- `/class-ids-TRAIN.npy`
+- `/class-ids-VAL.npy`
+- `/class-names-TRAIN.npy`
+- `/class-names-VAL.npy`
+- `/entries-TEST.npy`
+- `/entries-TRAIN.npy`
+- `/entries-VAL.npy`
+
+These metadata files can be generated (once) with the following lines of Python code:
+
+```python
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data.datasets import ImageNet
+
+for split in ImageNet.Split:
+ dataset = ImageNet(split=split, root="", extra="")
+ dataset.dump_extra()
+```
+
+Note that the root and extra directories do not have to be distinct directories.
+
+### ImageNet-22k
+
+Please adapt the [dataset class](dinov2/data/datasets/image_net_22k.py) to match your local setup.
+
+
+
+:warning: To execute the commands provided in the next sections for training and evaluation, the `dinov2` package should be included in the Python module search path, i.e. simply prefix the command to run with `PYTHONPATH=.`.
+
+## Training
+
+### Fast setup: training DINOv2 ViT-L/16 on ImageNet-1k
+
+Run DINOv2 training on 4 A100-80GB nodes (32 GPUs) in a SLURM cluster environment with submitit:
+
+```shell
+python dinov2/run/train/train.py \
+ --nodes 4 \
+ --config-file dinov2/configs/train/vitl16_short.yaml \
+ --output-dir \
+ train.dataset_path=ImageNet:split=TRAIN:root=:extra=
+```
+
+Training time is approximately 1 day and the resulting checkpoint should reach 81.6% on k-NN eval and 82.9% on linear eval.
+
+The training code saves the weights of the teacher in the `eval` folder every 12500 iterations for evaluation.
+
+### Long setup: training DINOv2 ViT-L/14 on ImageNet-22k
+
+Run DINOv2 training on 12 A100-80GB nodes (96 GPUs) in a SLURM cluster environment with submitit:
+
+```shell
+python dinov2/run/train/train.py \
+ --nodes 12 \
+ --config-file dinov2/configs/train/vitl14.yaml \
+ --output-dir \
+ train.dataset_path=ImageNet22k:root=:extra=
+```
+
+Training time is approximately 3.3 days and the resulting checkpoint should reach 82.0% on k-NN eval and 84.5% on linear eval.
+
+The training code saves the weights of the teacher in the `eval` folder every 12500 iterations for evaluation.
+
+
+## Evaluation
+
+The training code regularly saves the teacher weights. In order to evaluate the model, run the following evaluation on a single node:
+
+### k-NN classification on ImageNet-1k
+
+```shell
+python dinov2/run/eval/knn.py \
+ --config-file /config.yaml \
+ --pretrained-weights /eval/training_24999/teacher_checkpoint.pth \
+ --output-dir /eval/training_24999/knn \
+ --train-dataset ImageNet:split=TRAIN:root=:extra= \
+ --val-dataset ImageNet:split=VAL:root=:extra=
+```
+
+### Logistic regression classification on ImageNet-1k
+
+```shell
+python dinov2/run/eval/log_regression.py \
+ --config-file /config.yaml \
+ --pretrained-weights /eval/training_24999/teacher_checkpoint.pth \
+ --output-dir /eval/training_24999/logreg \
+ --train-dataset ImageNet:split=TRAIN:root=:extra= \
+ --val-dataset ImageNet:split=VAL:root=:extra=
+```
+
+### Linear classification with data augmentation on ImageNet-1k
+
+```shell
+python dinov2/run/eval/linear.py \
+ --config-file /config.yaml \
+ --pretrained-weights /eval/training_24999/teacher_checkpoint.pth \
+ --output-dir /eval/training_24999/linear \
+ --train-dataset ImageNet:split=TRAIN:root=:extra= \
+ --val-dataset ImageNet:split=VAL:root=:extra=
+```
+
+We release the weights from evaluating the different models.
+
+
+
+The performance of the provided pretrained model weights can be evaluated as follows on ImageNet-1k:
+
+```shell
+python dinov2/run/eval/linear.py \
+ --config-file dinov2/configs/eval/vitg14_pretrain.yaml \
+ --pretrained-weights https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_pretrain.pth \
+ --train-dataset ImageNet:split=TRAIN:root=:extra= \
+ --val-dataset ImageNet:split=VAL:root=:extra=
+```
+
+## License
+
+DINOv2 code and model weights are released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details.
+
+## Contributing
+
+See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
+
+## Citing DINOv2
+
+If you find this repository useful, please consider giving a star :star: and citation :t-rex::
+
+```
+@misc{oquab2023dinov2,
+ title={DINOv2: Learning Robust Visual Features without Supervision},
+ author={Oquab, Maxime and Darcet, Timothée and Moutakanni, Theo and Vo, Huy V. and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and Howes, Russell and Huang, Po-Yao and Xu, Hu and Sharma, Vasu and Li, Shang-Wen and Galuba, Wojciech and Rabbat, Mike and Assran, Mido and Ballas, Nicolas and Synnaeve, Gabriel and Misra, Ishan and Jegou, Herve and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr},
+ journal={arXiv:2304.07193},
+ year={2023}
+}
+```
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/conda.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/conda.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..abe596b48aad36b371d572ce052932cf4ad400b0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/conda.yaml
@@ -0,0 +1,22 @@
+name: dinov2
+channels:
+ - defaults
+ - pytorch
+ - nvidia
+ - xformers
+ - conda-forge
+dependencies:
+ - python=3.9
+ - pytorch::pytorch=2.0.0
+ - pytorch::pytorch-cuda=11.7.0
+ - pytorch::torchvision=0.15.0
+ - omegaconf
+ - torchmetrics=0.10.3
+ - fvcore
+ - iopath
+ - xformers::xformers=0.0.18
+ - pip
+ - pip:
+ - git+https://github.com/facebookincubator/submitit
+ - --extra-index-url https://pypi.nvidia.com
+ - cuml-cu11
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f3f36b4f8ca46b6e37cbb90f0a15196b252c051
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+__version__ = "0.0.1"
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aef2b7ad4b16fb4c05379dacac80991123f7374
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pathlib
+
+from omegaconf import OmegaConf
+
+
+def load_config(config_name: str):
+ config_filename = config_name + ".yaml"
+ return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename)
+
+
+dinov2_default_config = load_config("ssl_default_config")
+
+
+def load_and_merge_config(config_name: str):
+ default_config = OmegaConf.create(dinov2_default_config)
+ loaded_config = load_config(config_name)
+ return OmegaConf.merge(default_config, loaded_config)
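+
+
+# Illustrative usage (an assumption added here, not upstream code):
+# cfg = load_and_merge_config("eval/vitl14_pretrain")
+# cfg.student.arch             # "vit_large", from the merged eval config
+# cfg.crops.global_crops_size  # 518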
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..43725df384813c3be3342488017f51e161e8ea9d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml
@@ -0,0 +1,6 @@
+student:
+ arch: vit_base
+ patch_size: 14
+crops:
+ global_crops_size: 518 # this is to set up the position embeddings properly
+ local_crops_size: 98
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0b1066fcdaf5ae633b20e193505049ed9641c173
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml
@@ -0,0 +1,7 @@
+student:
+ arch: vit_giant2
+ patch_size: 14
+ ffn_layer: swiglufused
+crops:
+ global_crops_size: 518 # this is to set up the position embeddings properly
+ local_crops_size: 98
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1df9fb1b902f727612fcf8abca532ca95fd2a4b0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml
@@ -0,0 +1,6 @@
+student:
+ arch: vit_large
+ patch_size: 14
+crops:
+ global_crops_size: 518 # this is to set up the position embeddings properly
+ local_crops_size: 98
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..043d428922b6d1d9b33e7637dd97d9290f98ee66
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml
@@ -0,0 +1,6 @@
+student:
+ arch: vit_small
+ patch_size: 14
+crops:
+ global_crops_size: 518 # this is to set up the position embeddings properly
+ local_crops_size: 98
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/ssl_default_config.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/ssl_default_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a6ab33f05bf15e5d143dd08e770010ac9f5a79a2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/ssl_default_config.yaml
@@ -0,0 +1,115 @@
+MODEL:
+ WEIGHTS: ''
+compute_precision:
+ grad_scaler: true
+ teacher:
+ backbone:
+ sharding_strategy: SHARD_GRAD_OP
+ mixed_precision:
+ param_dtype: fp16
+ reduce_dtype: fp16
+ buffer_dtype: fp32
+ dino_head:
+ sharding_strategy: SHARD_GRAD_OP
+ mixed_precision:
+ param_dtype: fp16
+ reduce_dtype: fp16
+ buffer_dtype: fp32
+ ibot_head:
+ sharding_strategy: SHARD_GRAD_OP
+ mixed_precision:
+ param_dtype: fp16
+ reduce_dtype: fp16
+ buffer_dtype: fp32
+ student:
+ backbone:
+ sharding_strategy: SHARD_GRAD_OP
+ mixed_precision:
+ param_dtype: fp16
+ reduce_dtype: fp16
+ buffer_dtype: fp32
+ dino_head:
+ sharding_strategy: SHARD_GRAD_OP
+ mixed_precision:
+ param_dtype: fp16
+ reduce_dtype: fp32
+ buffer_dtype: fp32
+ ibot_head:
+ sharding_strategy: SHARD_GRAD_OP
+ mixed_precision:
+ param_dtype: fp16
+ reduce_dtype: fp32
+ buffer_dtype: fp32
+dino:
+ loss_weight: 1.0
+ head_n_prototypes: 65536
+ head_bottleneck_dim: 256
+ head_nlayers: 3
+ head_hidden_dim: 2048
+ koleo_loss_weight: 0.1
+ibot:
+ loss_weight: 1.0
+ mask_sample_probability: 0.5
+ mask_ratio_min_max:
+ - 0.1
+ - 0.5
+ separate_head: false
+ head_n_prototypes: 65536
+ head_bottleneck_dim: 256
+ head_nlayers: 3
+ head_hidden_dim: 2048
+train:
+ batch_size_per_gpu: 64
+ dataset_path: ImageNet:split=TRAIN
+ output_dir: .
+ saveckp_freq: 20
+ seed: 0
+ num_workers: 10
+ OFFICIAL_EPOCH_LENGTH: 1250
+ cache_dataset: true
+ centering: "centering" # or "sinkhorn_knopp"
+student:
+ arch: vit_large
+ patch_size: 16
+ drop_path_rate: 0.3
+ layerscale: 1.0e-05
+ drop_path_uniform: true
+ pretrained_weights: ''
+ ffn_layer: "mlp"
+ block_chunks: 0
+ qkv_bias: true
+ proj_bias: true
+ ffn_bias: true
+teacher:
+ momentum_teacher: 0.992
+ final_momentum_teacher: 1
+ warmup_teacher_temp: 0.04
+ teacher_temp: 0.07
+ warmup_teacher_temp_epochs: 30
+optim:
+ epochs: 100
+ weight_decay: 0.04
+ weight_decay_end: 0.4
+ base_lr: 0.004 # learning rate for a batch size of 1024
+ lr: 0. # will be set after applying scaling rule
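+ # illustration (an assumption about the rule, not upstream config): with
+ # scaling_rule sqrt_wrt_1024, lr = base_lr * sqrt(total_batch_size / 1024),
+ # e.g. 64 images per GPU on 16 GPUs gives lr = 0.004 * sqrt(1024 / 1024) = 0.004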
+ warmup_epochs: 10
+ min_lr: 1.0e-06
+ clip_grad: 3.0
+ freeze_last_layer_epochs: 1
+ scaling_rule: sqrt_wrt_1024
+ patch_embed_lr_mult: 0.2
+ layerwise_decay: 0.9
+ adamw_beta1: 0.9
+ adamw_beta2: 0.999
+crops:
+ global_crops_scale:
+ - 0.32
+ - 1.0
+ local_crops_number: 8
+ local_crops_scale:
+ - 0.05
+ - 0.32
+ global_crops_size: 224
+ local_crops_size: 96
+evaluation:
+ eval_period_iterations: 12500
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..09475150a1aa6c356bb9ed6c6933a6f226a2d032
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml
@@ -0,0 +1,26 @@
+dino:
+ head_n_prototypes: 131072
+ head_bottleneck_dim: 384
+ibot:
+ separate_head: true
+ head_n_prototypes: 131072
+train:
+ batch_size_per_gpu: 12
+ dataset_path: ImageNet22k
+ centering: sinkhorn_knopp
+student:
+ arch: vit_giant2
+ patch_size: 14
+ drop_path_rate: 0.4
+ ffn_layer: swiglufused
+ block_chunks: 4
+teacher:
+ momentum_teacher: 0.994
+optim:
+ epochs: 500
+ weight_decay_end: 0.2
+ base_lr: 2.0e-04 # learning rate for a batch size of 1024
+ warmup_epochs: 80
+ layerwise_decay: 1.0
+crops:
+ local_crops_size: 98
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0e23046c457e572545908817ef2cb4ba46dc5b9a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml
@@ -0,0 +1,26 @@
+dino:
+ head_n_prototypes: 131072
+ head_bottleneck_dim: 384
+ibot:
+ separate_head: true
+ head_n_prototypes: 131072
+train:
+ batch_size_per_gpu: 32
+ dataset_path: ImageNet22k
+ centering: sinkhorn_knopp
+student:
+ arch: vit_large
+ patch_size: 14
+ drop_path_rate: 0.4
+ ffn_layer: swiglufused
+ block_chunks: 4
+teacher:
+ momentum_teacher: 0.994
+optim:
+ epochs: 500
+ weight_decay_end: 0.2
+ base_lr: 2.0e-04 # learning rate for a batch size of 1024
+ warmup_epochs: 80
+ layerwise_decay: 1.0
+crops:
+ local_crops_size: 98
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..57a1ccc912602ea2bb10f4eb36899d82db49ef88
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml
@@ -0,0 +1,6 @@
+# this corresponds to the default config
+train:
+ dataset_path: ImageNet:split=TRAIN
+ batch_size_per_gpu: 64
+student:
+ block_chunks: 4
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f83931025d8b4beeba39f8b23066f1cfa7970823
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .adapters import DatasetWithEnumeratedTargets
+from .loaders import make_data_loader, make_dataset, SamplerType
+from .collate import collate_data_and_cast
+from .masking import MaskingGenerator
+from .augmentations import DataAugmentationDINO
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py
new file mode 100644
index 0000000000000000000000000000000000000000..76112995039b7e8af135d836950f58e7cbc32e99
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Tuple
+
+from torch.utils.data import Dataset
+
+
+class DatasetWithEnumeratedTargets(Dataset):
+ def __init__(self, dataset):
+ self._dataset = dataset
+
+ def get_image_data(self, index: int) -> bytes:
+ return self._dataset.get_image_data(index)
+
+ def get_target(self, index: int) -> Tuple[Any, int]:
+ target = self._dataset.get_target(index)
+ return (index, target)
+
+ def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]:
+ image, target = self._dataset[index]
+ target = index if target is None else target
+ return image, (index, target)
+
+ def __len__(self) -> int:
+ return len(self._dataset)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py
new file mode 100644
index 0000000000000000000000000000000000000000..18ee2691d8c1b5a6e799386b3b4ae7260a516038
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from torchvision import transforms
+
+from .transforms import (
+ GaussianBlur,
+ make_normalize_transform,
+)
+
+
+logger = logging.getLogger("dinov2")
+
+
+class DataAugmentationDINO(object):
+ def __init__(
+ self,
+ global_crops_scale,
+ local_crops_scale,
+ local_crops_number,
+ global_crops_size=224,
+ local_crops_size=96,
+ ):
+ self.global_crops_scale = global_crops_scale
+ self.local_crops_scale = local_crops_scale
+ self.local_crops_number = local_crops_number
+ self.global_crops_size = global_crops_size
+ self.local_crops_size = local_crops_size
+
+ logger.info("###################################")
+ logger.info("Using data augmentation parameters:")
+ logger.info(f"global_crops_scale: {global_crops_scale}")
+ logger.info(f"local_crops_scale: {local_crops_scale}")
+ logger.info(f"local_crops_number: {local_crops_number}")
+ logger.info(f"global_crops_size: {global_crops_size}")
+ logger.info(f"local_crops_size: {local_crops_size}")
+ logger.info("###################################")
+
+ # random resized crop and flip
+ self.geometric_augmentation_global = transforms.Compose(
+ [
+ transforms.RandomResizedCrop(
+ global_crops_size, scale=global_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC
+ ),
+ transforms.RandomHorizontalFlip(p=0.5),
+ ]
+ )
+
+ self.geometric_augmentation_local = transforms.Compose(
+ [
+ transforms.RandomResizedCrop(
+ local_crops_size, scale=local_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC
+ ),
+ transforms.RandomHorizontalFlip(p=0.5),
+ ]
+ )
+
+        # color distortions / blurring
+ color_jittering = transforms.Compose(
+ [
+ transforms.RandomApply(
+ [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
+ p=0.8,
+ ),
+ transforms.RandomGrayscale(p=0.2),
+ ]
+ )
+
+ global_transfo1_extra = GaussianBlur(p=1.0)
+
+ global_transfo2_extra = transforms.Compose(
+ [
+ GaussianBlur(p=0.1),
+ transforms.RandomSolarize(threshold=128, p=0.2),
+ ]
+ )
+
+ local_transfo_extra = GaussianBlur(p=0.5)
+
+ # normalization
+ self.normalize = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ make_normalize_transform(),
+ ]
+ )
+
+ self.global_transfo1 = transforms.Compose([color_jittering, global_transfo1_extra, self.normalize])
+ self.global_transfo2 = transforms.Compose([color_jittering, global_transfo2_extra, self.normalize])
+ self.local_transfo = transforms.Compose([color_jittering, local_transfo_extra, self.normalize])
+
+ def __call__(self, image):
+ output = {}
+
+ # global crops:
+ im1_base = self.geometric_augmentation_global(image)
+ global_crop_1 = self.global_transfo1(im1_base)
+
+ im2_base = self.geometric_augmentation_global(image)
+ global_crop_2 = self.global_transfo2(im2_base)
+
+ output["global_crops"] = [global_crop_1, global_crop_2]
+
+ # global crops for teacher:
+ output["global_crops_teacher"] = [global_crop_1, global_crop_2]
+
+ # local crops:
+ local_crops = [
+ self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number)
+ ]
+ output["local_crops"] = local_crops
+ output["offsets"] = ()
+
+ return output
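+
+# Illustrative usage (sketch; scale/count values mirror the crops section of ssl_default_config.yaml):
+#   aug = DataAugmentationDINO((0.32, 1.0), (0.05, 0.32), local_crops_number=8)
+#   crops = aug(pil_image)  # dict with global_crops, global_crops_teacher, local_crops, offsets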
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d5ea0fc89cd30037c7471acf975964a755758ee
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import random
+
+
+def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None):
+ # dtype = torch.half # TODO: Remove
+
+ n_global_crops = len(samples_list[0][0]["global_crops"])
+ n_local_crops = len(samples_list[0][0]["local_crops"])
+
+ collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list])
+
+ collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list])
+
+ B = len(collated_global_crops)
+ N = n_tokens
+ n_samples_masked = int(B * mask_probability)
+ probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1)
+ upperbound = 0
+ masks_list = []
+ for i in range(0, n_samples_masked):
+ prob_min = probs[i]
+ prob_max = probs[i + 1]
+ masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max)))))
+ upperbound += int(N * prob_max)
+ for i in range(n_samples_masked, B):
+ masks_list.append(torch.BoolTensor(mask_generator(0)))
+
+ random.shuffle(masks_list)
+
+ collated_masks = torch.stack(masks_list).flatten(1)
+ mask_indices_list = collated_masks.flatten().nonzero().flatten()
+
+ masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks]
+
+ return {
+ "collated_global_crops": collated_global_crops.to(dtype),
+ "collated_local_crops": collated_local_crops.to(dtype),
+ "collated_masks": collated_masks,
+ "mask_indices_list": mask_indices_list,
+ "masks_weight": masks_weight,
+ "upperbound": upperbound,
+ "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long),
+ }
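+
+# Shape sketch (illustrative, assuming the default 224/96 crop sizes and 2 global plus
+# 8 local crops per sample): for a batch of B samples, collated_global_crops has shape
+# (2*B, 3, 224, 224) and collated_local_crops (8*B, 3, 96, 96); masks are generated per
+# global crop, with roughly mask_probability of them non-empty.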
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e959aff998ff577a6ea73e81931817a7854b883
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .image_net import ImageNet
+from .image_net_22k import ImageNet22k
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5ee2b0408f2b533df6bd5cb65450b1947a47582
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py
@@ -0,0 +1,32 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from io import BytesIO
+from typing import Any
+
+from PIL import Image
+
+
+class Decoder:
+ def decode(self) -> Any:
+ raise NotImplementedError
+
+
+class ImageDataDecoder(Decoder):
+ def __init__(self, image_data: bytes) -> None:
+ self._image_data = image_data
+
+ def decode(self) -> Image:
+ f = BytesIO(self._image_data)
+ return Image.open(f).convert(mode="RGB")
+
+
+class TargetDecoder(Decoder):
+ def __init__(self, target: Any):
+ self._target = target
+
+ def decode(self) -> Any:
+ return self._target
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py
new file mode 100644
index 0000000000000000000000000000000000000000..f21e895f88a61d00be6e3f8c57eb8395f2f8b3e3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py
@@ -0,0 +1,39 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Tuple
+
+from torchvision.datasets import VisionDataset
+
+from .decoders import TargetDecoder, ImageDataDecoder
+
+
+class ExtendedVisionDataset(VisionDataset):
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs) # type: ignore
+
+ def get_image_data(self, index: int) -> bytes:
+ raise NotImplementedError
+
+ def get_target(self, index: int) -> Any:
+ raise NotImplementedError
+
+ def __getitem__(self, index: int) -> Tuple[Any, Any]:
+ try:
+ image_data = self.get_image_data(index)
+ image = ImageDataDecoder(image_data).decode()
+ except Exception as e:
+ raise RuntimeError(f"can not read image for sample {index}") from e
+ target = self.get_target(index)
+ target = TargetDecoder(target).decode()
+
+ if self.transforms is not None:
+ image, target = self.transforms(image, target)
+
+ return image, target
+
+ def __len__(self) -> int:
+ raise NotImplementedError
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..82a3747749af2c24db7f6eb4bd13988231abd781
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net.py
@@ -0,0 +1,291 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import csv
+from enum import Enum
+import logging
+import os
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from .extended import ExtendedVisionDataset
+
+
+logger = logging.getLogger("dinov2")
+_Target = int
+
+
+class _Split(Enum):
+ TRAIN = "train"
+ VAL = "val"
+ TEST = "test" # NOTE: torchvision does not support the test split
+
+ @property
+ def length(self) -> int:
+ split_lengths = {
+ _Split.TRAIN: 1_281_167,
+ _Split.VAL: 50_000,
+ _Split.TEST: 100_000,
+ }
+ return split_lengths[self]
+
+ def get_dirname(self, class_id: Optional[str] = None) -> str:
+ return self.value if class_id is None else os.path.join(self.value, class_id)
+
+ def get_image_relpath(self, actual_index: int, class_id: Optional[str] = None) -> str:
+ dirname = self.get_dirname(class_id)
+ if self == _Split.TRAIN:
+ basename = f"{class_id}_{actual_index}"
+ else: # self in (_Split.VAL, _Split.TEST):
+ basename = f"ILSVRC2012_{self.value}_{actual_index:08d}"
+ return os.path.join(dirname, basename + ".JPEG")
+
+ def parse_image_relpath(self, image_relpath: str) -> Tuple[str, int]:
+ assert self != _Split.TEST
+ dirname, filename = os.path.split(image_relpath)
+ class_id = os.path.split(dirname)[-1]
+ basename, _ = os.path.splitext(filename)
+ actual_index = int(basename.split("_")[-1])
+ return class_id, actual_index
+
+
+class ImageNet(ExtendedVisionDataset):
+ Target = Union[_Target]
+ Split = Union[_Split]
+
+ def __init__(
+ self,
+ *,
+ split: "ImageNet.Split",
+ root: str,
+ extra: str,
+ transforms: Optional[Callable] = None,
+ transform: Optional[Callable] = None,
+ target_transform: Optional[Callable] = None,
+ ) -> None:
+ super().__init__(root, transforms, transform, target_transform)
+ self._extra_root = extra
+ self._split = split
+
+ self._entries = None
+ self._class_ids = None
+ self._class_names = None
+
+ @property
+ def split(self) -> "ImageNet.Split":
+ return self._split
+
+ def _get_extra_full_path(self, extra_path: str) -> str:
+ return os.path.join(self._extra_root, extra_path)
+
+ def _load_extra(self, extra_path: str) -> np.ndarray:
+ extra_full_path = self._get_extra_full_path(extra_path)
+ return np.load(extra_full_path, mmap_mode="r")
+
+ def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None:
+ extra_full_path = self._get_extra_full_path(extra_path)
+ os.makedirs(self._extra_root, exist_ok=True)
+ np.save(extra_full_path, extra_array)
+
+ @property
+ def _entries_path(self) -> str:
+ return f"entries-{self._split.value.upper()}.npy"
+
+ @property
+ def _class_ids_path(self) -> str:
+ return f"class-ids-{self._split.value.upper()}.npy"
+
+ @property
+ def _class_names_path(self) -> str:
+ return f"class-names-{self._split.value.upper()}.npy"
+
+ def _get_entries(self) -> np.ndarray:
+ if self._entries is None:
+ self._entries = self._load_extra(self._entries_path)
+ assert self._entries is not None
+ return self._entries
+
+ def _get_class_ids(self) -> np.ndarray:
+ if self._split == _Split.TEST:
+ assert False, "Class IDs are not available in TEST split"
+ if self._class_ids is None:
+ self._class_ids = self._load_extra(self._class_ids_path)
+ assert self._class_ids is not None
+ return self._class_ids
+
+ def _get_class_names(self) -> np.ndarray:
+ if self._split == _Split.TEST:
+ assert False, "Class names are not available in TEST split"
+ if self._class_names is None:
+ self._class_names = self._load_extra(self._class_names_path)
+ assert self._class_names is not None
+ return self._class_names
+
+ def find_class_id(self, class_index: int) -> str:
+ class_ids = self._get_class_ids()
+ return str(class_ids[class_index])
+
+ def find_class_name(self, class_index: int) -> str:
+ class_names = self._get_class_names()
+ return str(class_names[class_index])
+
+ def get_image_data(self, index: int) -> bytes:
+ entries = self._get_entries()
+ actual_index = entries[index]["actual_index"]
+
+ class_id = self.get_class_id(index)
+
+ image_relpath = self.split.get_image_relpath(actual_index, class_id)
+ image_full_path = os.path.join(self.root, image_relpath)
+ with open(image_full_path, mode="rb") as f:
+ image_data = f.read()
+ return image_data
+
+ def get_target(self, index: int) -> Optional[Target]:
+ entries = self._get_entries()
+ class_index = entries[index]["class_index"]
+ return None if self.split == _Split.TEST else int(class_index)
+
+ def get_targets(self) -> Optional[np.ndarray]:
+ entries = self._get_entries()
+ return None if self.split == _Split.TEST else entries["class_index"]
+
+ def get_class_id(self, index: int) -> Optional[str]:
+ entries = self._get_entries()
+ class_id = entries[index]["class_id"]
+ return None if self.split == _Split.TEST else str(class_id)
+
+ def get_class_name(self, index: int) -> Optional[str]:
+ entries = self._get_entries()
+ class_name = entries[index]["class_name"]
+ return None if self.split == _Split.TEST else str(class_name)
+
+ def __len__(self) -> int:
+ entries = self._get_entries()
+ assert len(entries) == self.split.length
+ return len(entries)
+
+ def _load_labels(self, labels_path: str) -> List[Tuple[str, str]]:
+ labels_full_path = os.path.join(self.root, labels_path)
+ labels = []
+
+ try:
+ with open(labels_full_path, "r") as f:
+ reader = csv.reader(f)
+ for row in reader:
+ class_id, class_name = row
+ labels.append((class_id, class_name))
+ except OSError as e:
+ raise RuntimeError(f'can not read labels file "{labels_full_path}"') from e
+
+ return labels
+
+ def _dump_entries(self) -> None:
+ split = self.split
+ if split == ImageNet.Split.TEST:
+ dataset = None
+ sample_count = split.length
+ max_class_id_length, max_class_name_length = 0, 0
+ else:
+ labels_path = "labels.txt"
+ logger.info(f'loading labels from "{labels_path}"')
+ labels = self._load_labels(labels_path)
+
+ # NOTE: Using torchvision ImageFolder for consistency
+ from torchvision.datasets import ImageFolder
+
+ dataset_root = os.path.join(self.root, split.get_dirname())
+ dataset = ImageFolder(dataset_root)
+ sample_count = len(dataset)
+ max_class_id_length, max_class_name_length = -1, -1
+ for sample in dataset.samples:
+ _, class_index = sample
+ class_id, class_name = labels[class_index]
+ max_class_id_length = max(len(class_id), max_class_id_length)
+ max_class_name_length = max(len(class_name), max_class_name_length)
+
+ dtype = np.dtype(
+ [
+ ("actual_index", " old_percent:
+ logger.info(f"creating entries: {percent}%")
+ old_percent = percent
+
+ actual_index = index + 1
+ class_index = np.uint32(-1)
+ class_id, class_name = "", ""
+ entries_array[index] = (actual_index, class_index, class_id, class_name)
+ else:
+ class_names = {class_id: class_name for class_id, class_name in labels}
+
+ assert dataset
+ old_percent = -1
+ for index in range(sample_count):
+ percent = 100 * (index + 1) // sample_count
+ if percent > old_percent:
+ logger.info(f"creating entries: {percent}%")
+ old_percent = percent
+
+ image_full_path, class_index = dataset.samples[index]
+ image_relpath = os.path.relpath(image_full_path, self.root)
+ class_id, actual_index = split.parse_image_relpath(image_relpath)
+ class_name = class_names[class_id]
+ entries_array[index] = (actual_index, class_index, class_id, class_name)
+
+ logger.info(f'saving entries to "{self._entries_path}"')
+ self._save_extra(entries_array, self._entries_path)
+
+ def _dump_class_ids_and_names(self) -> None:
+ split = self.split
+ if split == ImageNet.Split.TEST:
+ return
+
+ entries_array = self._load_extra(self._entries_path)
+
+ max_class_id_length, max_class_name_length, max_class_index = -1, -1, -1
+ for entry in entries_array:
+ class_index, class_id, class_name = (
+ entry["class_index"],
+ entry["class_id"],
+ entry["class_name"],
+ )
+ max_class_index = max(int(class_index), max_class_index)
+ max_class_id_length = max(len(str(class_id)), max_class_id_length)
+ max_class_name_length = max(len(str(class_name)), max_class_name_length)
+
+ class_count = max_class_index + 1
+ class_ids_array = np.empty(class_count, dtype=f"U{max_class_id_length}")
+ class_names_array = np.empty(class_count, dtype=f"U{max_class_name_length}")
+ for entry in entries_array:
+ class_index, class_id, class_name = (
+ entry["class_index"],
+ entry["class_id"],
+ entry["class_name"],
+ )
+ class_ids_array[class_index] = class_id
+ class_names_array[class_index] = class_name
+
+ logger.info(f'saving class IDs to "{self._class_ids_path}"')
+ self._save_extra(class_ids_array, self._class_ids_path)
+
+ logger.info(f'saving class names to "{self._class_names_path}"')
+ self._save_extra(class_names_array, self._class_names_path)
+
+ def dump_extra(self) -> None:
+ self._dump_entries()
+ self._dump_class_ids_and_names()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net_22k.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net_22k.py
new file mode 100644
index 0000000000000000000000000000000000000000..96f8bd43a2356ed0bdde1f1ceac95a10e0590086
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net_22k.py
@@ -0,0 +1,303 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from enum import Enum
+from functools import lru_cache
+from gzip import GzipFile
+from io import BytesIO
+from mmap import ACCESS_READ, mmap
+import os
+from typing import Any, Callable, List, Optional, Set, Tuple
+import warnings
+
+import numpy as np
+
+from .extended import ExtendedVisionDataset
+
+
+_Labels = int
+
+_DEFAULT_MMAP_CACHE_SIZE = 16 # Warning: This can exhaust file descriptors
+
+
+@dataclass
+class _ClassEntry:
+ block_offset: int
+ maybe_filename: Optional[str] = None
+
+
+@dataclass
+class _Entry:
+ class_index: int # noqa: E701
+ start_offset: int
+ end_offset: int
+ filename: str
+
+
+class _Split(Enum):
+ TRAIN = "train"
+ VAL = "val"
+
+ @property
+ def length(self) -> int:
+ return {
+ _Split.TRAIN: 11_797_647,
+ _Split.VAL: 561_050,
+ }[self]
+
+ def entries_path(self):
+ return f"imagenet21kp_{self.value}.txt"
+
+
+def _get_tarball_path(class_id: str) -> str:
+ return f"{class_id}.tar"
+
+
+def _make_mmap_tarball(tarballs_root: str, mmap_cache_size: int):
+ @lru_cache(maxsize=mmap_cache_size)
+ def _mmap_tarball(class_id: str) -> mmap:
+ tarball_path = _get_tarball_path(class_id)
+ tarball_full_path = os.path.join(tarballs_root, tarball_path)
+ with open(tarball_full_path) as f:
+ return mmap(fileno=f.fileno(), length=0, access=ACCESS_READ)
+
+ return _mmap_tarball
+
+
+class ImageNet22k(ExtendedVisionDataset):
+ _GZIPPED_INDICES: Set[int] = {
+ 841_545,
+ 1_304_131,
+ 2_437_921,
+ 2_672_079,
+ 2_795_676,
+ 2_969_786,
+ 6_902_965,
+ 6_903_550,
+ 6_903_628,
+ 7_432_557,
+ 7_432_589,
+ 7_813_809,
+ 8_329_633,
+ 10_296_990,
+ 10_417_652,
+ 10_492_265,
+ 10_598_078,
+ 10_782_398,
+ 10_902_612,
+ 11_203_736,
+ 11_342_890,
+ 11_397_596,
+ 11_589_762,
+ 11_705_103,
+ 12_936_875,
+ 13_289_782,
+ }
+ Labels = _Labels
+
+ def __init__(
+ self,
+ *,
+ root: str,
+ extra: str,
+ transforms: Optional[Callable] = None,
+ transform: Optional[Callable] = None,
+ target_transform: Optional[Callable] = None,
+ mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE,
+ ) -> None:
+ super().__init__(root, transforms, transform, target_transform)
+ self._extra_root = extra
+
+ entries_path = self._get_entries_path(root)
+ self._entries = self._load_extra(entries_path)
+
+ class_ids_path = self._get_class_ids_path(root)
+ self._class_ids = self._load_extra(class_ids_path)
+
+ self._gzipped_indices = ImageNet22k._GZIPPED_INDICES
+ self._mmap_tarball = _make_mmap_tarball(self._tarballs_root, mmap_cache_size)
+
+ def _get_entries_path(self, root: Optional[str] = None) -> str:
+ return "entries.npy"
+
+ def _get_class_ids_path(self, root: Optional[str] = None) -> str:
+ return "class-ids.npy"
+
+ def _find_class_ids(self, path: str) -> List[str]:
+ class_ids = []
+
+ with os.scandir(path) as entries:
+ for entry in entries:
+ root, ext = os.path.splitext(entry.name)
+ if ext != ".tar":
+ continue
+ class_ids.append(root)
+
+ return sorted(class_ids)
+
+ def _load_entries_class_ids(self, root: Optional[str] = None) -> Tuple[List[_Entry], List[str]]:
+ root = self.get_root(root)
+ entries: List[_Entry] = []
+ class_ids = self._find_class_ids(root)
+
+ for class_index, class_id in enumerate(class_ids):
+ path = os.path.join(root, "blocks", f"{class_id}.log")
+ class_entries = []
+
+ try:
+ with open(path) as f:
+ for line in f:
+ line = line.rstrip()
+ block, filename = line.split(":")
+ block_offset = int(block[6:])
+ filename = filename[1:]
+
+ maybe_filename = None
+ if filename != "** Block of NULs **":
+ maybe_filename = filename
+ _, ext = os.path.splitext(filename)
+ # assert ext == ".JPEG"
+
+ class_entry = _ClassEntry(block_offset, maybe_filename)
+ class_entries.append(class_entry)
+ except OSError as e:
+ raise RuntimeError(f'can not read blocks file "{path}"') from e
+
+ assert class_entries[-1].maybe_filename is None
+
+ for class_entry1, class_entry2 in zip(class_entries, class_entries[1:]):
+ assert class_entry1.block_offset <= class_entry2.block_offset
+ start_offset = 512 * class_entry1.block_offset
+ end_offset = 512 * class_entry2.block_offset
+ assert class_entry1.maybe_filename is not None
+ filename = class_entry1.maybe_filename
+ entry = _Entry(class_index, start_offset, end_offset, filename)
+ # Skip invalid image files (PIL throws UnidentifiedImageError)
+ if filename == "n06470073_47249.JPEG":
+ continue
+ entries.append(entry)
+
+ return entries, class_ids
+
+ def _load_extra(self, extra_path: str) -> np.ndarray:
+ extra_root = self._extra_root
+ extra_full_path = os.path.join(extra_root, extra_path)
+ return np.load(extra_full_path, mmap_mode="r")
+
+ def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None:
+ extra_root = self._extra_root
+ extra_full_path = os.path.join(extra_root, extra_path)
+ os.makedirs(extra_root, exist_ok=True)
+ np.save(extra_full_path, extra_array)
+
+ @property
+ def _tarballs_root(self) -> str:
+ return self.root
+
+ def find_class_id(self, class_index: int) -> str:
+ return str(self._class_ids[class_index])
+
+ def get_image_data(self, index: int) -> bytes:
+ entry = self._entries[index]
+ class_id = entry["class_id"]
+ class_mmap = self._mmap_tarball(class_id)
+
+ start_offset, end_offset = entry["start_offset"], entry["end_offset"]
+ try:
+ mapped_data = class_mmap[start_offset:end_offset]
+ data = mapped_data[512:] # Skip entry header block
+
+ if len(data) >= 2 and tuple(data[:2]) == (0x1F, 0x8B):
+ assert index in self._gzipped_indices, f"unexpected gzip header for sample {index}"
+ with GzipFile(fileobj=BytesIO(data)) as g:
+ data = g.read()
+ except Exception as e:
+ raise RuntimeError(f"can not retrieve image data for sample {index} " f'from "{class_id}" tarball') from e
+
+ return data
+
+ def get_target(self, index: int) -> Any:
+ return int(self._entries[index]["class_index"])
+
+ def get_targets(self) -> np.ndarray:
+ return self._entries["class_index"]
+
+ def get_class_id(self, index: int) -> str:
+ return str(self._entries[index]["class_id"])
+
+ def get_class_ids(self) -> np.ndarray:
+ return self._entries["class_id"]
+
+ def __getitem__(self, index: int) -> Tuple[Any, Any]:
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ return super().__getitem__(index)
+
+ def __len__(self) -> int:
+ return len(self._entries)
+
+ def _dump_entries(self, *args, **kwargs) -> None:
+ entries, class_ids = self._load_entries_class_ids(*args, **kwargs)
+
+ max_class_id_length, max_filename_length, max_class_index = -1, -1, -1
+ for entry in entries:
+ class_id = class_ids[entry.class_index]
+ max_class_index = max(entry.class_index, max_class_index)
+ max_class_id_length = max(len(class_id), max_class_id_length)
+ max_filename_length = max(len(entry.filename), max_filename_length)
+
+ dtype = np.dtype(
+ [
+ ("class_index", " None:
+ entries_path = self._get_entries_path(*args, **kwargs)
+ entries_array = self._load_extra(entries_path)
+
+ max_class_id_length, max_class_index = -1, -1
+ for entry in entries_array:
+ class_index, class_id = entry["class_index"], entry["class_id"]
+ max_class_index = max(int(class_index), max_class_index)
+ max_class_id_length = max(len(str(class_id)), max_class_id_length)
+
+ class_ids_array = np.empty(max_class_index + 1, dtype=f"U{max_class_id_length}")
+ for entry in entries_array:
+ class_index, class_id = entry["class_index"], entry["class_id"]
+ class_ids_array[class_index] = class_id
+ class_ids_path = self._get_class_ids_path(*args, **kwargs)
+ self._save_extra(class_ids_array, class_ids_path)
+
+ def _dump_extra(self, *args, **kwargs) -> None:
+        self._dump_entries(*args, **kwargs)
+        self._dump_class_ids(*args, **kwargs)
+
+ def dump_extra(self, root: Optional[str] = None) -> None:
+ return self._dump_extra(root)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb430b6ef88b1d6f536e52b8ec78703b4023a28
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py
@@ -0,0 +1,223 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from enum import Enum
+from typing import Any, Callable, List, Optional, TypeVar
+
+import torch
+from torch.utils.data import Sampler
+
+from .datasets import ImageNet, ImageNet22k
+from .samplers import EpochSampler, InfiniteSampler, ShardedInfiniteSampler
+
+
+logger = logging.getLogger("dinov2")
+
+
+class SamplerType(Enum):
+ DISTRIBUTED = 0
+ EPOCH = 1
+ INFINITE = 2
+ SHARDED_INFINITE = 3
+ SHARDED_INFINITE_NEW = 4
+
+
+def _make_bool_str(b: bool) -> str:
+ return "yes" if b else "no"
+
+
+def _make_sample_transform(image_transform: Optional[Callable] = None, target_transform: Optional[Callable] = None):
+ def transform(sample):
+ image, target = sample
+ if image_transform is not None:
+ image = image_transform(image)
+ if target_transform is not None:
+ target = target_transform(target)
+ return image, target
+
+ return transform
+
+
+def _parse_dataset_str(dataset_str: str):
+ tokens = dataset_str.split(":")
+
+ name = tokens[0]
+ kwargs = {}
+
+ for token in tokens[1:]:
+ key, value = token.split("=")
+ assert key in ("root", "extra", "split")
+ kwargs[key] = value
+
+ if name == "ImageNet":
+ class_ = ImageNet
+ if "split" in kwargs:
+ kwargs["split"] = ImageNet.Split[kwargs["split"]]
+ elif name == "ImageNet22k":
+ class_ = ImageNet22k
+ else:
+ raise ValueError(f'Unsupported dataset "{name}"')
+
+ return class_, kwargs
+
+
+def make_dataset(
+ *,
+ dataset_str: str,
+ transform: Optional[Callable] = None,
+ target_transform: Optional[Callable] = None,
+):
+ """
+ Creates a dataset with the specified parameters.
+
+ Args:
+ dataset_str: A dataset string description (e.g. ImageNet:split=TRAIN).
+ transform: A transform to apply to images.
+ target_transform: A transform to apply to targets.
+
+ Returns:
+ The created dataset.
+ """
+ logger.info(f'using dataset: "{dataset_str}"')
+
+ class_, kwargs = _parse_dataset_str(dataset_str)
+ dataset = class_(transform=transform, target_transform=target_transform, **kwargs)
+
+ logger.info(f"# of dataset samples: {len(dataset):,d}")
+
+ # Aggregated datasets do not expose (yet) these attributes, so add them.
+ if not hasattr(dataset, "transform"):
+ setattr(dataset, "transform", transform)
+ if not hasattr(dataset, "target_transform"):
+ setattr(dataset, "target_transform", target_transform)
+
+ return dataset
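+
+# Example (illustrative): dataset strings are parsed by _parse_dataset_str above, so
+#   make_dataset(dataset_str="ImageNet:split=TRAIN:root=/path/to/imagenet:extra=/path/to/extra")
+# resolves to ImageNet(split=ImageNet.Split.TRAIN, root="/path/to/imagenet", extra="/path/to/extra").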
+
+
+def _make_sampler(
+ *,
+ dataset,
+ type: Optional[SamplerType] = None,
+ shuffle: bool = False,
+ seed: int = 0,
+ size: int = -1,
+ advance: int = 0,
+) -> Optional[Sampler]:
+ sample_count = len(dataset)
+
+ if type == SamplerType.INFINITE:
+ logger.info("sampler: infinite")
+ if size > 0:
+ raise ValueError("sampler size > 0 is invalid")
+ return InfiniteSampler(
+ sample_count=sample_count,
+ shuffle=shuffle,
+ seed=seed,
+ advance=advance,
+ )
+ elif type in (SamplerType.SHARDED_INFINITE, SamplerType.SHARDED_INFINITE_NEW):
+ logger.info("sampler: sharded infinite")
+ if size > 0:
+ raise ValueError("sampler size > 0 is invalid")
+ # TODO: Remove support for old shuffling
+ use_new_shuffle_tensor_slice = type == SamplerType.SHARDED_INFINITE_NEW
+ return ShardedInfiniteSampler(
+ sample_count=sample_count,
+ shuffle=shuffle,
+ seed=seed,
+ advance=advance,
+ use_new_shuffle_tensor_slice=use_new_shuffle_tensor_slice,
+ )
+ elif type == SamplerType.EPOCH:
+ logger.info("sampler: epoch")
+ if advance > 0:
+ raise NotImplementedError("sampler advance > 0 is not supported")
+ size = size if size > 0 else sample_count
+ logger.info(f"# of samples / epoch: {size:,d}")
+ return EpochSampler(
+ size=size,
+ sample_count=sample_count,
+ shuffle=shuffle,
+ seed=seed,
+ )
+ elif type == SamplerType.DISTRIBUTED:
+ logger.info("sampler: distributed")
+ if size > 0:
+ raise ValueError("sampler size > 0 is invalid")
+ if advance > 0:
+ raise ValueError("sampler advance > 0 is invalid")
+ return torch.utils.data.DistributedSampler(
+ dataset=dataset,
+ shuffle=shuffle,
+ seed=seed,
+ drop_last=False,
+ )
+
+ logger.info("sampler: none")
+ return None
+
+
+T = TypeVar("T")
+
+
+def make_data_loader(
+ *,
+ dataset,
+ batch_size: int,
+ num_workers: int,
+ shuffle: bool = True,
+ seed: int = 0,
+ sampler_type: Optional[SamplerType] = SamplerType.INFINITE,
+ sampler_size: int = -1,
+ sampler_advance: int = 0,
+ drop_last: bool = True,
+ persistent_workers: bool = False,
+ collate_fn: Optional[Callable[[List[T]], Any]] = None,
+):
+ """
+ Creates a data loader with the specified parameters.
+
+ Args:
+ dataset: A dataset (third party, LaViDa or WebDataset).
+ batch_size: The size of batches to generate.
+ num_workers: The number of workers to use.
+ shuffle: Whether to shuffle samples.
+ seed: The random seed to use.
+ sampler_type: Which sampler to use: EPOCH, INFINITE, SHARDED_INFINITE, SHARDED_INFINITE_NEW, DISTRIBUTED or None.
+ sampler_size: The number of images per epoch (when applicable) or -1 for the entire dataset.
+ sampler_advance: How many samples to skip (when applicable).
+ drop_last: Whether the last non-full batch of data should be dropped.
+        persistent_workers: Whether to keep worker Dataset instances alive after the dataset has been consumed once.
+ collate_fn: Function that performs batch collation
+ """
+
+ sampler = _make_sampler(
+ dataset=dataset,
+ type=sampler_type,
+ shuffle=shuffle,
+ seed=seed,
+ size=sampler_size,
+ advance=sampler_advance,
+ )
+
+ logger.info("using PyTorch data loader")
+ data_loader = torch.utils.data.DataLoader(
+ dataset,
+ sampler=sampler,
+ batch_size=batch_size,
+ num_workers=num_workers,
+ pin_memory=True,
+ drop_last=drop_last,
+ persistent_workers=persistent_workers,
+ collate_fn=collate_fn,
+ )
+
+ try:
+ logger.info(f"# of batches: {len(data_loader):,d}")
+ except TypeError: # data loader has no length
+ logger.info("infinite data loader")
+ return data_loader
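+
+# Illustrative wiring (sketch): during SSL training a loader is typically built from the
+# dataset above with an infinite sampler and a masking collate function, e.g.
+#   loader = make_data_loader(dataset=dataset, batch_size=64, num_workers=10,
+#                             sampler_type=SamplerType.SHARDED_INFINITE, collate_fn=collate_fn)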
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bc07293bca0ae20358830a709e489abfd390cae
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py
@@ -0,0 +1,87 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import random
+import math
+import numpy as np
+
+
+class MaskingGenerator:
+ def __init__(
+ self,
+ input_size,
+ num_masking_patches=None,
+ min_num_patches=4,
+ max_num_patches=None,
+ min_aspect=0.3,
+ max_aspect=None,
+ ):
+ if not isinstance(input_size, tuple):
+ input_size = (input_size,) * 2
+ self.height, self.width = input_size
+
+ self.num_patches = self.height * self.width
+ self.num_masking_patches = num_masking_patches
+
+ self.min_num_patches = min_num_patches
+ self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches
+
+ max_aspect = max_aspect or 1 / min_aspect
+ self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))
+
+ def __repr__(self):
+ repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
+ self.height,
+ self.width,
+ self.min_num_patches,
+ self.max_num_patches,
+ self.num_masking_patches,
+ self.log_aspect_ratio[0],
+ self.log_aspect_ratio[1],
+ )
+ return repr_str
+
+ def get_shape(self):
+ return self.height, self.width
+
+ def _mask(self, mask, max_mask_patches):
+ delta = 0
+ for _ in range(10):
+ target_area = random.uniform(self.min_num_patches, max_mask_patches)
+ aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
+ h = int(round(math.sqrt(target_area * aspect_ratio)))
+ w = int(round(math.sqrt(target_area / aspect_ratio)))
+ if w < self.width and h < self.height:
+ top = random.randint(0, self.height - h)
+ left = random.randint(0, self.width - w)
+
+ num_masked = mask[top : top + h, left : left + w].sum()
+ # Overlap
+ if 0 < h * w - num_masked <= max_mask_patches:
+ for i in range(top, top + h):
+ for j in range(left, left + w):
+ if mask[i, j] == 0:
+ mask[i, j] = 1
+ delta += 1
+
+ if delta > 0:
+ break
+ return delta
+
+ def __call__(self, num_masking_patches=0):
+ mask = np.zeros(shape=self.get_shape(), dtype=bool)
+ mask_count = 0
+ while mask_count < num_masking_patches:
+ max_mask_patches = num_masking_patches - mask_count
+ max_mask_patches = min(max_mask_patches, self.max_num_patches)
+
+ delta = self._mask(mask, max_mask_patches)
+ if delta == 0:
+ break
+ else:
+ mask_count += delta
+
+ return mask
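+
+# Illustrative usage (assumption: 224px global crops with patch size 14 give a 16x16 token grid):
+#   mask_generator = MaskingGenerator(input_size=16, max_num_patches=int(0.5 * 16 * 16))
+#   mask = mask_generator(num_masking_patches=64)  # boolean (16, 16) numpy array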
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f65d49df48c0a0a54b172466d21471cbce8e1ac
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py
@@ -0,0 +1,230 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+from typing import Any, Optional
+import warnings
+
+import numpy as np
+import torch
+from torch.utils.data.sampler import Sampler
+
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+
+
+class EpochSampler(Sampler):
+ def __init__(
+ self,
+ *,
+ size: int,
+ sample_count: int,
+ shuffle: bool = False,
+ seed: int = 0,
+ start: Optional[int] = None,
+ step: Optional[int] = None,
+ ):
+ self._size = size
+ self._sample_count = sample_count
+ self._shuffle = shuffle
+ self._seed = seed
+ self._start = distributed.get_global_rank() if start is None else start
+ self._step = distributed.get_global_size() if step is None else step
+ self._epoch = 0
+
+ def __iter__(self):
+ count = (self._size + self._sample_count - 1) // self._sample_count
+ tiled_indices = np.tile(np.arange(self._sample_count), count)
+ if self._shuffle:
+ seed = self._seed * self._epoch if self._seed != 0 else self._epoch
+ rng = np.random.default_rng(seed)
+ iterable = rng.choice(tiled_indices, self._size, replace=False)
+ else:
+ iterable = tiled_indices[: self._size]
+
+ yield from itertools.islice(iterable, self._start, None, self._step)
+
+ def __len__(self):
+ return (self._size - self._start + self._step - 1) // self._step
+
+ def set_epoch(self, epoch):
+ self._epoch = epoch
+
+
+def _get_numpy_dtype(size: int) -> Any:
+ return np.int32 if size <= 2**31 else np.int64
+
+
+def _get_torch_dtype(size: int) -> Any:
+ return torch.int32 if size <= 2**31 else torch.int64
+
+
+def _generate_randperm_indices(*, size: int, generator: torch.Generator):
+ """Generate the indices of a random permutation."""
+ dtype = _get_torch_dtype(size)
+ # This is actually matching PyTorch's CPU implementation, see: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorFactories.cpp#L900-L921
+ perm = torch.arange(size, dtype=dtype)
+ for i in range(size):
+ j = torch.randint(i, size, size=(1,), generator=generator).item()
+
+ # Always swap even if no-op
+ value = perm[j].item()
+ perm[j] = perm[i].item()
+ perm[i] = value
+ yield value
+
+
+class InfiniteSampler(Sampler):
+ def __init__(
+ self,
+ *,
+ sample_count: int,
+ shuffle: bool = False,
+ seed: int = 0,
+ start: Optional[int] = None,
+ step: Optional[int] = None,
+ advance: int = 0,
+ ):
+ self._sample_count = sample_count
+ self._seed = seed
+ self._shuffle = shuffle
+ self._start = distributed.get_global_rank() if start is None else start
+ self._step = distributed.get_global_size() if step is None else step
+ self._advance = advance
+
+ def __iter__(self):
+ if self._shuffle:
+ iterator = self._shuffled_iterator()
+ else:
+ iterator = self._iterator()
+
+ yield from itertools.islice(iterator, self._advance, None)
+
+ def _iterator(self):
+ assert not self._shuffle
+
+ while True:
+ iterable = range(self._sample_count)
+ yield from itertools.islice(iterable, self._start, None, self._step)
+
+ def _shuffled_iterator(self):
+ assert self._shuffle
+
+ # Instantiate a generator here (rather than in the ctor) to keep the class
+ # picklable (requirement of mp.spawn)
+ generator = torch.Generator().manual_seed(self._seed)
+
+ while True:
+ iterable = _generate_randperm_indices(size=self._sample_count, generator=generator)
+ yield from itertools.islice(iterable, self._start, None, self._step)
+
+
+# The following function is somewhat equivalent to _new_shuffle_tensor_slice below,
+# but avoids a full in-place random permutation generation.
+def _shuffle_tensor_slice(
+ *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator
+) -> np.ndarray:
+ stop = len(tensor)
+ count = stop // step
+ drop_count = stop - step * count
+ if drop_count:
+ warnings.warn(f"# of dropped samples: {drop_count}")
+
+ dtype = _get_numpy_dtype(stop)
+ result = np.empty(count, dtype=dtype)
+
+ for i in range(count):
+ j = torch.randint(0, i + 1, size=(1,), generator=generator).item() if i > 0 else 0
+
+ result[i] = result[j]
+ result[j] = tensor[start + i * step].item()
+
+ return result
+
+
+def _new_shuffle_tensor_slice(
+ *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator
+) -> np.ndarray:
+ stop = len(tensor)
+ count = stop // step
+ dtype = torch.int64 # Needed for using randperm result as indices
+ drop_count = stop - step * count
+ if drop_count:
+ warnings.warn(f"# of dropped samples: {drop_count}")
+ indices = torch.randperm(count, dtype=dtype, generator=generator)
+ return tensor[start::step][indices].numpy()
+
+
+def _make_seed(seed: int, start: int, iter_count: int) -> int:
+ # NOTE: Tried a few variants (including iter_count << 32), this one worked best.
+ return seed + start + (iter_count << 24)
+
+
+class ShardedInfiniteSampler(Sampler):
+ def __init__(
+ self,
+ *,
+ sample_count: int,
+ shuffle: bool = False,
+ seed: int = 0,
+ start: Optional[int] = None,
+ step: Optional[int] = None,
+ advance: int = 0,
+ use_new_shuffle_tensor_slice: bool = False,
+ ):
+ self._sample_count = sample_count
+ self._seed = seed
+ self._shuffle = shuffle
+ self._start = distributed.get_global_rank() if start is None else start
+ self._step = distributed.get_global_size() if step is None else step
+ self._advance = advance
+ self._iter_count = 0
+ self._shuffle_tensor_slice_fn = (
+ _new_shuffle_tensor_slice if use_new_shuffle_tensor_slice else _shuffle_tensor_slice
+ )
+
+ def __iter__(self):
+ iter_count = self._advance // self._sample_count
+ if iter_count > 0:
+ self._advance -= iter_count * self._sample_count
+ self._iter_count += iter_count
+
+ if self._shuffle:
+ iterator = self._shuffled_iterator()
+ else:
+ iterator = self._iterator()
+
+ yield from itertools.islice(iterator, self._advance, None)
+
+ def _iterator(self):
+ assert not self._shuffle
+
+ while True:
+ iterable = range(self._sample_count)
+ yield from itertools.islice(iterable, self._start, None, self._step)
+
+ def _shuffled_iterator(self):
+ assert self._shuffle
+
+        # Instantiate a generator here (rather than in the ctor) to keep the class
+ # picklable (requirement of mp.spawn)
+ generator = torch.Generator()
+
+ # Always shuffle everything first
+ generator.manual_seed(self._seed)
+ dtype = _get_torch_dtype(self._sample_count)
+ perm = torch.randperm(self._sample_count, dtype=dtype, generator=generator)
+
+ while True:
+ # Re-seed on each iteration to allow skipping whole permutations
+ seed = _make_seed(self._seed, self._start, self._iter_count)
+ generator.manual_seed(seed)
+
+ iterable = self._shuffle_tensor_slice_fn(
+ tensor=perm, start=self._start, step=self._step, generator=generator
+ )
+ yield from iterable
+ self._iter_count += 1
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..a086bdad17e6c9447b86f1fef596d2a739fc0ac8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py
@@ -0,0 +1,92 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Sequence
+
+import torch
+from torchvision import transforms
+
+
+class GaussianBlur(transforms.RandomApply):
+ """
+ Apply Gaussian Blur to the PIL image.
+ """
+
+ def __init__(self, *, p: float = 0.5, radius_min: float = 0.1, radius_max: float = 2.0):
+ # NOTE: torchvision is applying 1 - probability to return the original image
+ keep_p = 1 - p
+ transform = transforms.GaussianBlur(kernel_size=9, sigma=(radius_min, radius_max))
+ super().__init__(transforms=[transform], p=keep_p)
+
+
+class MaybeToTensor(transforms.ToTensor):
+ """
+ Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor, or keep as is if already a tensor.
+ """
+
+ def __call__(self, pic):
+ """
+ Args:
+ pic (PIL Image, numpy.ndarray or torch.tensor): Image to be converted to tensor.
+ Returns:
+ Tensor: Converted image.
+ """
+ if isinstance(pic, torch.Tensor):
+ return pic
+ return super().__call__(pic)
+
+
+# Use timm's names
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+
+def make_normalize_transform(
+ mean: Sequence[float] = IMAGENET_DEFAULT_MEAN,
+ std: Sequence[float] = IMAGENET_DEFAULT_STD,
+) -> transforms.Normalize:
+ return transforms.Normalize(mean=mean, std=std)
+
+
+# This roughly matches torchvision's preset for classification training:
+# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L6-L44
+def make_classification_train_transform(
+ *,
+ crop_size: int = 224,
+ interpolation=transforms.InterpolationMode.BICUBIC,
+ hflip_prob: float = 0.5,
+ mean: Sequence[float] = IMAGENET_DEFAULT_MEAN,
+ std: Sequence[float] = IMAGENET_DEFAULT_STD,
+):
+ transforms_list = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)]
+ if hflip_prob > 0.0:
+ transforms_list.append(transforms.RandomHorizontalFlip(hflip_prob))
+ transforms_list.extend(
+ [
+ MaybeToTensor(),
+ make_normalize_transform(mean=mean, std=std),
+ ]
+ )
+ return transforms.Compose(transforms_list)
+
+
+# This matches (roughly) torchvision's preset for classification evaluation:
+# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L47-L69
+def make_classification_eval_transform(
+ *,
+ resize_size: int = 256,
+ interpolation=transforms.InterpolationMode.BICUBIC,
+ crop_size: int = 224,
+ mean: Sequence[float] = IMAGENET_DEFAULT_MEAN,
+ std: Sequence[float] = IMAGENET_DEFAULT_STD,
+) -> transforms.Compose:
+ transforms_list = [
+ transforms.Resize(resize_size, interpolation=interpolation),
+ transforms.CenterCrop(crop_size),
+ MaybeToTensor(),
+ make_normalize_transform(mean=mean, std=std),
+ ]
+ return transforms.Compose(transforms_list)
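+
+# Illustrative usage (sketch): the evaluation preset resizes to 256, center-crops to 224,
+# converts to a tensor and applies ImageNet normalization, e.g.
+#   preprocess = make_classification_eval_transform()
+#   x = preprocess(pil_image)  # torch.Tensor of shape (3, 224, 224)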
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3526db2286a0a331f379cc3aa5d592870acde533
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py
@@ -0,0 +1,271 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import random
+import re
+import socket
+from typing import Dict, List
+
+import torch
+import torch.distributed as dist
+
+_LOCAL_RANK = -1
+_LOCAL_WORLD_SIZE = -1
+
+
+def is_enabled() -> bool:
+ """
+ Returns:
+ True if distributed training is enabled
+ """
+ return dist.is_available() and dist.is_initialized()
+
+
+def get_global_size() -> int:
+ """
+ Returns:
+ The number of processes in the process group
+ """
+ return dist.get_world_size() if is_enabled() else 1
+
+
+def get_global_rank() -> int:
+ """
+ Returns:
+ The rank of the current process within the global process group.
+ """
+ return dist.get_rank() if is_enabled() else 0
+
+
+def get_local_rank() -> int:
+ """
+ Returns:
+ The rank of the current process within the local (per-machine) process group.
+ """
+ if not is_enabled():
+ return 0
+ assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE
+ return _LOCAL_RANK
+
+
+def get_local_size() -> int:
+ """
+ Returns:
+ The size of the per-machine process group,
+ i.e. the number of processes per machine.
+ """
+ if not is_enabled():
+ return 1
+ assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE
+ return _LOCAL_WORLD_SIZE
+
+
+def is_main_process() -> bool:
+ """
+ Returns:
+ True if the current process is the main one.
+ """
+ return get_global_rank() == 0
+
+
+def _restrict_print_to_main_process() -> None:
+ """
+ This function disables printing when not in the main process
+ """
+ import builtins as __builtin__
+
+ builtin_print = __builtin__.print
+
+ def print(*args, **kwargs):
+ force = kwargs.pop("force", False)
+ if is_main_process() or force:
+ builtin_print(*args, **kwargs)
+
+ __builtin__.print = print
+
+
+def _get_master_port(seed: int = 0) -> int:
+ MIN_MASTER_PORT, MAX_MASTER_PORT = (20_000, 60_000)
+
+ master_port_str = os.environ.get("MASTER_PORT")
+ if master_port_str is None:
+ rng = random.Random(seed)
+ return rng.randint(MIN_MASTER_PORT, MAX_MASTER_PORT)
+
+ return int(master_port_str)
+
+
+def _get_available_port() -> int:
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ # A "" host address means INADDR_ANY i.e. binding to all interfaces.
+ # Note this is not compatible with IPv6.
+ s.bind(("", 0))
+ port = s.getsockname()[1]
+ return port
+
+
+_TORCH_DISTRIBUTED_ENV_VARS = (
+ "MASTER_ADDR",
+ "MASTER_PORT",
+ "RANK",
+ "WORLD_SIZE",
+ "LOCAL_RANK",
+ "LOCAL_WORLD_SIZE",
+)
+
+
+def _collect_env_vars() -> Dict[str, str]:
+ return {env_var: os.environ[env_var] for env_var in _TORCH_DISTRIBUTED_ENV_VARS if env_var in os.environ}
+
+
+def _is_slurm_job_process() -> bool:
+ return "SLURM_JOB_ID" in os.environ
+
+
+def _parse_slurm_node_list(s: str) -> List[str]:
+ nodes = []
+ # Extract "hostname", "hostname[1-2,3,4-5]," substrings
+ p = re.compile(r"(([^\[]+)(?:\[([^\]]+)\])?),?")
+ for m in p.finditer(s):
+ prefix, suffixes = s[m.start(2) : m.end(2)], s[m.start(3) : m.end(3)]
+ for suffix in suffixes.split(","):
+ span = suffix.split("-")
+ if len(span) == 1:
+ nodes.append(prefix + suffix)
+ else:
+ width = len(span[0])
+ start, end = int(span[0]), int(span[1]) + 1
+ nodes.extend([prefix + f"{i:0{width}}" for i in range(start, end)])
+ return nodes
+
+
+def _check_env_variable(key: str, new_value: str):
+ # Only check for difference with preset environment variables
+ if key in os.environ and os.environ[key] != new_value:
+ raise RuntimeError(f"Cannot export environment variables as {key} is already set")
+
+
+class _TorchDistributedEnvironment:
+ def __init__(self):
+ self.master_addr = "127.0.0.1"
+ self.master_port = 0
+ self.rank = -1
+ self.world_size = -1
+ self.local_rank = -1
+ self.local_world_size = -1
+
+ if _is_slurm_job_process():
+ return self._set_from_slurm_env()
+
+ env_vars = _collect_env_vars()
+ if not env_vars:
+ # Environment is not set
+ pass
+ elif len(env_vars) == len(_TORCH_DISTRIBUTED_ENV_VARS):
+ # Environment is fully set
+ return self._set_from_preset_env()
+ else:
+ # Environment is partially set
+ collected_env_vars = ", ".join(env_vars.keys())
+ raise RuntimeError(f"Partially set environment: {collected_env_vars}")
+
+ if torch.cuda.device_count() > 0:
+ return self._set_from_local()
+
+ raise RuntimeError("Can't initialize PyTorch distributed environment")
+
+    # Slurm job created with sbatch, submitit, etc.
+ def _set_from_slurm_env(self):
+ # logger.info("Initialization from Slurm environment")
+ job_id = int(os.environ["SLURM_JOB_ID"])
+ node_count = int(os.environ["SLURM_JOB_NUM_NODES"])
+ nodes = _parse_slurm_node_list(os.environ["SLURM_JOB_NODELIST"])
+ assert len(nodes) == node_count
+
+ self.master_addr = nodes[0]
+ self.master_port = _get_master_port(seed=job_id)
+ self.rank = int(os.environ["SLURM_PROCID"])
+ self.world_size = int(os.environ["SLURM_NTASKS"])
+ assert self.rank < self.world_size
+ self.local_rank = int(os.environ["SLURM_LOCALID"])
+ self.local_world_size = self.world_size // node_count
+ assert self.local_rank < self.local_world_size
+
+ # Single node job with preset environment (i.e. torchrun)
+ def _set_from_preset_env(self):
+ # logger.info("Initialization from preset environment")
+ self.master_addr = os.environ["MASTER_ADDR"]
+ self.master_port = os.environ["MASTER_PORT"]
+ self.rank = int(os.environ["RANK"])
+ self.world_size = int(os.environ["WORLD_SIZE"])
+ assert self.rank < self.world_size
+ self.local_rank = int(os.environ["LOCAL_RANK"])
+ self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"])
+ assert self.local_rank < self.local_world_size
+
+ # Single node and GPU job (i.e. local script run)
+ def _set_from_local(self):
+ # logger.info("Initialization from local")
+ self.master_addr = "127.0.0.1"
+ self.master_port = _get_available_port()
+ self.rank = 0
+ self.world_size = 1
+ self.local_rank = 0
+ self.local_world_size = 1
+
+ def export(self, *, overwrite: bool) -> "_TorchDistributedEnvironment":
+ # See the "Environment variable initialization" section from
+ # https://pytorch.org/docs/stable/distributed.html for the complete list of
+ # environment variables required for the env:// initialization method.
+ env_vars = {
+ "MASTER_ADDR": self.master_addr,
+ "MASTER_PORT": str(self.master_port),
+ "RANK": str(self.rank),
+ "WORLD_SIZE": str(self.world_size),
+ "LOCAL_RANK": str(self.local_rank),
+ "LOCAL_WORLD_SIZE": str(self.local_world_size),
+ }
+ if not overwrite:
+ for k, v in env_vars.items():
+ _check_env_variable(k, v)
+
+ os.environ.update(env_vars)
+ return self
+
+
+def enable(*, set_cuda_current_device: bool = True, overwrite: bool = False, allow_nccl_timeout: bool = False):
+ """Enable distributed mode
+
+ Args:
+ set_cuda_current_device: If True, call torch.cuda.set_device() to set the
+ current PyTorch CUDA device to the one matching the local rank.
+ overwrite: If True, overwrites already set variables. Else fails.
+ """
+
+ global _LOCAL_RANK, _LOCAL_WORLD_SIZE
+ if _LOCAL_RANK >= 0 or _LOCAL_WORLD_SIZE >= 0:
+ raise RuntimeError("Distributed mode has already been enabled")
+ torch_env = _TorchDistributedEnvironment()
+ torch_env.export(overwrite=overwrite)
+
+ if set_cuda_current_device:
+ torch.cuda.set_device(torch_env.local_rank)
+
+ if allow_nccl_timeout:
+        # This allows the torch.distributed timeout to be used with the NCCL backend
+ key, value = "NCCL_ASYNC_ERROR_HANDLING", "1"
+ if not overwrite:
+ _check_env_variable(key, value)
+ os.environ[key] = value
+
+ dist.init_process_group(backend="nccl")
+ dist.barrier()
+
+ # Finalize setup
+ _LOCAL_RANK = torch_env.local_rank
+ _LOCAL_WORLD_SIZE = torch_env.local_world_size
+ _restrict_print_to_main_process()
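+
+
+# Usage sketch (illustration only; this mirrors how the eval scripts in this vendored
+# tree consume the module, it is not an additional API defined here):
+#
+#   import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+#
+#   distributed.enable(overwrite=True)   # export env vars, pick the CUDA device, init the NCCL group
+#   if distributed.is_main_process():
+#       ...                              # rank-0-only work such as writing metrics files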
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4196294309799347172dba54a17360698071ca8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1dbab9ff817bf202c8049ffd4c6810673e378d7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py
@@ -0,0 +1,405 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from functools import partial
+import json
+import logging
+import os
+import sys
+from typing import List, Optional
+
+import torch
+from torch.nn.functional import one_hot, softmax
+
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data import SamplerType, make_data_loader, make_dataset
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data.transforms import make_classification_eval_transform
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.metrics import AccuracyAveraging, build_topk_accuracy_metric
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.setup import get_args_parser as get_setup_args_parser
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.setup import setup_and_build_model
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.utils import ModelWithNormalize, evaluate, extract_features
+
+
+logger = logging.getLogger("dinov2")
+
+
+def get_args_parser(
+ description: Optional[str] = None,
+ parents: Optional[List[argparse.ArgumentParser]] = None,
+ add_help: bool = True,
+):
+ parents = parents or []
+ setup_args_parser = get_setup_args_parser(parents=parents, add_help=False)
+ parents = [setup_args_parser]
+ parser = argparse.ArgumentParser(
+ description=description,
+ parents=parents,
+ add_help=add_help,
+ )
+ parser.add_argument(
+ "--train-dataset",
+ dest="train_dataset_str",
+ type=str,
+ help="Training dataset",
+ )
+ parser.add_argument(
+ "--val-dataset",
+ dest="val_dataset_str",
+ type=str,
+ help="Validation dataset",
+ )
+ parser.add_argument(
+ "--nb_knn",
+ nargs="+",
+ type=int,
+        help="Number of nearest neighbors to use. 20 usually works best.",
+ )
+ parser.add_argument(
+ "--temperature",
+ type=float,
+ help="Temperature used in the voting coefficient",
+ )
+ parser.add_argument(
+ "--gather-on-cpu",
+ action="store_true",
+        help="Whether to gather the train features on cpu, slower "
+        "but useful to avoid OOM for large datasets (e.g. ImageNet22k).",
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ help="Batch size.",
+ )
+ parser.add_argument(
+ "--n-per-class-list",
+ nargs="+",
+ type=int,
+        help="Number of samples to take per class",
+ )
+ parser.add_argument(
+ "--n-tries",
+ type=int,
+ help="Number of tries",
+ )
+ parser.set_defaults(
+ train_dataset_str="ImageNet:split=TRAIN",
+ val_dataset_str="ImageNet:split=VAL",
+ nb_knn=[10, 20, 100, 200],
+ temperature=0.07,
+ batch_size=256,
+ n_per_class_list=[-1],
+ n_tries=1,
+ )
+ return parser
+
+
+class KnnModule(torch.nn.Module):
+    """
+    Computes the k-NN of test features against train features sharded across all processes.
+
+    Each rank holds a chunk of the train features as well as a chunk of the test features.
+    In `compute_neighbors`, each rank in turn broadcasts its chunk of test features to all
+    devices, partial k-NNs are computed against every chunk of train features, and the
+    results are gathered back and re-ranked on the originating device.
+    """
+
+ def __init__(self, train_features, train_labels, nb_knn, T, device, num_classes=1000):
+ super().__init__()
+
+ self.global_rank = distributed.get_global_rank()
+ self.global_size = distributed.get_global_size()
+
+ self.device = device
+ self.train_features_rank_T = train_features.chunk(self.global_size)[self.global_rank].T.to(self.device)
+ self.candidates = train_labels.chunk(self.global_size)[self.global_rank].view(1, -1).to(self.device)
+
+ self.nb_knn = nb_knn
+ self.max_k = max(self.nb_knn)
+ self.T = T
+ self.num_classes = num_classes
+
+ def _get_knn_sims_and_labels(self, similarity, train_labels):
+ topk_sims, indices = similarity.topk(self.max_k, largest=True, sorted=True)
+ neighbors_labels = torch.gather(train_labels, 1, indices)
+ return topk_sims, neighbors_labels
+
+ def _similarity_for_rank(self, features_rank, source_rank):
+ # Send the features from `source_rank` to all ranks
+ broadcast_shape = torch.tensor(features_rank.shape).to(self.device)
+ torch.distributed.broadcast(broadcast_shape, source_rank)
+
+ broadcasted = features_rank
+ if self.global_rank != source_rank:
+ broadcasted = torch.zeros(*broadcast_shape, dtype=features_rank.dtype, device=self.device)
+ torch.distributed.broadcast(broadcasted, source_rank)
+
+ # Compute the neighbors for `source_rank` among `train_features_rank_T`
+ similarity_rank = torch.mm(broadcasted, self.train_features_rank_T)
+ candidate_labels = self.candidates.expand(len(similarity_rank), -1)
+ return self._get_knn_sims_and_labels(similarity_rank, candidate_labels)
+
+ def _gather_all_knn_for_rank(self, topk_sims, neighbors_labels, target_rank):
+ # Gather all neighbors for `target_rank`
+ topk_sims_rank = retrieved_rank = None
+ if self.global_rank == target_rank:
+ topk_sims_rank = [torch.zeros_like(topk_sims) for _ in range(self.global_size)]
+ retrieved_rank = [torch.zeros_like(neighbors_labels) for _ in range(self.global_size)]
+
+ torch.distributed.gather(topk_sims, topk_sims_rank, dst=target_rank)
+ torch.distributed.gather(neighbors_labels, retrieved_rank, dst=target_rank)
+
+ if self.global_rank == target_rank:
+ # Perform a second top-k on the k * global_size retrieved neighbors
+ topk_sims_rank = torch.cat(topk_sims_rank, dim=1)
+ retrieved_rank = torch.cat(retrieved_rank, dim=1)
+ results = self._get_knn_sims_and_labels(topk_sims_rank, retrieved_rank)
+ return results
+ return None
+
+ def compute_neighbors(self, features_rank):
+ for rank in range(self.global_size):
+ topk_sims, neighbors_labels = self._similarity_for_rank(features_rank, rank)
+ results = self._gather_all_knn_for_rank(topk_sims, neighbors_labels, rank)
+ if results is not None:
+ topk_sims_rank, neighbors_labels_rank = results
+ return topk_sims_rank, neighbors_labels_rank
+
+ def forward(self, features_rank):
+ """
+        Compute the results for all values of `self.nb_knn` neighbors from the full `self.max_k`
+ """
+ assert all(k <= self.max_k for k in self.nb_knn)
+
+ topk_sims, neighbors_labels = self.compute_neighbors(features_rank)
+ batch_size = neighbors_labels.shape[0]
+ topk_sims_transform = softmax(topk_sims / self.T, 1)
+ matmul = torch.mul(
+ one_hot(neighbors_labels, num_classes=self.num_classes),
+ topk_sims_transform.view(batch_size, -1, 1),
+ )
+ probas_for_k = {k: torch.sum(matmul[:, :k, :], 1) for k in self.nb_knn}
+ return probas_for_k
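+
+    # Shape walk-through (illustrative numbers only): with global_size=2 and max_k=200,
+    # each broadcast test chunk of shape [B, D] yields [B, n_train/2] similarities per
+    # rank; every rank keeps its local top-200, and the gather on the originating rank
+    # re-ranks the 2 * 200 candidates before `forward` turns them into per-k probabilities.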
+
+
+class DictKeysModule(torch.nn.Module):
+ def __init__(self, keys):
+ super().__init__()
+ self.keys = keys
+
+ def forward(self, features_dict, targets):
+ for k in self.keys:
+ features_dict = features_dict[k]
+ return {"preds": features_dict, "target": targets}
+
+
+def create_module_dict(*, module, n_per_class_list, n_tries, nb_knn, train_features, train_labels):
+ modules = {}
+ mapping = create_class_indices_mapping(train_labels)
+ for npc in n_per_class_list:
+ if npc < 0: # Only one try needed when using the full data
+ full_module = module(
+ train_features=train_features,
+ train_labels=train_labels,
+ nb_knn=nb_knn,
+ )
+ modules["full"] = ModuleDictWithForward({"1": full_module})
+ continue
+ all_tries = {}
+ for t in range(n_tries):
+ final_indices = filter_train(mapping, npc, seed=t)
+ k_list = list(set(nb_knn + [npc]))
+ k_list = sorted([el for el in k_list if el <= npc])
+ all_tries[str(t)] = module(
+ train_features=train_features[final_indices],
+ train_labels=train_labels[final_indices],
+ nb_knn=k_list,
+ )
+ modules[f"{npc} per class"] = ModuleDictWithForward(all_tries)
+
+ return ModuleDictWithForward(modules)
+
+
+def filter_train(mapping, n_per_class, seed):
+ torch.manual_seed(seed)
+ final_indices = []
+ for k in mapping.keys():
+ index = torch.randperm(len(mapping[k]))[:n_per_class]
+ final_indices.append(mapping[k][index])
+ return torch.cat(final_indices).squeeze()
+
+
+def create_class_indices_mapping(labels):
+ unique_labels, inverse = torch.unique(labels, return_inverse=True)
+ mapping = {unique_labels[i]: (inverse == i).nonzero() for i in range(len(unique_labels))}
+ return mapping
+
+
+class ModuleDictWithForward(torch.nn.ModuleDict):
+ def forward(self, *args, **kwargs):
+ return {k: module(*args, **kwargs) for k, module in self._modules.items()}
+
+
+def eval_knn(
+ model,
+ train_dataset,
+ val_dataset,
+ accuracy_averaging,
+ nb_knn,
+ temperature,
+ batch_size,
+ num_workers,
+ gather_on_cpu,
+ n_per_class_list=[-1],
+ n_tries=1,
+):
+ model = ModelWithNormalize(model)
+
+ logger.info("Extracting features for train set...")
+ train_features, train_labels = extract_features(
+ model, train_dataset, batch_size, num_workers, gather_on_cpu=gather_on_cpu
+ )
+ logger.info(f"Train features created, shape {train_features.shape}.")
+
+ val_dataloader = make_data_loader(
+ dataset=val_dataset,
+ batch_size=batch_size,
+ num_workers=num_workers,
+ sampler_type=SamplerType.DISTRIBUTED,
+ drop_last=False,
+ shuffle=False,
+ persistent_workers=True,
+ )
+ num_classes = train_labels.max() + 1
+ metric_collection = build_topk_accuracy_metric(accuracy_averaging, num_classes=num_classes)
+
+ device = torch.cuda.current_device()
+ partial_module = partial(KnnModule, T=temperature, device=device, num_classes=num_classes)
+ knn_module_dict = create_module_dict(
+ module=partial_module,
+ n_per_class_list=n_per_class_list,
+ n_tries=n_tries,
+ nb_knn=nb_knn,
+ train_features=train_features,
+ train_labels=train_labels,
+ )
+ postprocessors, metrics = {}, {}
+ for n_per_class, knn_module in knn_module_dict.items():
+ for t, knn_try in knn_module.items():
+ postprocessors = {
+ **postprocessors,
+ **{(n_per_class, t, k): DictKeysModule([n_per_class, t, k]) for k in knn_try.nb_knn},
+ }
+ metrics = {**metrics, **{(n_per_class, t, k): metric_collection.clone() for k in knn_try.nb_knn}}
+ model_with_knn = torch.nn.Sequential(model, knn_module_dict)
+
+ # ============ evaluation ... ============
+ logger.info("Start the k-NN classification.")
+ _, results_dict = evaluate(model_with_knn, val_dataloader, postprocessors, metrics, device)
+
+ # Averaging the results over the n tries for each value of n_per_class
+ for n_per_class, knn_module in knn_module_dict.items():
+ first_try = list(knn_module.keys())[0]
+ k_list = knn_module[first_try].nb_knn
+ for k in k_list:
+ keys = results_dict[(n_per_class, first_try, k)].keys() # keys are e.g. `top-1` and `top-5`
+ results_dict[(n_per_class, k)] = {
+ key: torch.mean(torch.stack([results_dict[(n_per_class, t, k)][key] for t in knn_module.keys()]))
+ for key in keys
+ }
+ for t in knn_module.keys():
+ del results_dict[(n_per_class, t, k)]
+
+ return results_dict
+
+
+def eval_knn_with_model(
+ model,
+ output_dir,
+ train_dataset_str="ImageNet:split=TRAIN",
+ val_dataset_str="ImageNet:split=VAL",
+ nb_knn=(10, 20, 100, 200),
+ temperature=0.07,
+ autocast_dtype=torch.float,
+ accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY,
+ transform=None,
+ gather_on_cpu=False,
+ batch_size=256,
+ num_workers=5,
+ n_per_class_list=[-1],
+ n_tries=1,
+):
+ transform = transform or make_classification_eval_transform()
+
+ train_dataset = make_dataset(
+ dataset_str=train_dataset_str,
+ transform=transform,
+ )
+ val_dataset = make_dataset(
+ dataset_str=val_dataset_str,
+ transform=transform,
+ )
+
+ with torch.cuda.amp.autocast(dtype=autocast_dtype):
+ results_dict_knn = eval_knn(
+ model=model,
+ train_dataset=train_dataset,
+ val_dataset=val_dataset,
+ accuracy_averaging=accuracy_averaging,
+ nb_knn=nb_knn,
+ temperature=temperature,
+ batch_size=batch_size,
+ num_workers=num_workers,
+ gather_on_cpu=gather_on_cpu,
+ n_per_class_list=n_per_class_list,
+ n_tries=n_tries,
+ )
+
+ results_dict = {}
+ if distributed.is_main_process():
+ for knn_ in results_dict_knn.keys():
+ top1 = results_dict_knn[knn_]["top-1"].item() * 100.0
+ top5 = results_dict_knn[knn_]["top-5"].item() * 100.0
+ results_dict[f"{knn_} Top 1"] = top1
+ results_dict[f"{knn_} Top 5"] = top5
+ logger.info(f"{knn_} classifier result: Top1: {top1:.2f} Top5: {top5:.2f}")
+
+ metrics_file_path = os.path.join(output_dir, "results_eval_knn.json")
+ with open(metrics_file_path, "a") as f:
+ for k, v in results_dict.items():
+ f.write(json.dumps({k: v}) + "\n")
+
+ if distributed.is_enabled():
+ torch.distributed.barrier()
+ return results_dict
+
+
+def main(args):
+ model, autocast_dtype = setup_and_build_model(args)
+ eval_knn_with_model(
+ model=model,
+ output_dir=args.output_dir,
+ train_dataset_str=args.train_dataset_str,
+ val_dataset_str=args.val_dataset_str,
+ nb_knn=args.nb_knn,
+ temperature=args.temperature,
+ autocast_dtype=autocast_dtype,
+ accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY,
+ transform=None,
+ gather_on_cpu=args.gather_on_cpu,
+ batch_size=args.batch_size,
+ num_workers=5,
+ n_per_class_list=args.n_per_class_list,
+ n_tries=args.n_tries,
+ )
+ return 0
+
+
+if __name__ == "__main__":
+ description = "DINOv2 k-NN evaluation"
+ args_parser = get_args_parser(description=description)
+ args = args_parser.parse_args()
+ sys.exit(main(args))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4d90fd89f3efa72f77ea0ec4e3c0cc9551227f8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py
@@ -0,0 +1,626 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from functools import partial
+import json
+import logging
+import os
+import sys
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.parallel import DistributedDataParallel
+from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data import SamplerType, make_data_loader, make_dataset
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data.transforms import make_classification_eval_transform, make_classification_train_transform
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.metrics import MetricType, build_metric
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.setup import get_args_parser as get_setup_args_parser
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.setup import setup_and_build_model
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.utils import ModelWithIntermediateLayers, evaluate
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import MetricLogger
+
+
+logger = logging.getLogger("dinov2")
+
+
+def get_args_parser(
+ description: Optional[str] = None,
+ parents: Optional[List[argparse.ArgumentParser]] = None,
+ add_help: bool = True,
+):
+ parents = parents or []
+ setup_args_parser = get_setup_args_parser(parents=parents, add_help=False)
+ parents = [setup_args_parser]
+ parser = argparse.ArgumentParser(
+ description=description,
+ parents=parents,
+ add_help=add_help,
+ )
+ parser.add_argument(
+ "--train-dataset",
+ dest="train_dataset_str",
+ type=str,
+ help="Training dataset",
+ )
+ parser.add_argument(
+ "--val-dataset",
+ dest="val_dataset_str",
+ type=str,
+ help="Validation dataset",
+ )
+ parser.add_argument(
+ "--test-datasets",
+ dest="test_dataset_strs",
+ type=str,
+ nargs="+",
+        help="Test datasets; if omitted, the validation dataset is reused",
+ )
+ parser.add_argument(
+ "--epochs",
+ type=int,
+ help="Number of training epochs",
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ help="Batch Size (per GPU)",
+ )
+ parser.add_argument(
+ "--num-workers",
+ type=int,
+        help="Number of workers",
+ )
+ parser.add_argument(
+ "--epoch-length",
+ type=int,
+ help="Length of an epoch in number of iterations",
+ )
+ parser.add_argument(
+ "--save-checkpoint-frequency",
+ type=int,
+ help="Number of epochs between two named checkpoint saves.",
+ )
+ parser.add_argument(
+ "--eval-period-iterations",
+ type=int,
+ help="Number of iterations between two evaluations.",
+ )
+ parser.add_argument(
+ "--learning-rates",
+ nargs="+",
+ type=float,
+ help="Learning rates to grid search.",
+ )
+ parser.add_argument(
+ "--no-resume",
+ action="store_true",
+        help="Do not resume from existing checkpoints",
+ )
+ parser.add_argument(
+ "--val-metric-type",
+ type=MetricType,
+ choices=list(MetricType),
+ help="Validation metric",
+ )
+ parser.add_argument(
+ "--test-metric-types",
+ type=MetricType,
+ choices=list(MetricType),
+ nargs="+",
+ help="Evaluation metric",
+ )
+ parser.add_argument(
+ "--classifier-fpath",
+ type=str,
+ help="Path to a file containing pretrained linear classifiers",
+ )
+ parser.add_argument(
+ "--val-class-mapping-fpath",
+ type=str,
+ help="Path to a file containing a mapping to adjust classifier outputs",
+ )
+ parser.add_argument(
+ "--test-class-mapping-fpaths",
+ nargs="+",
+ type=str,
+        help="Paths to files containing mappings to adjust classifier outputs",
+ )
+ parser.set_defaults(
+ train_dataset_str="ImageNet:split=TRAIN",
+ val_dataset_str="ImageNet:split=VAL",
+ test_dataset_strs=None,
+ epochs=10,
+ batch_size=128,
+ num_workers=8,
+ epoch_length=1250,
+ save_checkpoint_frequency=20,
+ eval_period_iterations=1250,
+ learning_rates=[1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 0.1],
+ val_metric_type=MetricType.MEAN_ACCURACY,
+ test_metric_types=None,
+ classifier_fpath=None,
+ val_class_mapping_fpath=None,
+ test_class_mapping_fpaths=[None],
+ )
+ return parser
+
+
+def has_ddp_wrapper(m: nn.Module) -> bool:
+ return isinstance(m, DistributedDataParallel)
+
+
+def remove_ddp_wrapper(m: nn.Module) -> nn.Module:
+ return m.module if has_ddp_wrapper(m) else m
+
+
+def _pad_and_collate(batch):
+ maxlen = max(len(targets) for image, targets in batch)
+ padded_batch = [
+ (image, np.pad(targets, (0, maxlen - len(targets)), constant_values=-1)) for image, targets in batch
+ ]
+ return torch.utils.data.default_collate(padded_batch)
+
+
+def create_linear_input(x_tokens_list, use_n_blocks, use_avgpool):
+ intermediate_output = x_tokens_list[-use_n_blocks:]
+ output = torch.cat([class_token for _, class_token in intermediate_output], dim=-1)
+ if use_avgpool:
+ output = torch.cat(
+ (
+ output,
+ torch.mean(intermediate_output[-1][0], dim=1), # patch tokens
+ ),
+ dim=-1,
+ )
+ output = output.reshape(output.shape[0], -1)
+ return output.float()
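+
+# Dimensionality sketch (numbers are illustrative assumptions, not read from any config):
+# with a backbone embedding dim D = 1024, use_n_blocks=4 and use_avgpool=True, the linear
+# head input is 4 * 1024 class-token features + 1024 average-pooled patch features = 5120.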
+
+
+class LinearClassifier(nn.Module):
+ """Linear layer to train on top of frozen features"""
+
+ def __init__(self, out_dim, use_n_blocks, use_avgpool, num_classes=1000):
+ super().__init__()
+ self.out_dim = out_dim
+ self.use_n_blocks = use_n_blocks
+ self.use_avgpool = use_avgpool
+ self.num_classes = num_classes
+ self.linear = nn.Linear(out_dim, num_classes)
+ self.linear.weight.data.normal_(mean=0.0, std=0.01)
+ self.linear.bias.data.zero_()
+
+ def forward(self, x_tokens_list):
+ output = create_linear_input(x_tokens_list, self.use_n_blocks, self.use_avgpool)
+ return self.linear(output)
+
+
+class AllClassifiers(nn.Module):
+ def __init__(self, classifiers_dict):
+ super().__init__()
+ self.classifiers_dict = nn.ModuleDict()
+ self.classifiers_dict.update(classifiers_dict)
+
+ def forward(self, inputs):
+ return {k: v.forward(inputs) for k, v in self.classifiers_dict.items()}
+
+ def __len__(self):
+ return len(self.classifiers_dict)
+
+
+class LinearPostprocessor(nn.Module):
+ def __init__(self, linear_classifier, class_mapping=None):
+ super().__init__()
+ self.linear_classifier = linear_classifier
+ self.register_buffer("class_mapping", None if class_mapping is None else torch.LongTensor(class_mapping))
+
+ def forward(self, samples, targets):
+ preds = self.linear_classifier(samples)
+ return {
+ "preds": preds[:, self.class_mapping] if self.class_mapping is not None else preds,
+ "target": targets,
+ }
+
+
+def scale_lr(learning_rates, batch_size):
+ return learning_rates * (batch_size * distributed.get_global_size()) / 256.0
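+
+# Worked example of the linear scaling rule above (illustrative values): a base lr of 1e-3
+# with batch_size=128 on 8 processes becomes 1e-3 * (128 * 8) / 256 = 4e-3.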
+
+
+def setup_linear_classifiers(sample_output, n_last_blocks_list, learning_rates, batch_size, num_classes=1000):
+ linear_classifiers_dict = nn.ModuleDict()
+ optim_param_groups = []
+ for n in n_last_blocks_list:
+ for avgpool in [False, True]:
+ for _lr in learning_rates:
+ lr = scale_lr(_lr, batch_size)
+ out_dim = create_linear_input(sample_output, use_n_blocks=n, use_avgpool=avgpool).shape[1]
+ linear_classifier = LinearClassifier(
+ out_dim, use_n_blocks=n, use_avgpool=avgpool, num_classes=num_classes
+ )
+ linear_classifier = linear_classifier.cuda()
+ linear_classifiers_dict[
+ f"classifier_{n}_blocks_avgpool_{avgpool}_lr_{lr:.5f}".replace(".", "_")
+ ] = linear_classifier
+ optim_param_groups.append({"params": linear_classifier.parameters(), "lr": lr})
+
+ linear_classifiers = AllClassifiers(linear_classifiers_dict)
+ if distributed.is_enabled():
+ linear_classifiers = nn.parallel.DistributedDataParallel(linear_classifiers)
+
+ return linear_classifiers, optim_param_groups
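+
+# Grid-size note: with the default n_last_blocks_list=[1, 4] used below, both avgpool
+# settings and the 13 default learning rates, this builds 2 * 2 * 13 = 52 linear
+# classifiers that are trained jointly on the same frozen features.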
+
+
+@torch.no_grad()
+def evaluate_linear_classifiers(
+ feature_model,
+ linear_classifiers,
+ data_loader,
+ metric_type,
+ metrics_file_path,
+ training_num_classes,
+ iteration,
+ prefixstring="",
+ class_mapping=None,
+ best_classifier_on_val=None,
+):
+    logger.info("running validation!")
+
+ num_classes = len(class_mapping) if class_mapping is not None else training_num_classes
+ metric = build_metric(metric_type, num_classes=num_classes)
+ postprocessors = {k: LinearPostprocessor(v, class_mapping) for k, v in linear_classifiers.classifiers_dict.items()}
+ metrics = {k: metric.clone() for k in linear_classifiers.classifiers_dict}
+
+ _, results_dict_temp = evaluate(
+ feature_model,
+ data_loader,
+ postprocessors,
+ metrics,
+ torch.cuda.current_device(),
+ )
+
+ logger.info("")
+ results_dict = {}
+ max_accuracy = 0
+ best_classifier = ""
+ for i, (classifier_string, metric) in enumerate(results_dict_temp.items()):
+ logger.info(f"{prefixstring} -- Classifier: {classifier_string} * {metric}")
+ if (
+ best_classifier_on_val is None and metric["top-1"].item() > max_accuracy
+ ) or classifier_string == best_classifier_on_val:
+ max_accuracy = metric["top-1"].item()
+ best_classifier = classifier_string
+
+ results_dict["best_classifier"] = {"name": best_classifier, "accuracy": max_accuracy}
+
+ logger.info(f"best classifier: {results_dict['best_classifier']}")
+
+ if distributed.is_main_process():
+ with open(metrics_file_path, "a") as f:
+ f.write(f"iter: {iteration}\n")
+ for k, v in results_dict.items():
+ f.write(json.dumps({k: v}) + "\n")
+ f.write("\n")
+
+ return results_dict
+
+
+def eval_linear(
+ *,
+ feature_model,
+ linear_classifiers,
+ train_data_loader,
+ val_data_loader,
+ metrics_file_path,
+ optimizer,
+ scheduler,
+ output_dir,
+ max_iter,
+ checkpoint_period, # In number of iter, creates a new file every period
+ running_checkpoint_period, # Period to update main checkpoint file
+ eval_period,
+ metric_type,
+ training_num_classes,
+ resume=True,
+ classifier_fpath=None,
+ val_class_mapping=None,
+):
+ checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler)
+ start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1
+
+ periodic_checkpointer = PeriodicCheckpointer(checkpointer, checkpoint_period, max_iter=max_iter)
+ iteration = start_iter
+ logger.info("Starting training from iteration {}".format(start_iter))
+ metric_logger = MetricLogger(delimiter=" ")
+ header = "Training"
+
+ for data, labels in metric_logger.log_every(
+ train_data_loader,
+ 10,
+ header,
+ max_iter,
+ start_iter,
+ ):
+ data = data.cuda(non_blocking=True)
+ labels = labels.cuda(non_blocking=True)
+
+ features = feature_model(data)
+ outputs = linear_classifiers(features)
+
+ losses = {f"loss_{k}": nn.CrossEntropyLoss()(v, labels) for k, v in outputs.items()}
+ loss = sum(losses.values())
+
+ # compute the gradients
+ optimizer.zero_grad()
+ loss.backward()
+
+ # step
+ optimizer.step()
+ scheduler.step()
+
+ # log
+ if iteration % 10 == 0:
+ torch.cuda.synchronize()
+ metric_logger.update(loss=loss.item())
+ metric_logger.update(lr=optimizer.param_groups[0]["lr"])
+ print("lr", optimizer.param_groups[0]["lr"])
+
+ if iteration - start_iter > 5:
+ if iteration % running_checkpoint_period == 0:
+ torch.cuda.synchronize()
+ if distributed.is_main_process():
+ logger.info("Checkpointing running_checkpoint")
+ periodic_checkpointer.save("running_checkpoint_linear_eval", iteration=iteration)
+ torch.cuda.synchronize()
+ periodic_checkpointer.step(iteration)
+
+ if eval_period > 0 and (iteration + 1) % eval_period == 0 and iteration != max_iter - 1:
+ _ = evaluate_linear_classifiers(
+ feature_model=feature_model,
+ linear_classifiers=remove_ddp_wrapper(linear_classifiers),
+ data_loader=val_data_loader,
+ metrics_file_path=metrics_file_path,
+ prefixstring=f"ITER: {iteration}",
+ metric_type=metric_type,
+ training_num_classes=training_num_classes,
+ iteration=iteration,
+ class_mapping=val_class_mapping,
+ )
+ torch.cuda.synchronize()
+
+ iteration = iteration + 1
+
+ val_results_dict = evaluate_linear_classifiers(
+ feature_model=feature_model,
+ linear_classifiers=remove_ddp_wrapper(linear_classifiers),
+ data_loader=val_data_loader,
+ metrics_file_path=metrics_file_path,
+ metric_type=metric_type,
+ training_num_classes=training_num_classes,
+ iteration=iteration,
+ class_mapping=val_class_mapping,
+ )
+ return val_results_dict, feature_model, linear_classifiers, iteration
+
+
+def make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type):
+ test_dataset = make_dataset(
+ dataset_str=test_dataset_str,
+ transform=make_classification_eval_transform(),
+ )
+ test_data_loader = make_data_loader(
+ dataset=test_dataset,
+ batch_size=batch_size,
+ num_workers=num_workers,
+ sampler_type=SamplerType.DISTRIBUTED,
+ drop_last=False,
+ shuffle=False,
+ persistent_workers=False,
+ collate_fn=_pad_and_collate if metric_type == MetricType.IMAGENET_REAL_ACCURACY else None,
+ )
+ return test_data_loader
+
+
+def test_on_datasets(
+ feature_model,
+ linear_classifiers,
+ test_dataset_strs,
+ batch_size,
+ num_workers,
+ test_metric_types,
+ metrics_file_path,
+ training_num_classes,
+ iteration,
+ best_classifier_on_val,
+ prefixstring="",
+ test_class_mappings=[None],
+):
+ results_dict = {}
+ for test_dataset_str, class_mapping, metric_type in zip(test_dataset_strs, test_class_mappings, test_metric_types):
+ logger.info(f"Testing on {test_dataset_str}")
+ test_data_loader = make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type)
+ dataset_results_dict = evaluate_linear_classifiers(
+ feature_model,
+ remove_ddp_wrapper(linear_classifiers),
+ test_data_loader,
+ metric_type,
+ metrics_file_path,
+ training_num_classes,
+ iteration,
+ prefixstring="",
+ class_mapping=class_mapping,
+ best_classifier_on_val=best_classifier_on_val,
+ )
+ results_dict[f"{test_dataset_str}_accuracy"] = 100.0 * dataset_results_dict["best_classifier"]["accuracy"]
+ return results_dict
+
+
+def run_eval_linear(
+ model,
+ output_dir,
+ train_dataset_str,
+ val_dataset_str,
+ batch_size,
+ epochs,
+ epoch_length,
+ num_workers,
+ save_checkpoint_frequency,
+ eval_period_iterations,
+ learning_rates,
+ autocast_dtype,
+ test_dataset_strs=None,
+ resume=True,
+ classifier_fpath=None,
+ val_class_mapping_fpath=None,
+ test_class_mapping_fpaths=[None],
+ val_metric_type=MetricType.MEAN_ACCURACY,
+ test_metric_types=None,
+):
+ seed = 0
+
+ if test_dataset_strs is None:
+ test_dataset_strs = [val_dataset_str]
+ if test_metric_types is None:
+ test_metric_types = [val_metric_type] * len(test_dataset_strs)
+ else:
+ assert len(test_metric_types) == len(test_dataset_strs)
+ assert len(test_dataset_strs) == len(test_class_mapping_fpaths)
+
+ train_transform = make_classification_train_transform()
+ train_dataset = make_dataset(
+ dataset_str=train_dataset_str,
+ transform=train_transform,
+ )
+ training_num_classes = len(torch.unique(torch.Tensor(train_dataset.get_targets().astype(int))))
+ sampler_type = SamplerType.SHARDED_INFINITE
+ # sampler_type = SamplerType.INFINITE
+
+ n_last_blocks_list = [1, 4]
+ n_last_blocks = max(n_last_blocks_list)
+ autocast_ctx = partial(torch.cuda.amp.autocast, enabled=True, dtype=autocast_dtype)
+ feature_model = ModelWithIntermediateLayers(model, n_last_blocks, autocast_ctx)
+ sample_output = feature_model(train_dataset[0][0].unsqueeze(0).cuda())
+
+ linear_classifiers, optim_param_groups = setup_linear_classifiers(
+ sample_output,
+ n_last_blocks_list,
+ learning_rates,
+ batch_size,
+ training_num_classes,
+ )
+
+ optimizer = torch.optim.SGD(optim_param_groups, momentum=0.9, weight_decay=0)
+ max_iter = epochs * epoch_length
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iter, eta_min=0)
+ checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler)
+ start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1
+ train_data_loader = make_data_loader(
+ dataset=train_dataset,
+ batch_size=batch_size,
+ num_workers=num_workers,
+ shuffle=True,
+ seed=seed,
+ sampler_type=sampler_type,
+ sampler_advance=start_iter,
+ drop_last=True,
+ persistent_workers=True,
+ )
+ val_data_loader = make_eval_data_loader(val_dataset_str, batch_size, num_workers, val_metric_type)
+
+ checkpoint_period = save_checkpoint_frequency * epoch_length
+
+ if val_class_mapping_fpath is not None:
+ logger.info(f"Using class mapping from {val_class_mapping_fpath}")
+ val_class_mapping = np.load(val_class_mapping_fpath)
+ else:
+ val_class_mapping = None
+
+ test_class_mappings = []
+ for class_mapping_fpath in test_class_mapping_fpaths:
+ if class_mapping_fpath is not None and class_mapping_fpath != "None":
+ logger.info(f"Using class mapping from {class_mapping_fpath}")
+ class_mapping = np.load(class_mapping_fpath)
+ else:
+ class_mapping = None
+ test_class_mappings.append(class_mapping)
+
+ metrics_file_path = os.path.join(output_dir, "results_eval_linear.json")
+ val_results_dict, feature_model, linear_classifiers, iteration = eval_linear(
+ feature_model=feature_model,
+ linear_classifiers=linear_classifiers,
+ train_data_loader=train_data_loader,
+ val_data_loader=val_data_loader,
+ metrics_file_path=metrics_file_path,
+ optimizer=optimizer,
+ scheduler=scheduler,
+ output_dir=output_dir,
+ max_iter=max_iter,
+ checkpoint_period=checkpoint_period,
+ running_checkpoint_period=epoch_length,
+ eval_period=eval_period_iterations,
+ metric_type=val_metric_type,
+ training_num_classes=training_num_classes,
+ resume=resume,
+ val_class_mapping=val_class_mapping,
+ classifier_fpath=classifier_fpath,
+ )
+ results_dict = {}
+ if len(test_dataset_strs) > 1 or test_dataset_strs[0] != val_dataset_str:
+ results_dict = test_on_datasets(
+ feature_model,
+ linear_classifiers,
+ test_dataset_strs,
+ batch_size,
+ 0, # num_workers,
+ test_metric_types,
+ metrics_file_path,
+ training_num_classes,
+ iteration,
+ val_results_dict["best_classifier"]["name"],
+ prefixstring="",
+ test_class_mappings=test_class_mappings,
+ )
+ results_dict["best_classifier"] = val_results_dict["best_classifier"]["name"]
+ results_dict[f"{val_dataset_str}_accuracy"] = 100.0 * val_results_dict["best_classifier"]["accuracy"]
+ logger.info("Test Results Dict " + str(results_dict))
+
+ return results_dict
+
+
+def main(args):
+ model, autocast_dtype = setup_and_build_model(args)
+ run_eval_linear(
+ model=model,
+ output_dir=args.output_dir,
+ train_dataset_str=args.train_dataset_str,
+ val_dataset_str=args.val_dataset_str,
+ test_dataset_strs=args.test_dataset_strs,
+ batch_size=args.batch_size,
+ epochs=args.epochs,
+ epoch_length=args.epoch_length,
+ num_workers=args.num_workers,
+ save_checkpoint_frequency=args.save_checkpoint_frequency,
+ eval_period_iterations=args.eval_period_iterations,
+ learning_rates=args.learning_rates,
+ autocast_dtype=autocast_dtype,
+ resume=not args.no_resume,
+ classifier_fpath=args.classifier_fpath,
+ val_metric_type=args.val_metric_type,
+ test_metric_types=args.test_metric_types,
+ val_class_mapping_fpath=args.val_class_mapping_fpath,
+ test_class_mapping_fpaths=args.test_class_mapping_fpaths,
+ )
+ return 0
+
+
+if __name__ == "__main__":
+ description = "DINOv2 linear evaluation"
+ args_parser = get_args_parser(description=description)
+ args = args_parser.parse_args()
+ sys.exit(main(args))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..634c74cee3c66bf2a90dc1e16518f239367937ec
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py
@@ -0,0 +1,445 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import gc
+import logging
+import sys
+import time
+from typing import List, Optional
+
+from cuml.linear_model import LogisticRegression
+import torch
+import torch.backends.cudnn as cudnn
+import torch.distributed
+from torch import nn
+from torch.utils.data import TensorDataset
+from torchmetrics import MetricTracker
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data import make_dataset
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data.transforms import make_classification_eval_transform
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed import get_global_rank, get_global_size
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.metrics import MetricType, build_metric
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.setup import get_args_parser as get_setup_args_parser
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.setup import setup_and_build_model
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.utils import evaluate, extract_features
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.dtype import as_torch_dtype
+
+
+logger = logging.getLogger("dinov2")
+
+DEFAULT_MAX_ITER = 1_000
+C_POWER_RANGE = torch.linspace(-6, 5, 45)
+_CPU_DEVICE = torch.device("cpu")
+
+
+def get_args_parser(
+ description: Optional[str] = None,
+ parents: Optional[List[argparse.ArgumentParser]] = None,
+ add_help: bool = True,
+):
+ parents = parents or []
+ setup_args_parser = get_setup_args_parser(parents=parents, add_help=False)
+ parents = [setup_args_parser]
+ parser = argparse.ArgumentParser(
+ description=description,
+ parents=parents,
+ add_help=add_help,
+ )
+ parser.add_argument(
+ "--train-dataset",
+ dest="train_dataset_str",
+ type=str,
+ help="Training dataset",
+ )
+ parser.add_argument(
+ "--val-dataset",
+ dest="val_dataset_str",
+ type=str,
+ help="Validation dataset",
+ )
+ parser.add_argument(
+ "--finetune-dataset-str",
+ dest="finetune_dataset_str",
+ type=str,
+ help="Fine-tuning dataset",
+ )
+ parser.add_argument(
+ "--finetune-on-val",
+ action="store_true",
+ help="If there is no finetune dataset, whether to choose the "
+ "hyperparameters on the val set instead of 10%% of the train dataset",
+ )
+ parser.add_argument(
+ "--metric-type",
+ type=MetricType,
+ choices=list(MetricType),
+ help="Metric type",
+ )
+ parser.add_argument(
+ "--train-features-device",
+ type=str,
+ help="Device to gather train features (cpu, cuda, cuda:0, etc.), default: %(default)s",
+ )
+ parser.add_argument(
+ "--train-dtype",
+ type=str,
+ help="Data type to convert the train features to (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--max-train-iters",
+ type=int,
+ help="Maximum number of train iterations (default: %(default)s)",
+ )
+ parser.set_defaults(
+ train_dataset_str="ImageNet:split=TRAIN",
+ val_dataset_str="ImageNet:split=VAL",
+ finetune_dataset_str=None,
+ metric_type=MetricType.MEAN_ACCURACY,
+ train_features_device="cpu",
+ train_dtype="float64",
+ max_train_iters=DEFAULT_MAX_ITER,
+ finetune_on_val=False,
+ )
+ return parser
+
+
+class LogRegModule(nn.Module):
+ def __init__(
+ self,
+ C,
+ max_iter=DEFAULT_MAX_ITER,
+ dtype=torch.float64,
+ device=_CPU_DEVICE,
+ ):
+ super().__init__()
+ self.dtype = dtype
+ self.device = device
+ self.estimator = LogisticRegression(
+ penalty="l2",
+ C=C,
+ max_iter=max_iter,
+ output_type="numpy",
+ tol=1e-12,
+ linesearch_max_iter=50,
+ )
+
+ def forward(self, samples, targets):
+ samples_device = samples.device
+ samples = samples.to(dtype=self.dtype, device=self.device)
+ if self.device == _CPU_DEVICE:
+ samples = samples.numpy()
+ probas = self.estimator.predict_proba(samples)
+ return {"preds": torch.from_numpy(probas).to(samples_device), "target": targets}
+
+ def fit(self, train_features, train_labels):
+ train_features = train_features.to(dtype=self.dtype, device=self.device)
+ train_labels = train_labels.to(dtype=self.dtype, device=self.device)
+ if self.device == _CPU_DEVICE:
+ # both cuML and sklearn only work with numpy arrays on CPU
+ train_features = train_features.numpy()
+ train_labels = train_labels.numpy()
+ self.estimator.fit(train_features, train_labels)
+
+
+def evaluate_model(*, logreg_model, logreg_metric, test_data_loader, device):
+ postprocessors = {"metrics": logreg_model}
+ metrics = {"metrics": logreg_metric}
+ return evaluate(nn.Identity(), test_data_loader, postprocessors, metrics, device)
+
+
+def train_for_C(*, C, max_iter, train_features, train_labels, dtype=torch.float64, device=_CPU_DEVICE):
+ logreg_model = LogRegModule(C, max_iter=max_iter, dtype=dtype, device=device)
+ logreg_model.fit(train_features, train_labels)
+ return logreg_model
+
+
+def train_and_evaluate(
+ *,
+ C,
+ max_iter,
+ train_features,
+ train_labels,
+ logreg_metric,
+ test_data_loader,
+ train_dtype=torch.float64,
+ train_features_device,
+ eval_device,
+):
+ logreg_model = train_for_C(
+ C=C,
+ max_iter=max_iter,
+ train_features=train_features,
+ train_labels=train_labels,
+ dtype=train_dtype,
+ device=train_features_device,
+ )
+ return evaluate_model(
+ logreg_model=logreg_model,
+ logreg_metric=logreg_metric,
+ test_data_loader=test_data_loader,
+ device=eval_device,
+ )
+
+
+def sweep_C_values(
+ *,
+ train_features,
+ train_labels,
+ test_data_loader,
+ metric_type,
+ num_classes,
+ train_dtype=torch.float64,
+ train_features_device=_CPU_DEVICE,
+ max_train_iters=DEFAULT_MAX_ITER,
+):
+ if metric_type == MetricType.PER_CLASS_ACCURACY:
+        # If we want to output per-class accuracy, we select the hyperparameters using mean per-class accuracy
+ metric_type = MetricType.MEAN_PER_CLASS_ACCURACY
+ logreg_metric = build_metric(metric_type, num_classes=num_classes)
+ metric_tracker = MetricTracker(logreg_metric, maximize=True)
+ ALL_C = 10**C_POWER_RANGE
+ logreg_models = {}
+
+ train_features = train_features.to(dtype=train_dtype, device=train_features_device)
+ train_labels = train_labels.to(device=train_features_device)
+
+ for i in range(get_global_rank(), len(ALL_C), get_global_size()):
+ C = ALL_C[i].item()
+ logger.info(
+ f"Training for C = {C:.5f}, dtype={train_dtype}, "
+ f"features: {train_features.shape}, {train_features.dtype}, "
+ f"labels: {train_labels.shape}, {train_labels.dtype}"
+ )
+ logreg_models[C] = train_for_C(
+ C=C,
+ max_iter=max_train_iters,
+ train_features=train_features,
+ train_labels=train_labels,
+ dtype=train_dtype,
+ device=train_features_device,
+ )
+
+ gather_list = [None for _ in range(get_global_size())]
+ torch.distributed.all_gather_object(gather_list, logreg_models)
+
+ logreg_models_gathered = {}
+ for logreg_dict in gather_list:
+ logreg_models_gathered.update(logreg_dict)
+
+ for i in range(len(ALL_C)):
+ metric_tracker.increment()
+ C = ALL_C[i].item()
+ evals = evaluate_model(
+ logreg_model=logreg_models_gathered[C],
+ logreg_metric=metric_tracker,
+ test_data_loader=test_data_loader,
+ device=torch.cuda.current_device(),
+ )
+ logger.info(f"Trained for C = {C:.5f}, accuracies = {evals}")
+
+ best_stats, which_epoch = metric_tracker.best_metric(return_step=True)
+ best_stats_100 = {k: 100.0 * v for k, v in best_stats.items()}
+ if which_epoch["top-1"] == i:
+ best_C = C
+ logger.info(f"Sweep best {best_stats_100}, best C = {best_C:.6f}")
+
+ return best_stats, best_C
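+
+# Sweep notes: C_POWER_RANGE holds 45 log-spaced exponents in [-6, 5], so ALL_C covers
+# C in [1e-6, 1e5]. Ranks fit disjoint slices of that grid (stride = world size), the
+# fitted cuML estimators are all-gathered, and every C is then scored on the provided
+# test_data_loader to select best_C.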
+
+
+def eval_log_regression(
+ *,
+ model,
+ train_dataset,
+ val_dataset,
+ finetune_dataset,
+ metric_type,
+ batch_size,
+ num_workers,
+ finetune_on_val=False,
+ train_dtype=torch.float64,
+ train_features_device=_CPU_DEVICE,
+ max_train_iters=DEFAULT_MAX_ITER,
+):
+ """
+ Implements the "standard" process for log regression evaluation:
+ The value of C is chosen by training on train_dataset and evaluating on
+ finetune_dataset. Then, the final model is trained on a concatenation of
+ train_dataset and finetune_dataset, and is evaluated on val_dataset.
+ If there is no finetune_dataset, the value of C is the one that yields
+ the best results on a random 10% subset of the train dataset
+ """
+
+ start = time.time()
+
+ train_features, train_labels = extract_features(
+ model, train_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE)
+ )
+ val_features, val_labels = extract_features(
+ model, val_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE)
+ )
+ val_data_loader = torch.utils.data.DataLoader(
+ TensorDataset(val_features, val_labels),
+ batch_size=batch_size,
+ drop_last=False,
+ num_workers=0,
+ persistent_workers=False,
+ )
+
+ if finetune_dataset is None and finetune_on_val:
+ logger.info("Choosing hyperparameters on the val dataset")
+ finetune_features, finetune_labels = val_features, val_labels
+ elif finetune_dataset is None and not finetune_on_val:
+ logger.info("Choosing hyperparameters on 10% of the train dataset")
+ torch.manual_seed(0)
+ indices = torch.randperm(len(train_features), device=train_features.device)
+ finetune_index = indices[: len(train_features) // 10]
+ train_index = indices[len(train_features) // 10 :]
+ finetune_features, finetune_labels = train_features[finetune_index], train_labels[finetune_index]
+ train_features, train_labels = train_features[train_index], train_labels[train_index]
+ else:
+ logger.info("Choosing hyperparameters on the finetune dataset")
+ finetune_features, finetune_labels = extract_features(
+ model, finetune_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE)
+ )
+ # release the model - free GPU memory
+ del model
+ gc.collect()
+ torch.cuda.empty_cache()
+ finetune_data_loader = torch.utils.data.DataLoader(
+ TensorDataset(finetune_features, finetune_labels),
+ batch_size=batch_size,
+ drop_last=False,
+ )
+
+ if len(train_labels.shape) > 1:
+ num_classes = train_labels.shape[1]
+ else:
+ num_classes = train_labels.max() + 1
+
+ logger.info("Using cuML for logistic regression")
+
+ best_stats, best_C = sweep_C_values(
+ train_features=train_features,
+ train_labels=train_labels,
+ test_data_loader=finetune_data_loader,
+ metric_type=metric_type,
+ num_classes=num_classes,
+ train_dtype=train_dtype,
+ train_features_device=train_features_device,
+ max_train_iters=max_train_iters,
+ )
+
+ if not finetune_on_val:
+ logger.info("Best parameter found, concatenating features")
+ train_features = torch.cat((train_features, finetune_features))
+ train_labels = torch.cat((train_labels, finetune_labels))
+
+ logger.info("Training final model")
+ logreg_metric = build_metric(metric_type, num_classes=num_classes)
+ evals = train_and_evaluate(
+ C=best_C,
+ max_iter=max_train_iters,
+ train_features=train_features,
+ train_labels=train_labels,
+ logreg_metric=logreg_metric.clone(),
+ test_data_loader=val_data_loader,
+ eval_device=torch.cuda.current_device(),
+ train_dtype=train_dtype,
+ train_features_device=train_features_device,
+ )
+
+ best_stats = evals[1]["metrics"]
+
+ best_stats["best_C"] = best_C
+
+ logger.info(f"Log regression evaluation done in {int(time.time() - start)}s")
+ return best_stats
+
+
+def eval_log_regression_with_model(
+ model,
+ train_dataset_str="ImageNet:split=TRAIN",
+ val_dataset_str="ImageNet:split=VAL",
+ finetune_dataset_str=None,
+ autocast_dtype=torch.float,
+ finetune_on_val=False,
+ metric_type=MetricType.MEAN_ACCURACY,
+ train_dtype=torch.float64,
+ train_features_device=_CPU_DEVICE,
+ max_train_iters=DEFAULT_MAX_ITER,
+):
+ cudnn.benchmark = True
+
+ transform = make_classification_eval_transform(resize_size=224)
+ target_transform = None
+
+ train_dataset = make_dataset(dataset_str=train_dataset_str, transform=transform, target_transform=target_transform)
+ val_dataset = make_dataset(dataset_str=val_dataset_str, transform=transform, target_transform=target_transform)
+ if finetune_dataset_str is not None:
+ finetune_dataset = make_dataset(
+ dataset_str=finetune_dataset_str, transform=transform, target_transform=target_transform
+ )
+ else:
+ finetune_dataset = None
+
+ with torch.cuda.amp.autocast(dtype=autocast_dtype):
+ results_dict_logreg = eval_log_regression(
+ model=model,
+ train_dataset=train_dataset,
+ val_dataset=val_dataset,
+ finetune_dataset=finetune_dataset,
+ metric_type=metric_type,
+ batch_size=256,
+ num_workers=0, # 5,
+ finetune_on_val=finetune_on_val,
+ train_dtype=train_dtype,
+ train_features_device=train_features_device,
+ max_train_iters=max_train_iters,
+ )
+
+ results_dict = {
+ "top-1": results_dict_logreg["top-1"].cpu().numpy() * 100.0,
+ "top-5": results_dict_logreg.get("top-5", torch.tensor(0.0)).cpu().numpy() * 100.0,
+ "best_C": results_dict_logreg["best_C"],
+ }
+ logger.info(
+ "\n".join(
+ [
+ "Training of the supervised logistic regression on frozen features completed.\n"
+ "Top-1 test accuracy: {acc:.1f}".format(acc=results_dict["top-1"]),
+ "Top-5 test accuracy: {acc:.1f}".format(acc=results_dict["top-5"]),
+ "obtained for C = {c:.6f}".format(c=results_dict["best_C"]),
+ ]
+ )
+ )
+
+ torch.distributed.barrier()
+ return results_dict
+
+
+def main(args):
+ model, autocast_dtype = setup_and_build_model(args)
+ eval_log_regression_with_model(
+ model=model,
+ train_dataset_str=args.train_dataset_str,
+ val_dataset_str=args.val_dataset_str,
+ finetune_dataset_str=args.finetune_dataset_str,
+ autocast_dtype=autocast_dtype,
+ finetune_on_val=args.finetune_on_val,
+ metric_type=args.metric_type,
+ train_dtype=as_torch_dtype(args.train_dtype),
+ train_features_device=torch.device(args.train_features_device),
+ max_train_iters=args.max_train_iters,
+ )
+ return 0
+
+
+if __name__ == "__main__":
+ description = "DINOv2 logistic regression evaluation"
+ args_parser = get_args_parser(description=description)
+ args = args_parser.parse_args()
+ sys.exit(main(args))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..be928cd775958caec3a3fa4a35769d91eff0ce04
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py
@@ -0,0 +1,114 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from enum import Enum
+import logging
+from typing import Any, Dict, Optional
+
+import torch
+from torch import Tensor
+from torchmetrics import Metric, MetricCollection
+from torchmetrics.classification import MulticlassAccuracy
+from torchmetrics.utilities.data import dim_zero_cat, select_topk
+
+
+logger = logging.getLogger("dinov2")
+
+
+class MetricType(Enum):
+ MEAN_ACCURACY = "mean_accuracy"
+ MEAN_PER_CLASS_ACCURACY = "mean_per_class_accuracy"
+ PER_CLASS_ACCURACY = "per_class_accuracy"
+ IMAGENET_REAL_ACCURACY = "imagenet_real_accuracy"
+
+ @property
+ def accuracy_averaging(self):
+ return getattr(AccuracyAveraging, self.name, None)
+
+ def __str__(self):
+ return self.value
+
+
+class AccuracyAveraging(Enum):
+ MEAN_ACCURACY = "micro"
+ MEAN_PER_CLASS_ACCURACY = "macro"
+ PER_CLASS_ACCURACY = "none"
+
+ def __str__(self):
+ return self.value
+
+
+def build_metric(metric_type: MetricType, *, num_classes: int, ks: Optional[tuple] = None):
+ if metric_type.accuracy_averaging is not None:
+ return build_topk_accuracy_metric(
+ average_type=metric_type.accuracy_averaging,
+ num_classes=num_classes,
+ ks=(1, 5) if ks is None else ks,
+ )
+ elif metric_type == MetricType.IMAGENET_REAL_ACCURACY:
+ return build_topk_imagenet_real_accuracy_metric(
+ num_classes=num_classes,
+ ks=(1, 5) if ks is None else ks,
+ )
+
+ raise ValueError(f"Unknown metric type {metric_type}")
+
+
+def build_topk_accuracy_metric(average_type: AccuracyAveraging, num_classes: int, ks: tuple = (1, 5)):
+ metrics: Dict[str, Metric] = {
+ f"top-{k}": MulticlassAccuracy(top_k=k, num_classes=int(num_classes), average=average_type.value) for k in ks
+ }
+ return MetricCollection(metrics)
+
+
+def build_topk_imagenet_real_accuracy_metric(num_classes: int, ks: tuple = (1, 5)):
+ metrics: Dict[str, Metric] = {f"top-{k}": ImageNetReaLAccuracy(top_k=k, num_classes=int(num_classes)) for k in ks}
+ return MetricCollection(metrics)
+
+
+class ImageNetReaLAccuracy(Metric):
+ is_differentiable: bool = False
+ higher_is_better: Optional[bool] = None
+ full_state_update: bool = False
+
+ def __init__(
+ self,
+ num_classes: int,
+ top_k: int = 1,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(**kwargs)
+ self.num_classes = num_classes
+ self.top_k = top_k
+ self.add_state("tp", [], dist_reduce_fx="cat")
+
+ def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore
+ # preds [B, D]
+ # target [B, A]
+ # preds_oh [B, D] with 0 and 1
+ # select top K highest probabilities, use one hot representation
+ preds_oh = select_topk(preds, self.top_k)
+ # target_oh [B, D + 1] with 0 and 1
+ target_oh = torch.zeros((preds_oh.shape[0], preds_oh.shape[1] + 1), device=target.device, dtype=torch.int32)
+ target = target.long()
+ # for undefined targets (-1) use a fake value `num_classes`
+ target[target == -1] = self.num_classes
+ # fill targets, use one hot representation
+ target_oh.scatter_(1, target, 1)
+ # target_oh [B, D] (remove the fake target at index `num_classes`)
+ target_oh = target_oh[:, :-1]
+ # tp [B] with 0 and 1
+ tp = (preds_oh * target_oh == 1).sum(dim=1)
+ # at least one match between prediction and target
+ tp.clip_(max=1)
+ # ignore instances where no targets are defined
+ mask = target_oh.sum(dim=1) > 0
+ tp = tp[mask]
+ self.tp.append(tp) # type: ignore
+
+ def compute(self) -> Tensor:
+ tp = dim_zero_cat(self.tp) # type: ignore
+ return tp.float().mean()
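+
+
+# Usage sketch (illustrative values only, not exercised anywhere in this file):
+#
+#   metrics = build_metric(MetricType.MEAN_ACCURACY, num_classes=1000)
+#   metrics.update(preds=torch.randn(8, 1000).softmax(dim=-1), target=torch.randint(0, 1000, (8,)))
+#   metrics.compute()  # {"top-1": ..., "top-5": ...}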
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..899970dc3d80195981be2335a7a7eba19ecfdab5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py
@@ -0,0 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from typing import Any, List, Optional, Tuple
+
+import torch
+import torch.backends.cudnn as cudnn
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.models import build_model_from_cfg
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.config import setup
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.utils as dinov2_utils
+
+
+def get_args_parser(
+ description: Optional[str] = None,
+ parents: Optional[List[argparse.ArgumentParser]] = None,
+ add_help: bool = True,
+):
+ parser = argparse.ArgumentParser(
+ description=description,
+ parents=parents or [],
+ add_help=add_help,
+ )
+ parser.add_argument(
+ "--config-file",
+ type=str,
+ help="Model configuration file",
+ )
+ parser.add_argument(
+ "--pretrained-weights",
+ type=str,
+ help="Pretrained model weights",
+ )
+ parser.add_argument(
+ "--output-dir",
+ default="",
+ type=str,
+ help="Output directory to write results and logs",
+ )
+ parser.add_argument(
+ "--opts",
+ help="Extra configuration options",
+ default=[],
+ nargs="+",
+ )
+ return parser
+
+
+def get_autocast_dtype(config):
+ teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype
+ if teacher_dtype_str == "fp16":
+ return torch.half
+ elif teacher_dtype_str == "bf16":
+ return torch.bfloat16
+ else:
+ return torch.float
+
+
+def build_model_for_eval(config, pretrained_weights):
+ model, _ = build_model_from_cfg(config, only_teacher=True)
+ dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher")
+ model.eval()
+ model.cuda()
+ return model
+
+
+def setup_and_build_model(args) -> Tuple[Any, torch.dtype]:
+ cudnn.benchmark = True
+ config = setup(args)
+ model = build_model_for_eval(config, args.pretrained_weights)
+ autocast_dtype = get_autocast_dtype(config)
+ return model, autocast_dtype
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..df070c3e0523d38650ce21c0c969f7a4d15e7bff
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Dict, Optional
+
+import torch
+from torch import nn
+from torchmetrics import MetricCollection
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data import DatasetWithEnumeratedTargets, SamplerType, make_data_loader
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import MetricLogger
+
+
+logger = logging.getLogger("dinov2")
+
+
+class ModelWithNormalize(torch.nn.Module):
+ def __init__(self, model):
+ super().__init__()
+ self.model = model
+
+ def forward(self, samples):
+ return nn.functional.normalize(self.model(samples), dim=1, p=2)
+
+
+class ModelWithIntermediateLayers(nn.Module):
+ def __init__(self, feature_model, n_last_blocks, autocast_ctx):
+ super().__init__()
+ self.feature_model = feature_model
+ self.feature_model.eval()
+ self.n_last_blocks = n_last_blocks
+ self.autocast_ctx = autocast_ctx
+
+ def forward(self, images):
+ with torch.inference_mode():
+ with self.autocast_ctx():
+ features = self.feature_model.get_intermediate_layers(
+ images, self.n_last_blocks, return_class_token=True
+ )
+ return features
+
+
+@torch.inference_mode()
+def evaluate(
+ model: nn.Module,
+ data_loader,
+ postprocessors: Dict[str, nn.Module],
+ metrics: Dict[str, MetricCollection],
+ device: torch.device,
+ criterion: Optional[nn.Module] = None,
+):
+ model.eval()
+ if criterion is not None:
+ criterion.eval()
+
+ for metric in metrics.values():
+ metric = metric.to(device)
+
+ metric_logger = MetricLogger(delimiter=" ")
+ header = "Test:"
+
+ for samples, targets, *_ in metric_logger.log_every(data_loader, 10, header):
+ outputs = model(samples.to(device))
+ targets = targets.to(device)
+
+ if criterion is not None:
+ loss = criterion(outputs, targets)
+ metric_logger.update(loss=loss.item())
+
+ for k, metric in metrics.items():
+ metric_inputs = postprocessors[k](outputs, targets)
+ metric.update(**metric_inputs)
+
+ metric_logger.synchronize_between_processes()
+ logger.info(f"Averaged stats: {metric_logger}")
+
+ stats = {k: metric.compute() for k, metric in metrics.items()}
+ metric_logger_stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+ return metric_logger_stats, stats
+
+
+def all_gather_and_flatten(tensor_rank):
+ tensor_all_ranks = torch.empty(
+ distributed.get_global_size(),
+ *tensor_rank.shape,
+ dtype=tensor_rank.dtype,
+ device=tensor_rank.device,
+ )
+ tensor_list = list(tensor_all_ranks.unbind(0))
+ torch.distributed.all_gather(tensor_list, tensor_rank.contiguous())
+ return tensor_all_ranks.flatten(end_dim=1)
+
+
+def extract_features(model, dataset, batch_size, num_workers, gather_on_cpu=False):
+ dataset_with_enumerated_targets = DatasetWithEnumeratedTargets(dataset)
+ sample_count = len(dataset_with_enumerated_targets)
+ data_loader = make_data_loader(
+ dataset=dataset_with_enumerated_targets,
+ batch_size=batch_size,
+ num_workers=num_workers,
+ sampler_type=SamplerType.DISTRIBUTED,
+ drop_last=False,
+ shuffle=False,
+ )
+ return extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu)
+
+
+@torch.inference_mode()
+def extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu=False):
+ gather_device = torch.device("cpu") if gather_on_cpu else torch.device("cuda")
+ metric_logger = MetricLogger(delimiter=" ")
+ features, all_labels = None, None
+ for samples, (index, labels_rank) in metric_logger.log_every(data_loader, 10):
+ samples = samples.cuda(non_blocking=True)
+ labels_rank = labels_rank.cuda(non_blocking=True)
+ index = index.cuda(non_blocking=True)
+ features_rank = model(samples).float()
+
+ # init storage feature matrix
+ if features is None:
+ features = torch.zeros(sample_count, features_rank.shape[-1], device=gather_device)
+ labels_shape = list(labels_rank.shape)
+ labels_shape[0] = sample_count
+ all_labels = torch.full(labels_shape, fill_value=-1, device=gather_device)
+ logger.info(f"Storing features into tensor of shape {features.shape}")
+
+ # share indexes, features and labels between processes
+ index_all = all_gather_and_flatten(index).to(gather_device)
+ features_all_ranks = all_gather_and_flatten(features_rank).to(gather_device)
+ labels_all_ranks = all_gather_and_flatten(labels_rank).to(gather_device)
+
+ # update storage feature matrix
+ if len(index_all) > 0:
+ features.index_copy_(0, index_all, features_all_ranks)
+ all_labels.index_copy_(0, index_all, labels_all_ranks)
+
+ logger.info(f"Features shape: {tuple(features.shape)}")
+ logger.info(f"Labels shape: {tuple(all_labels.shape)}")
+
+ assert torch.all(all_labels > -1)
+
+ return features, all_labels
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7532e88dbaa6839353ec82be8ac2d0bc77ff9caa
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py
@@ -0,0 +1,158 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Any
+
+import torch
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+from functools import partial
+from fvcore.common.checkpoint import Checkpointer
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import ShardingStrategy
+from torch.distributed.fsdp import MixedPrecision
+from torch.distributed.fsdp import StateDictType
+from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.distributed.fsdp._runtime_utils import _reshard
+
+
+def get_fsdp_wrapper(model_cfg, modules_to_wrap=set()):
+ sharding_strategy_dict = {
+ "NO_SHARD": ShardingStrategy.NO_SHARD,
+ "SHARD_GRAD_OP": ShardingStrategy.SHARD_GRAD_OP,
+ "FULL_SHARD": ShardingStrategy.FULL_SHARD,
+ }
+
+ dtype_dict = {
+ "fp32": torch.float32,
+ "fp16": torch.float16,
+ "bf16": torch.bfloat16,
+ }
+
+ mixed_precision_config = MixedPrecision(
+ param_dtype=dtype_dict[model_cfg.mixed_precision.param_dtype],
+ reduce_dtype=dtype_dict[model_cfg.mixed_precision.reduce_dtype],
+ buffer_dtype=dtype_dict[model_cfg.mixed_precision.buffer_dtype],
+ )
+
+ sharding_strategy_config = sharding_strategy_dict[model_cfg.sharding_strategy]
+
+ local_rank = distributed.get_local_rank()
+
+ fsdp_wrapper = partial(
+ FSDP,
+ sharding_strategy=sharding_strategy_config,
+ mixed_precision=mixed_precision_config,
+ device_id=local_rank,
+ sync_module_states=True,
+ use_orig_params=True,
+ auto_wrap_policy=ModuleWrapPolicy(modules_to_wrap),
+ )
+ return fsdp_wrapper
+
+
+def is_fsdp(x):
+ return isinstance(x, FSDP)
+
+
+def is_sharded_fsdp(x):
+ return is_fsdp(x) and x.sharding_strategy is not ShardingStrategy.NO_SHARD
+
+
+def free_if_fsdp(x):
+ if is_sharded_fsdp(x):
+ handles = x._handles
+ true_list = [True for h in handles]
+ _reshard(x, handles, true_list)
+
+
+def get_fsdp_modules(x):
+ return FSDP.fsdp_modules(x)
+
+
+def reshard_fsdp_model(x):
+ for m in get_fsdp_modules(x):
+ free_if_fsdp(m)
+
+
+def rankstr():
+ return f"rank_{distributed.get_global_rank()}"
+
+
+class FSDPCheckpointer(Checkpointer):
+ def save(self, name: str, **kwargs: Any) -> None:
+ """
+ Dump model and checkpointables to a file.
+
+ Args:
+ name (str): name of the file.
+ kwargs (dict): extra arbitrary data to save.
+ """
+ if not self.save_dir or not self.save_to_disk:
+ return
+
+ data = {}
+ with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT):
+ data["model"] = self.model.state_dict()
+
+ # data["model"] = self.model.state_dict()
+ for key, obj in self.checkpointables.items():
+ data[key] = obj.state_dict()
+ data.update(kwargs)
+
+ basename = f"{name}.{rankstr()}.pth"
+ save_file = os.path.join(self.save_dir, basename)
+ assert os.path.basename(save_file) == basename, basename
+ self.logger.info("Saving checkpoint to {}".format(save_file))
+ with self.path_manager.open(save_file, "wb") as f:
+ torch.save(data, f)
+ self.tag_last_checkpoint(basename)
+
+ def load(self, *args, **kwargs):
+ with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT):
+ return super().load(*args, **kwargs)
+
+ def has_checkpoint(self) -> bool:
+ """
+ Returns:
+ bool: whether a checkpoint exists in the target directory.
+ """
+ save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}")
+ return self.path_manager.exists(save_file)
+
+ def get_checkpoint_file(self) -> str:
+ """
+ Returns:
+ str: The latest checkpoint file in target directory.
+ """
+ save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}")
+ try:
+ with self.path_manager.open(save_file, "r") as f:
+ last_saved = f.read().strip()
+ except IOError:
+ # if file doesn't exist, maybe because it has just been
+ # deleted by a separate process
+ return ""
+ # pyre-fixme[6]: For 2nd param expected `Union[PathLike[str], str]` but got
+ # `Union[bytes, str]`.
+ return os.path.join(self.save_dir, last_saved)
+
+ def tag_last_checkpoint(self, last_filename_basename: str) -> None:
+ """
+ Tag the last checkpoint.
+
+ Args:
+ last_filename_basename (str): the basename of the last filename.
+ """
+ if distributed.is_enabled():
+ torch.distributed.barrier()
+ save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}")
+ with self.path_manager.open(save_file, "w") as f:
+ f.write(last_filename_basename) # pyre-ignore
+
+
+ShardedGradScaler = ShardedGradScaler
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7189cdb04b5fb15d29a4cfeaf307dd5f71a0cb57
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .dino_head import DINOHead
+from .mlp import Mlp
+from .patch_embed import PatchEmbed
+from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+from .block import NestedTensorBlock
+from .attention import MemEffAttention
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..6de74c5fe8112dba58dc50c2e761f4b3883473de
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+import logging
+
+from torch import Tensor
+from torch import nn
+
+
+logger = logging.getLogger("dinov2")
+
+
+XFORMERS_AVAILABLE = False
+
+
+class Attention(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int = 8,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ attn_drop: float = 0.0,
+ proj_drop: float = 0.0,
+ ) -> None:
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim**-0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x: Tensor) -> Tensor:
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+ attn = q @ k.transpose(-2, -1)
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class MemEffAttention(Attention):
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ if not XFORMERS_AVAILABLE:
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
+ return super().forward(x)
+
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+
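+ # NOTE: the rest of this branch relies on xformers' `unbind` and
+ # `memory_efficient_attention`, which are not imported in this vendored copy;
+ # with XFORMERS_AVAILABLE hardcoded to False above, the fallback return is
+ # always taken and these lines are never reached.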
+ q, k, v = unbind(qkv, 2)
+
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+ x = x.reshape([B, N, C])
+
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d78f7989be1d9fcb1d4a02ac624ef1e9a7364ac
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py
@@ -0,0 +1,245 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+
+import logging
+from typing import Callable, List, Any, Tuple, Dict
+
+import torch
+from torch import nn, Tensor
+
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+
+
+logger = logging.getLogger("dinov2")
+
+
+XFORMERS_AVAILABLE = False
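+# NOTE: with xformers disabled, the nested-tensor helpers below (get_attn_bias_and_cat,
+# drop_add_residual_stochastic_depth_list, NestedTensorBlock.forward_nested) reference
+# xformers-only names (fmha, index_select_cat, scaled_index_add) that are not imported
+# in this vendored copy; plain Tensor inputs take the regular Block.forward path, while
+# list inputs fail the XFORMERS_AVAILABLE assert in NestedTensorBlock.forward.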
+
+
+class Block(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ ffn_bias: bool = True,
+ drop: float = 0.0,
+ attn_drop: float = 0.0,
+ init_values=None,
+ drop_path: float = 0.0,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+ attn_class: Callable[..., nn.Module] = Attention,
+ ffn_layer: Callable[..., nn.Module] = Mlp,
+ ) -> None:
+ super().__init__()
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
+ self.norm1 = norm_layer(dim)
+ self.attn = attn_class(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ )
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = ffn_layer(
+ in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop,
+ bias=ffn_bias,
+ )
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.sample_drop_ratio = drop_path
+
+ def forward(self, x: Tensor) -> Tensor:
+ def attn_residual_func(x: Tensor) -> Tensor:
+ return self.ls1(self.attn(self.norm1(x)))
+
+ def ffn_residual_func(x: Tensor) -> Tensor:
+ return self.ls2(self.mlp(self.norm2(x)))
+
+ if self.training and self.sample_drop_ratio > 0.1:
+ # the overhead is compensated only for a drop path rate larger than 0.1
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ )
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ )
+ elif self.training and self.sample_drop_ratio > 0.0:
+ x = x + self.drop_path1(attn_residual_func(x))
+ x = x + self.drop_path2(ffn_residual_func(x))
+ else:
+ x = x + attn_residual_func(x)
+ x = x + ffn_residual_func(x)
+ return x
+
+
+def drop_add_residual_stochastic_depth(
+ x: Tensor,
+ residual_func: Callable[[Tensor], Tensor],
+ sample_drop_ratio: float = 0.0,
+) -> Tensor:
+ # 1) extract subset using permutation
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ x_subset = x[brange]
+
+ # 2) apply residual_func to get residual
+ residual = residual_func(x_subset)
+
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+
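+ # only a random subset of samples receives the residual, so it is scaled by
+ # b / sample_subset_size to keep the expected update equal to the full-batch case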
+ residual_scale_factor = b / sample_subset_size
+
+ # 3) add the residual
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ return x_plus_residual.view_as(x)
+
+
+def get_branges_scales(x, sample_drop_ratio=0.0):
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ residual_scale_factor = b / sample_subset_size
+ return brange, residual_scale_factor
+
+
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+ if scaling_vector is None:
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ else:
+ x_plus_residual = scaled_index_add(
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+ )
+ return x_plus_residual
+
+
+attn_bias_cache: Dict[Tuple, Any] = {}
+
+
+def get_attn_bias_and_cat(x_list, branges=None):
+ """
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
+ """
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+ if all_shapes not in attn_bias_cache.keys():
+ seqlens = []
+ for b, x in zip(batch_sizes, x_list):
+ for _ in range(b):
+ seqlens.append(x.shape[1])
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+ attn_bias._batch_sizes = batch_sizes
+ attn_bias_cache[all_shapes] = attn_bias
+
+ if branges is not None:
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
+ else:
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
+
+ return attn_bias_cache[all_shapes], cat_tensors
+
+
+def drop_add_residual_stochastic_depth_list(
+ x_list: List[Tensor],
+ residual_func: Callable[[Tensor, Any], Tensor],
+ sample_drop_ratio: float = 0.0,
+ scaling_vector=None,
+) -> Tensor:
+ # 1) generate random set of indices for dropping samples in the batch
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+ branges = [s[0] for s in branges_scales]
+ residual_scale_factors = [s[1] for s in branges_scales]
+
+ # 2) get attention bias and index+concat the tensors
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+
+ # 3) apply residual_func to get residual, and split the result
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
+
+ outputs = []
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
+ return outputs
+
+
+class NestedTensorBlock(Block):
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+ """
+ x_list contains a list of tensors to nest together and run
+ """
+ assert isinstance(self.attn, MemEffAttention)
+
+ if self.training and self.sample_drop_ratio > 0.0:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.mlp(self.norm2(x))
+
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
+ )
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
+ )
+ return x_list
+ else:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls2(self.mlp(self.norm2(x)))
+
+ attn_bias, x = get_attn_bias_and_cat(x_list)
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
+ x = x + ffn_residual_func(x)
+ return attn_bias.split(x)
+
+ def forward(self, x_or_x_list):
+ if isinstance(x_or_x_list, Tensor):
+ return super().forward(x_or_x_list)
+ elif isinstance(x_or_x_list, list):
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
+ return self.forward_nested(x_or_x_list)
+ else:
+ raise AssertionError
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e9392f33d9c74d487294e7962f3e4ae55c71b91
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py
@@ -0,0 +1,59 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from torch.nn.init import trunc_normal_
+from torch.nn.utils import weight_norm
+
+
+class DINOHead(nn.Module):
+ def __init__(
+ self,
+ in_dim,
+ out_dim,
+ use_bn=False,
+ nlayers=3,
+ hidden_dim=2048,
+ bottleneck_dim=256,
+ mlp_bias=True,
+ ):
+ super().__init__()
+ nlayers = max(nlayers, 1)
+ self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
+ self.apply(self._init_weights)
+ self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
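+ # weight_norm reparameterizes the weight as magnitude (weight_g) times direction
+ # (weight_v); the next line initializes the magnitude to 1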
+ self.last_layer.weight_g.data.fill_(1)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=0.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+ def forward(self, x):
+ x = self.mlp(x)
+ eps = 1e-6 if x.dtype == torch.float16 else 1e-12
+ x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
+ x = self.last_layer(x)
+ return x
+
+
+def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
+ if nlayers == 1:
+ return nn.Linear(in_dim, bottleneck_dim, bias=bias)
+ else:
+ layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
+ if use_bn:
+ layers.append(nn.BatchNorm1d(hidden_dim))
+ layers.append(nn.GELU())
+ for _ in range(nlayers - 2):
+ layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
+ if use_bn:
+ layers.append(nn.BatchNorm1d(hidden_dim))
+ layers.append(nn.GELU())
+ layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
+ return nn.Sequential(*layers)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py
new file mode 100644
index 0000000000000000000000000000000000000000..10c3bea8e40eec258bbe59087770d230a6375481
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+
+
+from torch import nn
+
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+ if drop_prob == 0.0 or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
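+ # per-sample Bernoulli keep mask, broadcast over all non-batch dims; kept samples
+ # are rescaled by 1/keep_prob below so the expected output matches the identity path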
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0:
+ random_tensor.div_(keep_prob)
+ output = x * random_tensor
+ return output
+
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a4d0eedb1dc974a45e06fbe77ff3d909e36e55
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py
@@ -0,0 +1,28 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+
+from typing import Union
+
+import torch
+from torch import Tensor
+from torch import nn
+
+
+class LayerScale(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ init_values: Union[float, Tensor] = 1e-5,
+ inplace: bool = False,
+ ) -> None:
+ super().__init__()
+ self.inplace = inplace
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+ def forward(self, x: Tensor) -> Tensor:
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..504987b635c9cd582a352fb2381228c9e6cd043c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+
+
+from typing import Callable, Optional
+
+from torch import Tensor, nn
+
+
+class Mlp(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x: Tensor) -> Tensor:
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..f880c042ee6a33ef520c6a8c8a686c1d065b8f49
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py
@@ -0,0 +1,89 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+
+from typing import Callable, Optional, Tuple, Union
+
+from torch import Tensor
+import torch.nn as nn
+
+
+def make_2tuple(x):
+ if isinstance(x, tuple):
+ assert len(x) == 2
+ return x
+
+ assert isinstance(x, int)
+ return (x, x)
+
+
+class PatchEmbed(nn.Module):
+ """
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+
+ Args:
+ img_size: Image size.
+ patch_size: Patch token size.
+ in_chans: Number of input image channels.
+ embed_dim: Number of linear projection output channels.
+ norm_layer: Normalization layer.
+ """
+
+ def __init__(
+ self,
+ img_size: Union[int, Tuple[int, int]] = 224,
+ patch_size: Union[int, Tuple[int, int]] = 16,
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ norm_layer: Optional[Callable] = None,
+ flatten_embedding: bool = True,
+ ) -> None:
+ super().__init__()
+
+ image_HW = make_2tuple(img_size)
+ patch_HW = make_2tuple(patch_size)
+ patch_grid_size = (
+ image_HW[0] // patch_HW[0],
+ image_HW[1] // patch_HW[1],
+ )
+
+ self.img_size = image_HW
+ self.patch_size = patch_HW
+ self.patches_resolution = patch_grid_size
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+
+ self.flatten_embedding = flatten_embedding
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+ def forward(self, x: Tensor) -> Tensor:
+ _, _, H, W = x.shape
+ patch_H, patch_W = self.patch_size
+
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width {patch_W}"
+
+ x = self.proj(x) # B C H W
+ H, W = x.size(2), x.size(3)
+ x = x.flatten(2).transpose(1, 2) # B HW C
+ x = self.norm(x)
+ if not self.flatten_embedding:
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
+ return x
+
+ def flops(self) -> float:
+ Ho, Wo = self.patches_resolution
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+ if self.norm is not None:
+ flops += Ho * Wo * self.embed_dim
+ return flops
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py
new file mode 100644
index 0000000000000000000000000000000000000000..155a3dd9f6f1a7d0f7bdf9c8f1981e58acb3b19c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Optional
+
+from torch import Tensor, nn
+import torch.nn.functional as F
+
+
+class SwiGLUFFN(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Optional[Callable[..., nn.Module]] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+
+ def forward(self, x: Tensor) -> Tensor:
+ x12 = self.w12(x)
+ x1, x2 = x12.chunk(2, dim=-1)
+ hidden = F.silu(x1) * x2
+ return self.w3(hidden)
+
+
+try:
+ from xformers.ops import SwiGLU
+
+ XFORMERS_AVAILABLE = True
+except ImportError:
+ SwiGLU = SwiGLUFFN
+ XFORMERS_AVAILABLE = False
+
+
+class SwiGLUFFNFused(SwiGLU):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Optional[Callable[..., nn.Module]] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
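+ # scale the hidden width by 2/3 (SwiGLU uses two input projections, so this keeps the
+ # parameter count close to a plain MLP of the same ratio) and round up to a multiple of 8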
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+ super().__init__(
+ in_features=in_features,
+ hidden_features=hidden_features,
+ out_features=out_features,
+ bias=bias,
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..46fec07bc3042342e2b68b8739a72072058c7851
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py
@@ -0,0 +1,103 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import logging
+import os
+import sys
+from typing import Optional
+
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+from .helpers import MetricLogger, SmoothedValue
+
+
+# So that calling _configure_logger multiple times won't add many handlers
+@functools.lru_cache()
+def _configure_logger(
+ name: Optional[str] = None,
+ *,
+ level: int = logging.DEBUG,
+ output: Optional[str] = None,
+):
+ """
+ Configure a logger.
+
+ Adapted from Detectron2.
+
+ Args:
+ name: The name of the logger to configure.
+ level: The logging level to use.
+ output: A file name or a directory to save log. If None, will not save log file.
+ If ends with ".txt" or ".log", assumed to be a file name.
+ Otherwise, logs will be saved to `output/log.txt`.
+
+ Returns:
+ The configured logger.
+ """
+
+ logger = logging.getLogger(name)
+ logger.setLevel(level)
+ logger.propagate = False
+
+ # Loosely match Google glog format:
+ # [IWEF]yyyymmdd hh:mm:ss.uuuuuu threadid file:line] msg
+ # but use a shorter timestamp and include the logger name:
+ # [IWEF]yyyymmdd hh:mm:ss logger threadid file:line] msg
+ fmt_prefix = "%(levelname).1s%(asctime)s %(process)s %(name)s %(filename)s:%(lineno)s] "
+ fmt_message = "%(message)s"
+ fmt = fmt_prefix + fmt_message
+ datefmt = "%Y%m%d %H:%M:%S"
+ formatter = logging.Formatter(fmt=fmt, datefmt=datefmt)
+
+ # stdout logging for main worker only
+ if distributed.is_main_process():
+ handler = logging.StreamHandler(stream=sys.stdout)
+ handler.setLevel(logging.DEBUG)
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+
+ # file logging for all workers
+ if output:
+ if os.path.splitext(output)[-1] in (".txt", ".log"):
+ filename = output
+ else:
+ filename = os.path.join(output, "logs", "log.txt")
+
+ if not distributed.is_main_process():
+ global_rank = distributed.get_global_rank()
+ filename = filename + ".rank{}".format(global_rank)
+
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+ handler = logging.StreamHandler(open(filename, "a"))
+ handler.setLevel(logging.DEBUG)
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+
+ return logger
+
+
+def setup_logging(
+ output: Optional[str] = None,
+ *,
+ name: Optional[str] = None,
+ level: int = logging.DEBUG,
+ capture_warnings: bool = True,
+) -> None:
+ """
+ Setup logging.
+
+ Args:
+ output: A file name or a directory to save log files. If None, log
+ files will not be saved. If output ends with ".txt" or ".log", it
+ is assumed to be a file name.
+ Otherwise, logs will be saved to `output/log.txt`.
+ name: The name of the logger to configure, by default the root logger.
+ level: The logging level to use.
+ capture_warnings: Whether warnings should be captured as logs.
+ """
+ logging.captureWarnings(capture_warnings)
+ _configure_logger(name, level=level, output=output)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..52f11fab956a8eff56aa63cc463fbc8f78181bb9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py
@@ -0,0 +1,195 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import defaultdict, deque
+import datetime
+import json
+import logging
+import time
+
+import torch
+
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+
+
+logger = logging.getLogger("dinov2")
+
+
+class MetricLogger(object):
+ def __init__(self, delimiter="\t", output_file=None):
+ self.meters = defaultdict(SmoothedValue)
+ self.delimiter = delimiter
+ self.output_file = output_file
+
+ def update(self, **kwargs):
+ for k, v in kwargs.items():
+ if isinstance(v, torch.Tensor):
+ v = v.item()
+ assert isinstance(v, (float, int))
+ self.meters[k].update(v)
+
+ def __getattr__(self, attr):
+ if attr in self.meters:
+ return self.meters[attr]
+ if attr in self.__dict__:
+ return self.__dict__[attr]
+ raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
+
+ def __str__(self):
+ loss_str = []
+ for name, meter in self.meters.items():
+ loss_str.append("{}: {}".format(name, str(meter)))
+ return self.delimiter.join(loss_str)
+
+ def synchronize_between_processes(self):
+ for meter in self.meters.values():
+ meter.synchronize_between_processes()
+
+ def add_meter(self, name, meter):
+ self.meters[name] = meter
+
+ def dump_in_output_file(self, iteration, iter_time, data_time):
+ if self.output_file is None or not distributed.is_main_process():
+ return
+ dict_to_dump = dict(
+ iteration=iteration,
+ iter_time=iter_time,
+ data_time=data_time,
+ )
+ dict_to_dump.update({k: v.median for k, v in self.meters.items()})
+ with open(self.output_file, "a") as f:
+ f.write(json.dumps(dict_to_dump) + "\n")
+
+ def log_every(self, iterable, print_freq, header=None, n_iterations=None, start_iteration=0):
+ i = start_iteration
+ if not header:
+ header = ""
+ start_time = time.time()
+ end = time.time()
+ iter_time = SmoothedValue(fmt="{avg:.6f}")
+ data_time = SmoothedValue(fmt="{avg:.6f}")
+
+ if n_iterations is None:
+ n_iterations = len(iterable)
+
+ space_fmt = ":" + str(len(str(n_iterations))) + "d"
+
+ log_list = [
+ header,
+ "[{0" + space_fmt + "}/{1}]",
+ "eta: {eta}",
+ "{meters}",
+ "time: {time}",
+ "data: {data}",
+ ]
+ if torch.cuda.is_available():
+ log_list += ["max mem: {memory:.0f}"]
+
+ log_msg = self.delimiter.join(log_list)
+ MB = 1024.0 * 1024.0
+ for obj in iterable:
+ data_time.update(time.time() - end)
+ yield obj
+ iter_time.update(time.time() - end)
+ if i % print_freq == 0 or i == n_iterations - 1:
+ self.dump_in_output_file(iteration=i, iter_time=iter_time.avg, data_time=data_time.avg)
+ eta_seconds = iter_time.global_avg * (n_iterations - i)
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+ if torch.cuda.is_available():
+ logger.info(
+ log_msg.format(
+ i,
+ n_iterations,
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time),
+ memory=torch.cuda.max_memory_allocated() / MB,
+ )
+ )
+ else:
+ logger.info(
+ log_msg.format(
+ i,
+ n_iterations,
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time),
+ )
+ )
+ i += 1
+ end = time.time()
+ if i >= n_iterations:
+ break
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ logger.info("{} Total time: {} ({:.6f} s / it)".format(header, total_time_str, total_time / n_iterations))
+
+
+class SmoothedValue:
+ """Track a series of values and provide access to smoothed values over a
+ window or the global series average.
+ """
+
+ def __init__(self, window_size=20, fmt=None):
+ if fmt is None:
+ fmt = "{median:.4f} ({global_avg:.4f})"
+ self.deque = deque(maxlen=window_size)
+ self.total = 0.0
+ self.count = 0
+ self.fmt = fmt
+
+ def update(self, value, num=1):
+ self.deque.append(value)
+ self.count += num
+ self.total += value * num
+
+ def synchronize_between_processes(self):
+ """
+ Distributed synchronization of the metric
+ Warning: does not synchronize the deque!
+ """
+ if not distributed.is_enabled():
+ return
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+ torch.distributed.barrier()
+ torch.distributed.all_reduce(t)
+ t = t.tolist()
+ self.count = int(t[0])
+ self.total = t[1]
+
+ @property
+ def median(self):
+ d = torch.tensor(list(self.deque))
+ return d.median().item()
+
+ @property
+ def avg(self):
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
+ return d.mean().item()
+
+ @property
+ def global_avg(self):
+ return self.total / self.count
+
+ @property
+ def max(self):
+ return max(self.deque)
+
+ @property
+ def value(self):
+ return self.deque[-1]
+
+ def __str__(self):
+ return self.fmt.format(
+ median=self.median,
+ avg=self.avg,
+ global_avg=self.global_avg,
+ max=self.max,
+ value=self.value,
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b27240f645c1b68419dccc43c349107b3790b9b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .dino_clstoken_loss import DINOLoss
+from .ibot_patch_loss import iBOTPatchLoss
+from .koleo_loss import KoLeoLoss
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..fde5375dd7da512887bdd77ed4f7b626eafb01cb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch import nn
+
+
+class DINOLoss(nn.Module):
+ def __init__(
+ self,
+ out_dim,
+ student_temp=0.1,
+ center_momentum=0.9,
+ ):
+ super().__init__()
+ self.student_temp = student_temp
+ self.center_momentum = center_momentum
+ self.register_buffer("center", torch.zeros(1, out_dim))
+ self.updated = True
+ self.reduce_handle = None
+ self.len_teacher_output = None
+ self.async_batch_center = None
+
+ @torch.no_grad()
+ def softmax_center_teacher(self, teacher_output, teacher_temp):
+ self.apply_center_update()
+ # teacher centering and sharpening
+ return F.softmax((teacher_output - self.center) / teacher_temp, dim=-1)
+
+ @torch.no_grad()
+ def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_iterations=3):
+ teacher_output = teacher_output.float()
+ world_size = dist.get_world_size() if dist.is_initialized() else 1
+ Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper
+ B = Q.shape[1] * world_size # number of samples to assign
+ K = Q.shape[0] # how many prototypes
+
+ # make the matrix sum to 1
+ sum_Q = torch.sum(Q)
+ if dist.is_initialized():
+ dist.all_reduce(sum_Q)
+ Q /= sum_Q
+
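+ # Sinkhorn-Knopp iterations: alternately normalize rows (prototypes) and columns
+ # (samples) so that Q converges towards a balanced soft assignment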
+ for it in range(n_iterations):
+ # normalize each row: total weight per prototype must be 1/K
+ sum_of_rows = torch.sum(Q, dim=1, keepdim=True)
+ if dist.is_initialized():
+ dist.all_reduce(sum_of_rows)
+ Q /= sum_of_rows
+ Q /= K
+
+ # normalize each column: total weight per sample must be 1/B
+ Q /= torch.sum(Q, dim=0, keepdim=True)
+ Q /= B
+
+ Q *= B # the columns must sum to 1 so that Q is an assignment
+ return Q.t()
+
+ def forward(self, student_output_list, teacher_out_softmaxed_centered_list):
+ """
+ Cross-entropy between softmax outputs of the teacher and student networks.
+ """
+ # TODO: Use cross_entropy_distribution here
+ total_loss = 0
+ for s in student_output_list:
+ lsm = F.log_softmax(s / self.student_temp, dim=-1)
+ for t in teacher_out_softmaxed_centered_list:
+ loss = torch.sum(t * lsm, dim=-1)
+ total_loss -= loss.mean()
+ return total_loss
+
+ @torch.no_grad()
+ def update_center(self, teacher_output):
+ self.reduce_center_update(teacher_output)
+
+ @torch.no_grad()
+ def reduce_center_update(self, teacher_output):
+ self.updated = False
+ self.len_teacher_output = len(teacher_output)
+ self.async_batch_center = torch.sum(teacher_output, dim=0, keepdim=True)
+ if dist.is_initialized():
+ self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True)
+
+ @torch.no_grad()
+ def apply_center_update(self):
+ if self.updated is False:
+ world_size = dist.get_world_size() if dist.is_initialized() else 1
+
+ if self.reduce_handle is not None:
+ self.reduce_handle.wait()
+ _t = self.async_batch_center / (self.len_teacher_output * world_size)
+
+ self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum)
+
+ self.updated = True
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..5234362f80a0ad06e6f294b129828995aa6884a0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py
@@ -0,0 +1,152 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch import nn
+
+import logging
+
+
+logger = logging.getLogger("dinov2")
+
+
+try:
+ from xformers.ops import cross_entropy
+
+ def lossfunc(t, s, temp):
+ s = s.float()
+ t = t.float()
+ if s.ndim == 2:
+ return -cross_entropy(s.unsqueeze(0), t.unsqueeze(0), temp, bw_inplace=True).squeeze(0)
+ elif s.ndim == 3:
+ return -cross_entropy(s, t, temp, bw_inplace=True)
+
+except ImportError:
+
+ def lossfunc(t, s, temp):
+ return torch.sum(t * F.log_softmax(s / temp, dim=-1), dim=-1)
+
+
+class iBOTPatchLoss(nn.Module):
+ def __init__(self, patch_out_dim, student_temp=0.1, center_momentum=0.9):
+ super().__init__()
+ self.student_temp = student_temp
+ self.center_momentum = center_momentum
+ self.register_buffer("center", torch.zeros(1, 1, patch_out_dim))
+ self.updated = True
+ self.reduce_handle = None
+ self.len_teacher_patch_tokens = None
+ self.async_batch_center = None
+
+ @torch.no_grad()
+ def softmax_center_teacher(self, teacher_patch_tokens, teacher_temp):
+ self.apply_center_update()
+ # teacher centering and sharpening
+ #
+ # WARNING:
+ # as self.center is a float32, everything gets casted to float32 afterwards
+ #
+ # teacher_patch_tokens = teacher_patch_tokens.float()
+ # return F.softmax((teacher_patch_tokens.sub_(self.center.to(teacher_patch_tokens.dtype))).mul_(1 / teacher_temp), dim=-1)
+
+ return F.softmax((teacher_patch_tokens - self.center) / teacher_temp, dim=-1)
+
+ # this is experimental, keep everything in float16 and let's see what happens:
+ # return F.softmax((teacher_patch_tokens.sub_(self.center)) / teacher_temp, dim=-1)
+
+ @torch.no_grad()
+ def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_masked_patches_tensor, n_iterations=3):
+ teacher_output = teacher_output.float()
+ # world_size = dist.get_world_size() if dist.is_initialized() else 1
+ Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper
+ # B = Q.shape[1] * world_size # number of samples to assign
+ B = n_masked_patches_tensor
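+ # NOTE: unlike the reductions below, this all_reduce is not guarded by
+ # dist.is_initialized(), so this code path assumes an initialized process group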
+ dist.all_reduce(B)
+ K = Q.shape[0] # how many prototypes
+
+ # make the matrix sum to 1
+ sum_Q = torch.sum(Q)
+ if dist.is_initialized():
+ dist.all_reduce(sum_Q)
+ Q /= sum_Q
+
+ for it in range(n_iterations):
+ # normalize each row: total weight per prototype must be 1/K
+ sum_of_rows = torch.sum(Q, dim=1, keepdim=True)
+ if dist.is_initialized():
+ dist.all_reduce(sum_of_rows)
+ Q /= sum_of_rows
+ Q /= K
+
+ # normalize each column: total weight per sample must be 1/B
+ Q /= torch.sum(Q, dim=0, keepdim=True)
+ Q /= B
+
+ Q *= B # the columns must sum to 1 so that Q is an assignment
+ return Q.t()
+
+ def forward(self, student_patch_tokens, teacher_patch_tokens, student_masks_flat):
+ """
+ Cross-entropy between softmax outputs of the teacher and student networks.
+ student_patch_tokens: (B, N, D) tensor
+ teacher_patch_tokens: (B, N, D) tensor
+ student_masks_flat: (B, N) tensor
+ """
+ t = teacher_patch_tokens
+ s = student_patch_tokens
+ loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1)
+ loss = torch.sum(loss * student_masks_flat.float(), dim=-1) / student_masks_flat.sum(dim=-1).clamp(min=1.0)
+ return -loss.mean()
+
+ def forward_masked(
+ self,
+ student_patch_tokens_masked,
+ teacher_patch_tokens_masked,
+ student_masks_flat,
+ n_masked_patches=None,
+ masks_weight=None,
+ ):
+ t = teacher_patch_tokens_masked
+ s = student_patch_tokens_masked
+ # loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1)
+ loss = lossfunc(t, s, self.student_temp)
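+ # masks_weight defaults to 1 / (number of masked patches in the corresponding image), computed per masked patch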
+ if masks_weight is None:
+ masks_weight = (
+ (1 / student_masks_flat.sum(-1).clamp(min=1.0))
+ .unsqueeze(-1)
+ .expand_as(student_masks_flat)[student_masks_flat]
+ )
+ if n_masked_patches is not None:
+ loss = loss[:n_masked_patches]
+ loss = loss * masks_weight
+ return -loss.sum() / student_masks_flat.shape[0]
+
+ @torch.no_grad()
+ def update_center(self, teacher_patch_tokens):
+ self.reduce_center_update(teacher_patch_tokens)
+
+ @torch.no_grad()
+ def reduce_center_update(self, teacher_patch_tokens):
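+ # sum the per-image mean of teacher patch tokens over the batch and start an async all-reduce; the EMA center is applied lazily in apply_center_update()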
+ self.updated = False
+ self.len_teacher_patch_tokens = len(teacher_patch_tokens)
+ self.async_batch_center = torch.sum(teacher_patch_tokens.mean(1), dim=0, keepdim=True)
+ if dist.is_initialized():
+ self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True)
+
+ @torch.no_grad()
+ def apply_center_update(self):
+ if self.updated is False:
+ world_size = dist.get_world_size() if dist.is_initialized() else 1
+
+ if self.reduce_handle is not None:
+ self.reduce_handle.wait()
+ _t = self.async_batch_center / (self.len_teacher_patch_tokens * world_size)
+
+ self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum)
+
+ self.updated = True
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d7230c9d6d6f847918d5fe68bae2041b8556d5c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# import torch.distributed as dist
+
+
+logger = logging.getLogger("dinov2")
+
+
+class KoLeoLoss(nn.Module):
+ """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search"""
+
+ def __init__(self):
+ super().__init__()
+ self.pdist = nn.PairwiseDistance(2, eps=1e-8)
+
+ def pairwise_NNs_inner(self, x):
+ """
+ Pairwise nearest neighbors for L2-normalized vectors.
+ Uses Torch rather than Faiss to remain on GPU.
+ """
+ # pairwise dot products (= inverse distance)
+ dots = torch.mm(x, x.t())
+ n = x.shape[0]
+ dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1
+ # max inner prod -> min distance
+ _, I = torch.max(dots, dim=1) # noqa: E741
+ return I
+
+ def forward(self, student_output, eps=1e-8):
+ """
+ Args:
+ student_output (BxD): backbone output of student
+ """
+ with torch.cuda.amp.autocast(enabled=False):
+ student_output = F.normalize(student_output, eps=eps, p=2, dim=-1)
+ I = self.pairwise_NNs_inner(student_output) # noqa: E741
+ distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B
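+ # penalize small nearest-neighbor distances: minimizing -log(distance) spreads the normalized features over the unit sphere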
+ loss = -torch.log(distances + eps).mean()
+ return loss
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fea19c32020056d79e3583fff30a1157b3563b2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from . import vision_transformer as vits
+
+
+logger = logging.getLogger("dinov2")
+
+
+def build_model(args, only_teacher=False, img_size=224):
+ args.arch = args.arch.removesuffix("_memeff")
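+ # drop the optional "_memeff" suffix so the name matches a constructor in vision_transformer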
+ if "vit" in args.arch:
+ vit_kwargs = dict(
+ img_size=img_size,
+ patch_size=args.patch_size,
+ init_values=args.layerscale,
+ ffn_layer=args.ffn_layer,
+ block_chunks=args.block_chunks,
+ qkv_bias=args.qkv_bias,
+ proj_bias=args.proj_bias,
+ ffn_bias=args.ffn_bias,
+ )
+ teacher = vits.__dict__[args.arch](**vit_kwargs)
+ if only_teacher:
+ return teacher, teacher.embed_dim
+ student = vits.__dict__[args.arch](
+ **vit_kwargs,
+ drop_path_rate=args.drop_path_rate,
+ drop_path_uniform=args.drop_path_uniform,
+ )
+ embed_dim = student.embed_dim
+ return student, teacher, embed_dim
+
+
+def build_model_from_cfg(cfg, only_teacher=False):
+ return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e0def226e442e86ddfd002f1bcf1aa8d28042e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py
@@ -0,0 +1,358 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn.init import trunc_normal_
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+
+
+logger = logging.getLogger("dinov2")
+
+
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+ if not depth_first and include_root:
+ fn(module=module, name=name)
+ for child_name, child_module in module.named_children():
+ child_name = ".".join((name, child_name)) if name else child_name
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+ if depth_first and include_root:
+ fn(module=module, name=name)
+ return module
+
+
+class BlockChunk(nn.ModuleList):
+ def forward(self, x):
+ for b in self:
+ x = b(x)
+ return x
+
+
+class DinoVisionTransformer(nn.Module):
+ def __init__(
+ self,
+ img_size=224,
+ patch_size=16,
+ in_chans=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ ffn_bias=True,
+ proj_bias=True,
+ drop_path_rate=0.0,
+ drop_path_uniform=False,
+ init_values=None, # for layerscale: None or 0 => no layerscale
+ embed_layer=PatchEmbed,
+ act_layer=nn.GELU,
+ block_fn=Block,
+ ffn_layer="mlp",
+ block_chunks=1,
+ ):
+ """
+ Args:
+ img_size (int, tuple): input image size
+ patch_size (int, tuple): patch size
+ in_chans (int): number of input channels
+ embed_dim (int): embedding dimension
+ depth (int): depth of transformer
+ num_heads (int): number of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ proj_bias (bool): enable bias for proj in attn if True
+ ffn_bias (bool): enable bias for ffn if True
+ drop_path_rate (float): stochastic depth rate
+ drop_path_uniform (bool): apply uniform drop rate across blocks
+ init_values (float): layer-scale init values
+ embed_layer (nn.Module): patch embedding layer
+ act_layer (nn.Module): MLP activation layer
+ block_fn (nn.Module): transformer block class
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+ block_chunks (int): split block sequence into block_chunks units for FSDP wrap
+ """
+ super().__init__()
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
+
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.num_tokens = 1
+ self.n_blocks = depth
+ self.num_heads = num_heads
+ self.patch_size = patch_size
+
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+
+ if drop_path_uniform is True:
+ dpr = [drop_path_rate] * depth
+ else:
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+
+ if ffn_layer == "mlp":
+ logger.info("using MLP layer as FFN")
+ ffn_layer = Mlp
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+ logger.info("using SwiGLU layer as FFN")
+ ffn_layer = SwiGLUFFNFused
+ elif ffn_layer == "identity":
+ logger.info("using Identity layer as FFN")
+
+ def f(*args, **kwargs):
+ return nn.Identity()
+
+ ffn_layer = f
+ else:
+ raise NotImplementedError
+
+ blocks_list = [
+ block_fn(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ ffn_bias=ffn_bias,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ act_layer=act_layer,
+ ffn_layer=ffn_layer,
+ init_values=init_values,
+ )
+ for i in range(depth)
+ ]
+ if block_chunks > 0:
+ self.chunked_blocks = True
+ chunked_blocks = []
+ chunksize = depth // block_chunks
+ for i in range(0, depth, chunksize):
+ # this is to keep the block index consistent if we chunk the block list
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+ else:
+ self.chunked_blocks = False
+ self.blocks = nn.ModuleList(blocks_list)
+
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Identity()
+
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+
+ self.init_weights()
+
+ def init_weights(self):
+ trunc_normal_(self.pos_embed, std=0.02)
+ nn.init.normal_(self.cls_token, std=1e-6)
+ named_apply(init_weights_vit_timm, self)
+
+ def interpolate_pos_encoding(self, x, w, h):
+ previous_dtype = x.dtype
+ npatch = x.shape[1] - 1
+ N = self.pos_embed.shape[1] - 1
+ if npatch == N and w == h:
+ return self.pos_embed
+ pos_embed = self.pos_embed.float()
+ class_pos_embed = pos_embed[:, 0]
+ patch_pos_embed = pos_embed[:, 1:]
+ dim = x.shape[-1]
+ w0 = w // self.patch_size
+ h0 = h // self.patch_size
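+ # w0, h0: number of patches along width and height at the current input resolution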
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+ w0, h0 = w0 + 0.1, h0 + 0.1
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
+ scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
+ mode="bicubic",
+ )
+
+ assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+
+ def prepare_tokens_with_masks(self, x, masks=None):
+ B, nc, w, h = x.shape
+ x = self.patch_embed(x)
+ if masks is not None:
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+ x = x + self.interpolate_pos_encoding(x, w, h)
+
+ return x
+
+ def forward_features_list(self, x_list, masks_list):
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+ for blk in self.blocks:
+ x = blk(x)
+
+ all_x = x
+ output = []
+ for x, masks in zip(all_x, masks_list):
+ x_norm = self.norm(x)
+ output.append(
+ {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_patchtokens": x_norm[:, 1:],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+ )
+ return output
+
+ def forward_features(self, x, masks=None):
+ if isinstance(x, list):
+ return self.forward_features_list(x, masks)
+
+ x = self.prepare_tokens_with_masks(x, masks)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x_norm = self.norm(x)
+ return {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_patchtokens": x_norm[:, 1:],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ # If n is an int, take the n last blocks. If it's a list, take the blocks at those indices
+ output, total_block_len = [], len(self.blocks)
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def _get_intermediate_layers_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
+ # If n is an int, take the n last blocks. If it's a list, take the blocks at those indices
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for block_chunk in self.blocks:
+ for blk in block_chunk[i:]: # skip the leading nn.Identity() placeholders
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ i += 1
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def get_intermediate_layers(
+ self,
+ x: torch.Tensor,
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
+ reshape: bool = False,
+ return_class_token: bool = False,
+ norm=True,
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+ if self.chunked_blocks:
+ outputs = self._get_intermediate_layers_chunked(x, n)
+ else:
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
+ if norm:
+ outputs = [self.norm(out) for out in outputs]
+ class_tokens = [out[:, 0] for out in outputs]
+ outputs = [out[:, 1:] for out in outputs]
+ if reshape:
+ B, _, w, h = x.shape
+ outputs = [
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+ for out in outputs
+ ]
+ if return_class_token:
+ return tuple(zip(outputs, class_tokens))
+ return tuple(outputs)
+
+ def forward(self, *args, is_training=False, **kwargs):
+ ret = self.forward_features(*args, **kwargs)
+ if is_training:
+ return ret
+ else:
+ return self.head(ret["x_norm_clstoken"])
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+ """ViT weight initialization, original timm impl (for reproducibility)"""
+ if isinstance(module, nn.Linear):
+ trunc_normal_(module.weight, std=0.02)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
+
+
+def vit_small(patch_size=16, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=384,
+ depth=12,
+ num_heads=6,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ **kwargs,
+ )
+ return model
+
+
+def vit_base(patch_size=16, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ **kwargs,
+ )
+ return model
+
+
+def vit_large(patch_size=16, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ **kwargs,
+ )
+ return model
+
+
+def vit_giant2(patch_size=16, **kwargs):
+ """
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+ """
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1536,
+ depth=40,
+ num_heads=24,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ **kwargs,
+ )
+ return model
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4196294309799347172dba54a17360698071ca8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc027e6f4a308c488d5ac0869a99bc11b1b1ed9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.knn import get_args_parser as get_knn_args_parser
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import setup_logging
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.run.submit import get_args_parser, submit_jobs
+
+
+logger = logging.getLogger("dinov2")
+
+
+class Evaluator:
+ def __init__(self, args):
+ self.args = args
+
+ def __call__(self):
+ from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.knn import main as knn_main
+
+ self._setup_args()
+ knn_main(self.args)
+
+ def checkpoint(self):
+ import submitit
+
+ logger.info(f"Requeuing {self.args}")
+ empty = type(self)(self.args)
+ return submitit.helpers.DelayedSubmission(empty)
+
+ def _setup_args(self):
+ import submitit
+
+ job_env = submitit.JobEnvironment()
+ self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id))
+ logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
+ logger.info(f"Args: {self.args}")
+
+
+def main():
+ description = "Submitit launcher for DINOv2 k-NN evaluation"
+ knn_args_parser = get_knn_args_parser(add_help=False)
+ parents = [knn_args_parser]
+ args_parser = get_args_parser(description=description, parents=parents)
+ args = args_parser.parse_args()
+
+ setup_logging()
+
+ assert os.path.exists(args.config_file), "Configuration file does not exist!"
+ submit_jobs(Evaluator, args, name="dinov2:knn")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ecf1c1f201302ac89ea3a5d6eb675bc790e92bb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.linear import get_args_parser as get_linear_args_parser
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import setup_logging
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.run.submit import get_args_parser, submit_jobs
+
+
+logger = logging.getLogger("dinov2")
+
+
+class Evaluator:
+ def __init__(self, args):
+ self.args = args
+
+ def __call__(self):
+ from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.linear import main as linear_main
+
+ self._setup_args()
+ linear_main(self.args)
+
+ def checkpoint(self):
+ import submitit
+
+ logger.info(f"Requeuing {self.args}")
+ empty = type(self)(self.args)
+ return submitit.helpers.DelayedSubmission(empty)
+
+ def _setup_args(self):
+ import submitit
+
+ job_env = submitit.JobEnvironment()
+ self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id))
+ logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
+ logger.info(f"Args: {self.args}")
+
+
+def main():
+ description = "Submitit launcher for DINOv2 linear evaluation"
+ linear_args_parser = get_linear_args_parser(add_help=False)
+ parents = [linear_args_parser]
+ args_parser = get_args_parser(description=description, parents=parents)
+ args = args_parser.parse_args()
+
+ setup_logging()
+
+ assert os.path.exists(args.config_file), "Configuration file does not exist!"
+ submit_jobs(Evaluator, args, name="dinov2:linear")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..68845c4a0dee050c1b4b872874dad6971b28adfb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import setup_logging
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.run.submit import get_args_parser, submit_jobs
+
+
+logger = logging.getLogger("dinov2")
+
+
+class Evaluator:
+ def __init__(self, args):
+ self.args = args
+
+ def __call__(self):
+ from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.eval.log_regression import main as log_regression_main
+
+ self._setup_args()
+ log_regression_main(self.args)
+
+ def checkpoint(self):
+ import submitit
+
+ logger.info(f"Requeuing {self.args}")
+ empty = type(self)(self.args)
+ return submitit.helpers.DelayedSubmission(empty)
+
+ def _setup_args(self):
+ import submitit
+
+ job_env = submitit.JobEnvironment()
+ self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id))
+ logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
+ logger.info(f"Args: {self.args}")
+
+
+def main():
+ description = "Submitit launcher for DINOv2 logistic evaluation"
+ log_regression_args_parser = get_log_regression_args_parser(add_help=False)
+ parents = [log_regression_args_parser]
+ args_parser = get_args_parser(description=description, parents=parents)
+ args = args_parser.parse_args()
+
+ setup_logging()
+
+ assert os.path.exists(args.config_file), "Configuration file does not exist!"
+ submit_jobs(Evaluator, args, name="dinov2:logreg")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py
new file mode 100644
index 0000000000000000000000000000000000000000..e43c6307018a60058739a56d2650810ea9977016
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py
@@ -0,0 +1,123 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import List, Optional
+
+import submitit
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.cluster import (
+ get_slurm_executor_parameters,
+ get_slurm_partition,
+ get_user_checkpoint_path,
+)
+
+
+logger = logging.getLogger("dinov2")
+
+
+def get_args_parser(
+ description: Optional[str] = None,
+ parents: Optional[List[argparse.ArgumentParser]] = None,
+ add_help: bool = True,
+) -> argparse.ArgumentParser:
+ parents = parents or []
+ slurm_partition = get_slurm_partition()
+ parser = argparse.ArgumentParser(
+ description=description,
+ parents=parents,
+ add_help=add_help,
+ )
+ parser.add_argument(
+ "--ngpus",
+ "--gpus",
+ "--gpus-per-node",
+ default=8,
+ type=int,
+ help="Number of GPUs to request on each node",
+ )
+ parser.add_argument(
+ "--nodes",
+ "--nnodes",
+ default=2,
+ type=int,
+ help="Number of nodes to request",
+ )
+ parser.add_argument(
+ "--timeout",
+ default=2800,
+ type=int,
+ help="Duration of the job",
+ )
+ parser.add_argument(
+ "--partition",
+ default=slurm_partition,
+ type=str,
+ help="Partition where to submit",
+ )
+ parser.add_argument(
+ "--use-volta32",
+ action="store_true",
+ help="Request V100-32GB GPUs",
+ )
+ parser.add_argument(
+ "--comment",
+ default="",
+ type=str,
+ help="Comment to pass to scheduler, e.g. priority message",
+ )
+ parser.add_argument(
+ "--exclude",
+ default="",
+ type=str,
+ help="Nodes to exclude",
+ )
+ return parser
+
+
+def get_shared_folder() -> Path:
+ user_checkpoint_path = get_user_checkpoint_path()
+ if user_checkpoint_path is None:
+ raise RuntimeError("Path to user checkpoint cannot be determined")
+ path = user_checkpoint_path / "experiments"
+ path.mkdir(exist_ok=True)
+ return path
+
+
+def submit_jobs(task_class, args, name: str):
+ if not args.output_dir:
+ args.output_dir = str(get_shared_folder() / "%j")
+
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+ executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30)
+
+ kwargs = {}
+ if args.use_volta32:
+ kwargs["slurm_constraint"] = "volta32gb"
+ if args.comment:
+ kwargs["slurm_comment"] = args.comment
+ if args.exclude:
+ kwargs["slurm_exclude"] = args.exclude
+
+ executor_params = get_slurm_executor_parameters(
+ nodes=args.nodes,
+ num_gpus_per_node=args.ngpus,
+ timeout_min=args.timeout, # max is 60 * 72
+ slurm_signal_delay_s=120,
+ slurm_partition=args.partition,
+ **kwargs,
+ )
+ executor.update_parameters(name=name, **executor_params)
+
+ task = task_class(args)
+ job = executor.submit(task)
+
+ logger.info(f"Submitted job_id: {job.job_id}")
+ str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id))
+ logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}")
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cd447289e4815fc3decfdd73061eb3986bedcaf
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import setup_logging
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.train import get_args_parser as get_train_args_parser
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.run.submit import get_args_parser, submit_jobs
+
+
+logger = logging.getLogger("dinov2")
+
+
+class Trainer(object):
+ def __init__(self, args):
+ self.args = args
+
+ def __call__(self):
+ from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.train import main as train_main
+
+ self._setup_args()
+ train_main(self.args)
+
+ def checkpoint(self):
+ import submitit
+
+ logger.info(f"Requeuing {self.args}")
+ empty = type(self)(self.args)
+ return submitit.helpers.DelayedSubmission(empty)
+
+ def _setup_args(self):
+ import submitit
+
+ job_env = submitit.JobEnvironment()
+ self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id))
+ logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
+ logger.info(f"Args: {self.args}")
+
+
+def main():
+ description = "Submitit launcher for DINOv2 training"
+ train_args_parser = get_train_args_parser(add_help=False)
+ parents = [train_args_parser]
+ args_parser = get_args_parser(description=description, parents=parents)
+ args = args_parser.parse_args()
+
+ setup_logging()
+
+ assert os.path.exists(args.config_file), "Configuration file does not exist!"
+ submit_jobs(Trainer, args, name="dinov2:train")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..873f67e948943f2dd3bdc9c111e8cf4ad4425d6e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .train import get_args_parser, main
+from .ssl_meta_arch import SSLMetaArch
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9c6cdeabd169fea7859761e74028ea146dbd265
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py
@@ -0,0 +1,403 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from functools import partial
+import logging
+
+import torch
+from torch import nn
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.loss import DINOLoss, iBOTPatchLoss, KoLeoLoss
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.models import build_model_from_cfg
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.layers import DINOHead
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.utils import has_batchnorms
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.param_groups import get_params_groups_with_decay, fuse_params_groups
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.fsdp import get_fsdp_wrapper, ShardedGradScaler, get_fsdp_modules, reshard_fsdp_model
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.models.vision_transformer import BlockChunk
+
+try:
+ from xformers.ops import fmha
+
+ XFORMERS_AVAILABLE = True
+except ImportError:
+ XFORMERS_AVAILABLE = False
+assert XFORMERS_AVAILABLE, "xFormers is required for DINOv2 training"
+
+
+logger = logging.getLogger("dinov2")
+
+
+class SSLMetaArch(nn.Module):
+ def __init__(self, cfg):
+ super().__init__()
+ self.cfg = cfg
+ self.fp16_scaler = ShardedGradScaler() if cfg.compute_precision.grad_scaler else None
+
+ student_model_dict = dict()
+ teacher_model_dict = dict()
+
+ student_backbone, teacher_backbone, embed_dim = build_model_from_cfg(cfg)
+ student_model_dict["backbone"] = student_backbone
+ teacher_model_dict["backbone"] = teacher_backbone
+ logger.info(f"OPTIONS -- architecture : embed_dim: {embed_dim}")
+
+ if cfg.student.pretrained_weights:
+ chkpt = torch.load(cfg.student.pretrained_weights)
+ logger.info(f"OPTIONS -- pretrained weights: loading from {cfg.student.pretrained_weights}")
+ student_backbone.load_state_dict(chkpt["model"], strict=False)
+
+ self.embed_dim = embed_dim
+ self.dino_out_dim = cfg.dino.head_n_prototypes
+
+ self.do_dino = cfg.dino.loss_weight > 0
+ self.do_koleo = cfg.dino.koleo_loss_weight > 0
+ self.do_ibot = cfg.ibot.loss_weight > 0
+ self.ibot_separate_head = cfg.ibot.separate_head
+
+ logger.info("OPTIONS -- DINO")
+ if self.do_dino:
+ logger.info(f"OPTIONS -- DINO -- loss_weight: {cfg.dino.loss_weight}")
+ logger.info(f"OPTIONS -- DINO -- head_n_prototypes: {cfg.dino.head_n_prototypes}")
+ logger.info(f"OPTIONS -- DINO -- head_bottleneck_dim: {cfg.dino.head_bottleneck_dim}")
+ logger.info(f"OPTIONS -- DINO -- head_hidden_dim: {cfg.dino.head_hidden_dim}")
+ self.dino_loss_weight = cfg.dino.loss_weight
+ dino_head = partial(
+ DINOHead,
+ in_dim=embed_dim,
+ out_dim=cfg.dino.head_n_prototypes,
+ hidden_dim=cfg.dino.head_hidden_dim,
+ bottleneck_dim=cfg.dino.head_bottleneck_dim,
+ nlayers=cfg.dino.head_nlayers,
+ )
+ self.dino_loss = DINOLoss(self.dino_out_dim)
+ if self.do_koleo:
+ logger.info("OPTIONS -- DINO -- applying KOLEO regularization")
+ self.koleo_loss = KoLeoLoss()
+
+ else:
+ logger.info("OPTIONS -- DINO -- not using DINO")
+
+ if self.do_dino or self.do_ibot:
+ student_model_dict["dino_head"] = dino_head()
+ teacher_model_dict["dino_head"] = dino_head()
+
+ logger.info("OPTIONS -- IBOT")
+ logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}")
+ logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_ratio_tuple: {cfg.ibot.mask_ratio_min_max}")
+ logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_sample_probability: {cfg.ibot.mask_sample_probability}")
+ if self.do_ibot:
+ self.ibot_loss_weight = cfg.ibot.loss_weight
+ assert max(cfg.ibot.mask_ratio_min_max) > 0, "please provide a positive mask ratio tuple for ibot"
+ assert cfg.ibot.mask_sample_probability > 0, "please provide a positive mask probability for ibot"
+ self.ibot_out_dim = cfg.ibot.head_n_prototypes if self.ibot_separate_head else cfg.dino.head_n_prototypes
+ self.ibot_patch_loss = iBOTPatchLoss(self.ibot_out_dim)
+ if self.ibot_separate_head:
+ logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}")
+ logger.info(f"OPTIONS -- IBOT -- head_n_prototypes: {cfg.ibot.head_n_prototypes}")
+ logger.info(f"OPTIONS -- IBOT -- head_bottleneck_dim: {cfg.ibot.head_bottleneck_dim}")
+ logger.info(f"OPTIONS -- IBOT -- head_hidden_dim: {cfg.ibot.head_hidden_dim}")
+ ibot_head = partial(
+ DINOHead,
+ in_dim=embed_dim,
+ out_dim=cfg.ibot.head_n_prototypes,
+ hidden_dim=cfg.ibot.head_hidden_dim,
+ bottleneck_dim=cfg.ibot.head_bottleneck_dim,
+ nlayers=cfg.ibot.head_nlayers,
+ )
+ student_model_dict["ibot_head"] = ibot_head()
+ teacher_model_dict["ibot_head"] = ibot_head()
+ else:
+ logger.info("OPTIONS -- IBOT -- head shared with DINO")
+
+ self.need_to_synchronize_fsdp_streams = True
+
+ self.student = nn.ModuleDict(student_model_dict)
+ self.teacher = nn.ModuleDict(teacher_model_dict)
+
+ # there is no backpropagation through the teacher, so no need for gradients
+ for p in self.teacher.parameters():
+ p.requires_grad = False
+ logger.info(f"Student and Teacher are built: they are both {cfg.student.arch} network.")
+
+ def forward(self, inputs):
+ raise NotImplementedError
+
+ def backprop_loss(self, loss):
+ if self.fp16_scaler is not None:
+ self.fp16_scaler.scale(loss).backward()
+ else:
+ loss.backward()
+
+ def forward_backward(self, images, teacher_temp):
+ n_global_crops = 2
+ assert n_global_crops == 2
+ n_local_crops = self.cfg.crops.local_crops_number
+
+ global_crops = images["collated_global_crops"].cuda(non_blocking=True)
+ local_crops = images["collated_local_crops"].cuda(non_blocking=True)
+
+ masks = images["collated_masks"].cuda(non_blocking=True)
+ mask_indices_list = images["mask_indices_list"].cuda(non_blocking=True)
+ n_masked_patches_tensor = images["n_masked_patches"].cuda(non_blocking=True)
+ n_masked_patches = mask_indices_list.shape[0]
+ upperbound = images["upperbound"]
+ masks_weight = images["masks_weight"].cuda(non_blocking=True)
+
+ n_local_crops_loss_terms = max(n_local_crops * n_global_crops, 1)
+ n_global_crops_loss_terms = (n_global_crops - 1) * n_global_crops
+
+ do_dino = self.do_dino
+ do_ibot = self.do_ibot
+
+ # loss scales
+ ibot_loss_scale = 1.0 / n_global_crops
+
+ # teacher output
+ @torch.no_grad()
+ def get_teacher_output():
+ x, n_global_crops_teacher = global_crops, n_global_crops
+ teacher_backbone_output_dict = self.teacher.backbone(x, is_training=True)
+ teacher_cls_tokens = teacher_backbone_output_dict["x_norm_clstoken"]
+ teacher_cls_tokens = teacher_cls_tokens.chunk(n_global_crops_teacher)
+ # watch out: these are chunked and cat'd in reverse so A is matched to B in the global crops dino loss
+ teacher_cls_tokens = torch.cat((teacher_cls_tokens[1], teacher_cls_tokens[0]))
+ ibot_teacher_patch_tokens = teacher_backbone_output_dict["x_norm_patchtokens"]
+ _dim = ibot_teacher_patch_tokens.shape[-1]
+ n_cls_tokens = teacher_cls_tokens.shape[0]
+
+ if do_ibot and not self.ibot_separate_head:
+ buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound + n_cls_tokens, _dim)
+ buffer_tensor_teacher[:n_cls_tokens].copy_(teacher_cls_tokens)
+ torch.index_select(
+ ibot_teacher_patch_tokens.flatten(0, 1),
+ dim=0,
+ index=mask_indices_list,
+ out=buffer_tensor_teacher[n_cls_tokens : n_cls_tokens + n_masked_patches],
+ )
+ tokens_after_head = self.teacher.dino_head(buffer_tensor_teacher)
+ teacher_cls_tokens_after_head = tokens_after_head[:n_cls_tokens]
+ masked_teacher_patch_tokens_after_head = tokens_after_head[
+ n_cls_tokens : n_cls_tokens + n_masked_patches
+ ]
+ elif do_ibot and self.ibot_separate_head:
+ buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound, _dim)
+ torch.index_select(
+ ibot_teacher_patch_tokens.flatten(0, 1),
+ dim=0,
+ index=mask_indices_list,
+ out=buffer_tensor_teacher[:n_masked_patches],
+ )
+ teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens)
+ masked_teacher_patch_tokens_after_head = self.teacher.ibot_head(buffer_tensor_teacher)[
+ :n_masked_patches
+ ]
+ else:
+ teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens)
+ masked_teacher_ibot_softmaxed_centered = None
+
+ if self.cfg.train.centering == "centering":
+ teacher_dino_softmaxed_centered_list = self.dino_loss.softmax_center_teacher(
+ teacher_cls_tokens_after_head, teacher_temp=teacher_temp
+ ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:])
+ self.dino_loss.update_center(teacher_cls_tokens_after_head)
+ if do_ibot:
+ masked_teacher_patch_tokens_after_head = masked_teacher_patch_tokens_after_head.unsqueeze(0)
+ masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.softmax_center_teacher(
+ masked_teacher_patch_tokens_after_head[:, :n_masked_patches], teacher_temp=teacher_temp
+ )
+ masked_teacher_ibot_softmaxed_centered = masked_teacher_ibot_softmaxed_centered.squeeze(0)
+ self.ibot_patch_loss.update_center(masked_teacher_patch_tokens_after_head[:n_masked_patches])
+
+ elif self.cfg.train.centering == "sinkhorn_knopp":
+ teacher_dino_softmaxed_centered_list = self.dino_loss.sinkhorn_knopp_teacher(
+ teacher_cls_tokens_after_head, teacher_temp=teacher_temp
+ ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:])
+
+ if do_ibot:
+ masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.sinkhorn_knopp_teacher(
+ masked_teacher_patch_tokens_after_head,
+ teacher_temp=teacher_temp,
+ n_masked_patches_tensor=n_masked_patches_tensor,
+ )
+
+ else:
+ raise NotImplementedError
+
+ return teacher_dino_softmaxed_centered_list, masked_teacher_ibot_softmaxed_centered
+
+ teacher_dino_softmaxed_centered_list, masked_teacher_ibot_softmaxed_centered = get_teacher_output()
+ reshard_fsdp_model(self.teacher)
+
+ loss_dict = {}
+
+ loss_accumulator = 0 # for backprop
+ student_global_backbone_output_dict, student_local_backbone_output_dict = self.student.backbone(
+ [global_crops, local_crops], masks=[masks, None], is_training=True
+ )
+
+ inputs_for_student_head_list = []
+
+ # 1a: local crops cls tokens
+ student_local_cls_tokens = student_local_backbone_output_dict["x_norm_clstoken"]
+ inputs_for_student_head_list.append(student_local_cls_tokens.unsqueeze(0))
+
+ # 1b: global crops cls tokens
+ student_global_cls_tokens = student_global_backbone_output_dict["x_norm_clstoken"]
+ inputs_for_student_head_list.append(student_global_cls_tokens.unsqueeze(0))
+
+ # 1c: global crops patch tokens
+ if do_ibot:
+ _dim = student_global_backbone_output_dict["x_norm_clstoken"].shape[-1]
+ ibot_student_patch_tokens = student_global_backbone_output_dict["x_norm_patchtokens"]
+ buffer_tensor_patch_tokens = ibot_student_patch_tokens.new_zeros(upperbound, _dim)
+ buffer_tensor_patch_tokens[:n_masked_patches].copy_(
+ torch.index_select(ibot_student_patch_tokens.flatten(0, 1), dim=0, index=mask_indices_list)
+ )
+ if not self.ibot_separate_head:
+ inputs_for_student_head_list.append(buffer_tensor_patch_tokens.unsqueeze(0))
+ else:
+ student_global_masked_patch_tokens_after_head = self.student.ibot_head(buffer_tensor_patch_tokens)[
+ :n_masked_patches
+ ]
+
+ # 2: run
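+ # pack all head inputs into one tensor, run the DINO head once, then split the outputs back per group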
+ _attn_bias, cat_inputs = fmha.BlockDiagonalMask.from_tensor_list(inputs_for_student_head_list)
+ outputs_list = _attn_bias.split(self.student.dino_head(cat_inputs))
+
+ # 3a: local crops cls tokens
+ student_local_cls_tokens_after_head = outputs_list.pop(0).squeeze(0)
+
+ # 3b: global crops cls tokens
+ student_global_cls_tokens_after_head = outputs_list.pop(0).squeeze(0)
+
+ # 3c: global crops patch tokens
+ if do_ibot and not self.ibot_separate_head:
+ student_global_masked_patch_tokens_after_head = outputs_list.pop(0).squeeze(0)[:n_masked_patches]
+
+ if n_local_crops > 0:
+ dino_local_crops_loss = self.dino_loss(
+ student_output_list=student_local_cls_tokens_after_head.chunk(n_local_crops),
+ teacher_out_softmaxed_centered_list=teacher_dino_softmaxed_centered_list,
+ ) / (n_global_crops_loss_terms + n_local_crops_loss_terms)
+
+ # store for display
+ loss_dict["dino_local_crops_loss"] = dino_local_crops_loss
+
+ # accumulate loss
+ loss_accumulator += self.dino_loss_weight * dino_local_crops_loss
+
+ # process global crops
+ loss_scales = 2 # this is here since we process global crops together
+
+ if do_dino:
+ # compute loss
+ dino_global_crops_loss = (
+ self.dino_loss(
+ student_output_list=[student_global_cls_tokens_after_head],
+ teacher_out_softmaxed_centered_list=[
+ teacher_dino_softmaxed_centered_list.flatten(0, 1)
+ ], # these were chunked and stacked in reverse so A is matched to B
+ )
+ * loss_scales
+ / (n_global_crops_loss_terms + n_local_crops_loss_terms)
+ )
+
+ loss_dict["dino_global_crops_loss"] = dino_global_crops_loss
+
+ # accumulate loss
+ loss_accumulator += self.dino_loss_weight * dino_global_crops_loss
+
+ student_cls_tokens = student_global_cls_tokens
+
+ if self.do_koleo:
+ koleo_loss = self.cfg.dino.koleo_loss_weight * sum(
+ self.koleo_loss(p) for p in student_cls_tokens.chunk(2)
+ ) # we don't apply koleo loss between cls tokens of the same image
+ loss_accumulator += koleo_loss
+ loss_dict["koleo_loss"] = (
+ koleo_loss / loss_scales
+ ) # this is to display the same losses as before, but we can remove it eventually
+
+ if do_ibot:
+ # compute loss
+ ibot_patch_loss = (
+ self.ibot_patch_loss.forward_masked(
+ student_global_masked_patch_tokens_after_head,
+ masked_teacher_ibot_softmaxed_centered,
+ student_masks_flat=masks,
+ n_masked_patches=n_masked_patches,
+ masks_weight=masks_weight,
+ )
+ * loss_scales
+ * ibot_loss_scale
+ )
+
+ # store for display
+ loss_dict["ibot_loss"] = ibot_patch_loss / 2
+
+ # accumulate loss
+ loss_accumulator += self.ibot_loss_weight * ibot_patch_loss
+
+ self.backprop_loss(loss_accumulator)
+
+ self.fsdp_synchronize_streams()
+
+ return loss_dict
+
+ def fsdp_synchronize_streams(self):
+ if self.need_to_synchronize_fsdp_streams:
+ torch.cuda.synchronize()
+ self.student.dino_head._streams = (
+ self.teacher.dino_head._streams
+ ) = self.student.backbone._streams = self.teacher.backbone._streams
+ self.need_to_synchronize_fsdp_streams = False
+
+ def update_teacher(self, m):
+ student_param_list = []
+ teacher_param_list = []
+ with torch.no_grad():
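+ # EMA update across FSDP shards: teacher = m * teacher + (1 - m) * student, using fused foreach ops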
+ for k in self.student.keys():
+ for ms, mt in zip(get_fsdp_modules(self.student[k]), get_fsdp_modules(self.teacher[k])):
+ student_param_list += ms.params
+ teacher_param_list += mt.params
+ torch._foreach_mul_(teacher_param_list, m)
+ torch._foreach_add_(teacher_param_list, student_param_list, alpha=1 - m)
+
+ def train(self):
+ super().train()
+ self.teacher.eval()
+
+ def get_maybe_fused_params_for_submodel(self, m):
+ params_groups = get_params_groups_with_decay(
+ model=m,
+ lr_decay_rate=self.cfg.optim.layerwise_decay,
+ patch_embed_lr_mult=self.cfg.optim.patch_embed_lr_mult,
+ )
+ fused_params_groups = fuse_params_groups(params_groups)
+ logger.info("fusing param groups")
+
+ for g in fused_params_groups:
+ g["foreach"] = True
+ return fused_params_groups
+
+ def get_params_groups(self):
+ all_params_groups = []
+ for m in self.student.values():
+ all_params_groups += self.get_maybe_fused_params_for_submodel(m)
+ return all_params_groups
+
+ def prepare_for_distributed_training(self):
+ logger.info("DISTRIBUTED FSDP -- preparing model for distributed training")
+ if has_batchnorms(self.student):
+ raise NotImplementedError
+ # below will synchronize all student subnetworks across gpus:
+ for k, v in self.student.items():
+ self.teacher[k].load_state_dict(self.student[k].state_dict())
+ student_model_cfg = self.cfg.compute_precision.student[k]
+ self.student[k] = get_fsdp_wrapper(student_model_cfg, modules_to_wrap={BlockChunk})(self.student[k])
+ teacher_model_cfg = self.cfg.compute_precision.teacher[k]
+ self.teacher[k] = get_fsdp_wrapper(teacher_model_cfg, modules_to_wrap={BlockChunk})(self.teacher[k])
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..10bda1f489080e0e5f444b5d76883c400b8da04d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py
@@ -0,0 +1,319 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import math
+import os
+from functools import partial
+
+from fvcore.common.checkpoint import PeriodicCheckpointer
+import torch
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data import SamplerType, make_data_loader, make_dataset
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.data import collate_data_and_cast, DataAugmentationDINO, MaskingGenerator
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.fsdp import FSDPCheckpointer
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import MetricLogger
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.config import setup
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils.utils import CosineScheduler
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.train.ssl_meta_arch import SSLMetaArch
+
+
+torch.backends.cuda.matmul.allow_tf32 = True # PyTorch 1.12 sets this to False by default
+logger = logging.getLogger("dinov2")
+
+
+def get_args_parser(add_help: bool = True):
+ parser = argparse.ArgumentParser("DINOv2 training", add_help=add_help)
+ parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
+ parser.add_argument(
+ "--no-resume",
+ action="store_true",
+ help="Whether to not attempt to resume from the checkpoint directory. ",
+ )
+ parser.add_argument("--eval-only", action="store_true", help="perform evaluation only")
+ parser.add_argument("--eval", type=str, default="", help="Eval type to perform")
+ parser.add_argument(
+ "opts",
+ help="""
+Modify config options at the end of the command. For Yacs configs, use
+space-separated "PATH.KEY VALUE" pairs.
+For python-based LazyConfig, use "path.key=value".
+ """.strip(),
+ default=None,
+ nargs=argparse.REMAINDER,
+ )
+ parser.add_argument(
+ "--output-dir",
+ "--output_dir",
+ default="",
+ type=str,
+ help="Output directory to save logs and checkpoints",
+ )
+
+ return parser
+
+
+def build_optimizer(cfg, params_groups):
+ return torch.optim.AdamW(params_groups, betas=(cfg.optim.adamw_beta1, cfg.optim.adamw_beta2))
+
+
+def build_schedulers(cfg):
+ OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH
+ lr = dict(
+ base_value=cfg.optim["lr"],
+ final_value=cfg.optim["min_lr"],
+ total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH,
+ warmup_iters=cfg.optim["warmup_epochs"] * OFFICIAL_EPOCH_LENGTH,
+ start_warmup_value=0,
+ )
+ wd = dict(
+ base_value=cfg.optim["weight_decay"],
+ final_value=cfg.optim["weight_decay_end"],
+ total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH,
+ )
+ momentum = dict(
+ base_value=cfg.teacher["momentum_teacher"],
+ final_value=cfg.teacher["final_momentum_teacher"],
+ total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH,
+ )
+ teacher_temp = dict(
+ base_value=cfg.teacher["teacher_temp"],
+ final_value=cfg.teacher["teacher_temp"],
+ total_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH,
+ warmup_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH,
+ start_warmup_value=cfg.teacher["warmup_teacher_temp"],
+ )
+
+ lr_schedule = CosineScheduler(**lr)
+ wd_schedule = CosineScheduler(**wd)
+ momentum_schedule = CosineScheduler(**momentum)
+ teacher_temp_schedule = CosineScheduler(**teacher_temp)
+ last_layer_lr_schedule = CosineScheduler(**lr)
+
+ last_layer_lr_schedule.schedule[
+ : cfg.optim["freeze_last_layer_epochs"] * OFFICIAL_EPOCH_LENGTH
+ ] = 0 # mimicking the original schedules
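+ # the last layer's learning rate stays frozen at 0 for the first freeze_last_layer_epochs epochs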
+
+ logger.info("Schedulers ready.")
+
+ return (
+ lr_schedule,
+ wd_schedule,
+ momentum_schedule,
+ teacher_temp_schedule,
+ last_layer_lr_schedule,
+ )
+
+
+def apply_optim_scheduler(optimizer, lr, wd, last_layer_lr):
+ for param_group in optimizer.param_groups:
+ is_last_layer = param_group["is_last_layer"]
+ lr_multiplier = param_group["lr_multiplier"]
+ wd_multiplier = param_group["wd_multiplier"]
+ param_group["weight_decay"] = wd * wd_multiplier
+ param_group["lr"] = (last_layer_lr if is_last_layer else lr) * lr_multiplier
+
+
+def do_test(cfg, model, iteration):
+ new_state_dict = model.teacher.state_dict()
+
+ if distributed.is_main_process():
+ iterstring = str(iteration)
+ eval_dir = os.path.join(cfg.train.output_dir, "eval", iterstring)
+ os.makedirs(eval_dir, exist_ok=True)
+ # save teacher checkpoint
+ teacher_ckp_path = os.path.join(eval_dir, "teacher_checkpoint.pth")
+ torch.save({"teacher": new_state_dict}, teacher_ckp_path)
+
+
+def do_train(cfg, model, resume=False):
+ model.train()
+ inputs_dtype = torch.half
+ fp16_scaler = model.fp16_scaler # for mixed precision training
+
+ # setup optimizer
+
+ optimizer = build_optimizer(cfg, model.get_params_groups())
+ (
+ lr_schedule,
+ wd_schedule,
+ momentum_schedule,
+ teacher_temp_schedule,
+ last_layer_lr_schedule,
+ ) = build_schedulers(cfg)
+
+ # checkpointer
+ checkpointer = FSDPCheckpointer(model, cfg.train.output_dir, optimizer=optimizer, save_to_disk=True)
+
+ start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
+
+ OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH
+ max_iter = cfg.optim.epochs * OFFICIAL_EPOCH_LENGTH
+
+ periodic_checkpointer = PeriodicCheckpointer(
+ checkpointer,
+ period=3 * OFFICIAL_EPOCH_LENGTH,
+ max_iter=max_iter,
+ max_to_keep=3,
+ )
+
+ # setup data preprocessing
+
+ img_size = cfg.crops.global_crops_size
+ patch_size = cfg.student.patch_size
+ n_tokens = (img_size // patch_size) ** 2
+ mask_generator = MaskingGenerator(
+ input_size=(img_size // patch_size, img_size // patch_size),
+ max_num_patches=0.5 * img_size // patch_size * img_size // patch_size,
+ )
+
+ data_transform = DataAugmentationDINO(
+ cfg.crops.global_crops_scale,
+ cfg.crops.local_crops_scale,
+ cfg.crops.local_crops_number,
+ global_crops_size=cfg.crops.global_crops_size,
+ local_crops_size=cfg.crops.local_crops_size,
+ )
+
+ collate_fn = partial(
+ collate_data_and_cast,
+ mask_ratio_tuple=cfg.ibot.mask_ratio_min_max,
+ mask_probability=cfg.ibot.mask_sample_probability,
+ n_tokens=n_tokens,
+ mask_generator=mask_generator,
+ dtype=inputs_dtype,
+ )
+
+ # setup data loader
+
+ dataset = make_dataset(
+ dataset_str=cfg.train.dataset_path,
+ transform=data_transform,
+ target_transform=lambda _: (),
+ )
+ # sampler_type = SamplerType.INFINITE
+ sampler_type = SamplerType.SHARDED_INFINITE
+ data_loader = make_data_loader(
+ dataset=dataset,
+ batch_size=cfg.train.batch_size_per_gpu,
+ num_workers=cfg.train.num_workers,
+ shuffle=True,
+ seed=start_iter, # TODO: Fix this -- cfg.train.seed
+ sampler_type=sampler_type,
+ sampler_advance=0, # TODO(qas): fix this -- start_iter * cfg.train.batch_size_per_gpu,
+ drop_last=True,
+ collate_fn=collate_fn,
+ )
+
+ # training loop
+
+ iteration = start_iter
+
+ logger.info("Starting training from iteration {}".format(start_iter))
+ metrics_file = os.path.join(cfg.train.output_dir, "training_metrics.json")
+ metric_logger = MetricLogger(delimiter=" ", output_file=metrics_file)
+ header = "Training"
+
+ for data in metric_logger.log_every(
+ data_loader,
+ 10,
+ header,
+ max_iter,
+ start_iter,
+ ):
+ current_batch_size = data["collated_global_crops"].shape[0] / 2
+ if iteration > max_iter:
+ return
+
+ # apply schedules
+
+ lr = lr_schedule[iteration]
+ wd = wd_schedule[iteration]
+ mom = momentum_schedule[iteration]
+ teacher_temp = teacher_temp_schedule[iteration]
+ last_layer_lr = last_layer_lr_schedule[iteration]
+ apply_optim_scheduler(optimizer, lr, wd, last_layer_lr)
+
+ # compute losses
+
+ optimizer.zero_grad(set_to_none=True)
+ loss_dict = model.forward_backward(data, teacher_temp=teacher_temp)
+
+ # clip gradients
+
+ if fp16_scaler is not None:
+ if cfg.optim.clip_grad:
+ fp16_scaler.unscale_(optimizer)
+ for v in model.student.values():
+ v.clip_grad_norm_(cfg.optim.clip_grad)
+ fp16_scaler.step(optimizer)
+ fp16_scaler.update()
+ else:
+ if cfg.optim.clip_grad:
+ for v in model.student.values():
+ v.clip_grad_norm_(cfg.optim.clip_grad)
+ optimizer.step()
+
+ # perform teacher EMA update
+
+ model.update_teacher(mom)
+
+ # logging
+
+ if distributed.get_global_size() > 1:
+ for v in loss_dict.values():
+ torch.distributed.all_reduce(v)
+ loss_dict_reduced = {k: v.item() / distributed.get_global_size() for k, v in loss_dict.items()}
+
+ if math.isnan(sum(loss_dict_reduced.values())):
+ logger.info("NaN detected")
+ raise AssertionError
+ losses_reduced = sum(loss for loss in loss_dict_reduced.values())
+
+ metric_logger.update(lr=lr)
+ metric_logger.update(wd=wd)
+ metric_logger.update(mom=mom)
+ metric_logger.update(last_layer_lr=last_layer_lr)
+ metric_logger.update(current_batch_size=current_batch_size)
+ metric_logger.update(total_loss=losses_reduced, **loss_dict_reduced)
+
+ # checkpointing and testing
+
+ if cfg.evaluation.eval_period_iterations > 0 and (iteration + 1) % cfg.evaluation.eval_period_iterations == 0:
+ do_test(cfg, model, f"training_{iteration}")
+ torch.cuda.synchronize()
+ periodic_checkpointer.step(iteration)
+
+ iteration = iteration + 1
+ metric_logger.synchronize_between_processes()
+ return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+def main(args):
+ cfg = setup(args)
+
+ model = SSLMetaArch(cfg).to(torch.device("cuda"))
+ model.prepare_for_distributed_training()
+
+ logger.info("Model:\n{}".format(model))
+ if args.eval_only:
+ iteration = (
+ FSDPCheckpointer(model, save_dir=cfg.train.output_dir)
+ .resume_or_load(cfg.MODEL.WEIGHTS, resume=not args.no_resume)
+ .get("iteration", -1)
+ + 1
+ )
+ return do_test(cfg, model, f"manual_{iteration}")
+
+ do_train(cfg, model, resume=not args.no_resume)
+
+
+if __name__ == "__main__":
+ args = get_args_parser(add_help=True).parse_args()
+ main(args)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4196294309799347172dba54a17360698071ca8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a78a44c5c8b742944f073b331b6480647643e0f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from enum import Enum
+import os
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+class ClusterType(Enum):
+ AWS = "aws"
+ FAIR = "fair"
+ RSC = "rsc"
+
+
+def _guess_cluster_type() -> ClusterType:
+ uname = os.uname()
+ if uname.sysname == "Linux":
+ if uname.release.endswith("-aws"):
+ # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws"
+ return ClusterType.AWS
+ elif uname.nodename.startswith("rsc"):
+ # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc"
+ return ClusterType.RSC
+
+ return ClusterType.FAIR
+
+
+def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
+ if cluster_type is None:
+ return _guess_cluster_type()
+
+ return cluster_type
+
+
+def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
+ cluster_type = get_cluster_type(cluster_type)
+ if cluster_type is None:
+ return None
+
+ CHECKPOINT_DIRNAMES = {
+ ClusterType.AWS: "checkpoints",
+ ClusterType.FAIR: "checkpoint",
+ ClusterType.RSC: "checkpoint/dino",
+ }
+ return Path("/") / CHECKPOINT_DIRNAMES[cluster_type]
+
+
+def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
+ checkpoint_path = get_checkpoint_path(cluster_type)
+ if checkpoint_path is None:
+ return None
+
+ username = os.environ.get("USER")
+ assert username is not None
+ return checkpoint_path / username
+
+
+def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
+ cluster_type = get_cluster_type(cluster_type)
+ if cluster_type is None:
+ return None
+
+ SLURM_PARTITIONS = {
+ ClusterType.AWS: "learnlab",
+ ClusterType.FAIR: "learnlab",
+ ClusterType.RSC: "learn",
+ }
+ return SLURM_PARTITIONS[cluster_type]
+
+
+def get_slurm_executor_parameters(
+ nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
+) -> Dict[str, Any]:
+ # create default parameters
+ params = {
+ "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
+ "gpus_per_node": num_gpus_per_node,
+ "tasks_per_node": num_gpus_per_node, # one task per GPU
+ "cpus_per_task": 10,
+ "nodes": nodes,
+ "slurm_partition": get_slurm_partition(cluster_type),
+ }
+ # apply cluster-specific adjustments
+ cluster_type = get_cluster_type(cluster_type)
+ if cluster_type == ClusterType.AWS:
+ params["cpus_per_task"] = 12
+ del params["mem_gb"]
+ elif cluster_type == ClusterType.RSC:
+ params["cpus_per_task"] = 12
+ # set additional parameters / apply overrides
+ params.update(kwargs)
+ return params
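+
+
+# Illustrative example (requested values are assumed, not taken from a real job): on an AWS
+# cluster, get_slurm_executor_parameters(nodes=2, num_gpus_per_node=8) returns
+#   {"gpus_per_node": 8, "tasks_per_node": 8, "cpus_per_task": 12, "nodes": 2,
+#    "slurm_partition": "learnlab"}
+# since the AWS branch above bumps cpus_per_task to 12 and drops the "mem_gb" request.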
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..35cc975fc331460c6a756d38884ab2d2ae98e321
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py
@@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import logging
+import os
+
+from omegaconf import OmegaConf
+
+import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.distributed as distributed
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.logging import setup_logging
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.utils import utils
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.configs import dinov2_default_config
+
+
+logger = logging.getLogger("dinov2")
+
+
+def apply_scaling_rules_to_cfg(cfg): # to fix
+ if cfg.optim.scaling_rule == "sqrt_wrt_1024":
+ base_lr = cfg.optim.base_lr
+ cfg.optim.lr = base_lr
+ cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0)
+ logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
+ else:
+ raise NotImplementedError
+ return cfg
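+
+# Worked example of the "sqrt_wrt_1024" rule (numbers assumed, not from any shipped config):
+# with base_lr=0.004, batch_size_per_gpu=32 and 8 GPUs, the global batch size is 256 and
+# cfg.optim.lr = 0.004 * sqrt(256 / 1024) = 0.004 * 0.5 = 0.002.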
+
+
+def write_config(cfg, output_dir, name="config.yaml"):
+ logger.info(OmegaConf.to_yaml(cfg))
+ saved_cfg_path = os.path.join(output_dir, name)
+ with open(saved_cfg_path, "w") as f:
+ OmegaConf.save(config=cfg, f=f)
+ return saved_cfg_path
+
+
+def get_cfg_from_args(args):
+ args.output_dir = os.path.abspath(args.output_dir)
+ args.opts += [f"train.output_dir={args.output_dir}"]
+ default_cfg = OmegaConf.create(dinov2_default_config)
+ cfg = OmegaConf.load(args.config_file)
+ cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts))
+ return cfg
+
+
+def default_setup(args):
+ distributed.enable(overwrite=True)
+ seed = getattr(args, "seed", 0)
+ rank = distributed.get_global_rank()
+
+ global logger
+ setup_logging(output=args.output_dir, level=logging.INFO)
+ logger = logging.getLogger("dinov2")
+
+ utils.fix_random_seeds(seed + rank)
+ logger.info("git:\n {}\n".format(utils.get_sha()))
+ logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
+
+
+def setup(args):
+ """
+ Create configs and perform basic setups.
+ """
+ cfg = get_cfg_from_args(args)
+ os.makedirs(args.output_dir, exist_ok=True)
+ default_setup(args)
+ apply_scaling_rules_to_cfg(cfg)
+ write_config(cfg, args.output_dir)
+ return cfg
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f25f8c0534f58e4b9b0394825aa42267f8678f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Dict, Union
+
+import numpy as np
+import torch
+
+
+TypeSpec = Union[str, np.dtype, torch.dtype]
+
+
+_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
+ np.dtype("bool"): torch.bool,
+ np.dtype("uint8"): torch.uint8,
+ np.dtype("int8"): torch.int8,
+ np.dtype("int16"): torch.int16,
+ np.dtype("int32"): torch.int32,
+ np.dtype("int64"): torch.int64,
+ np.dtype("float16"): torch.float16,
+ np.dtype("float32"): torch.float32,
+ np.dtype("float64"): torch.float64,
+ np.dtype("complex64"): torch.complex64,
+ np.dtype("complex128"): torch.complex128,
+}
+
+
+def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
+ if isinstance(dtype, torch.dtype):
+ return dtype
+ if isinstance(dtype, str):
+ dtype = np.dtype(dtype)
+    assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}"
+ return _NUMPY_TO_TORCH_DTYPE[dtype]
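+
+
+# Illustrative usage (assumes only the numpy and torch imports above):
+#   as_torch_dtype("float32")          -> torch.float32
+#   as_torch_dtype(np.dtype("int64"))  -> torch.int64
+#   as_torch_dtype(torch.bfloat16)     -> torch.bfloat16  (torch dtypes pass through unchanged)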
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py
new file mode 100644
index 0000000000000000000000000000000000000000..61b4478c378755e427d20532f83f19fd18d8c309
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py
@@ -0,0 +1,94 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import defaultdict
+import logging
+
+
+logger = logging.getLogger("dinov2")
+
+
+def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False):
+ """
+ Calculate lr decay rate for different ViT blocks.
+ Args:
+ name (string): parameter name.
+ lr_decay_rate (float): base lr decay rate.
+ num_layers (int): number of ViT blocks.
+ Returns:
+ lr decay rate for the given parameter.
+ """
+ layer_id = num_layers + 1
+ if name.startswith("backbone") or force_is_backbone:
+ if ".pos_embed" in name or ".patch_embed" in name or ".mask_token" in name or ".cls_token" in name:
+ layer_id = 0
+ elif force_is_backbone and (
+ "pos_embed" in name or "patch_embed" in name or "mask_token" in name or "cls_token" in name
+ ):
+ layer_id = 0
+ elif ".blocks." in name and ".residual." not in name:
+ layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
+ elif chunked_blocks and "blocks." in name and "residual." not in name:
+ layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1
+ elif "blocks." in name and "residual." not in name:
+ layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1
+
+ return lr_decay_rate ** (num_layers + 1 - layer_id)
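+
+# Worked example (lr_decay_rate=0.9 and num_layers=12 are assumed values):
+#   "backbone.patch_embed.proj.weight"  -> layer_id 0  -> multiplier 0.9 ** 13 ~= 0.254
+#   "backbone.blocks.11.mlp.fc1.weight" -> layer_id 12 -> multiplier 0.9 ** 1  =  0.9
+#   parameters outside the blocks (e.g. a DINO head) keep layer_id 13 -> multiplier 1.0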
+
+
+def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0):
+ chunked_blocks = False
+ if hasattr(model, "n_blocks"):
+ logger.info("chunked fsdp")
+ n_blocks = model.n_blocks
+ chunked_blocks = model.chunked_blocks
+ elif hasattr(model, "blocks"):
+ logger.info("first code branch")
+ n_blocks = len(model.blocks)
+ elif hasattr(model, "backbone"):
+ logger.info("second code branch")
+ n_blocks = len(model.backbone.blocks)
+ else:
+ logger.info("else code branch")
+ n_blocks = 0
+ all_param_groups = []
+
+ for name, param in model.named_parameters():
+ name = name.replace("_fsdp_wrapped_module.", "")
+ if not param.requires_grad:
+ continue
+ decay_rate = get_vit_lr_decay_rate(
+ name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks
+ )
+ d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name}
+
+ if "last_layer" in name:
+ d.update({"is_last_layer": True})
+
+ if name.endswith(".bias") or "norm" in name or "gamma" in name:
+ d.update({"wd_multiplier": 0.0})
+
+ if "patch_embed" in name:
+ d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult})
+
+ all_param_groups.append(d)
+ logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""")
+
+ return all_param_groups
+
+
+def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")):
+ fused_params_groups = defaultdict(lambda: {"params": []})
+ for d in all_params_groups:
+ identifier = ""
+ for k in keys:
+ identifier += k + str(d[k]) + "_"
+
+ for k in keys:
+ fused_params_groups[identifier][k] = d[k]
+ fused_params_groups[identifier]["params"].append(d["params"])
+
+ return fused_params_groups.values()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe935f69d19f2b6500c5edf7d3e0c67f00e7bb99
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import random
+import subprocess
+from urllib.parse import urlparse
+
+import numpy as np
+import torch
+from torch import nn
+
+
+logger = logging.getLogger("dinov2")
+
+
+def load_pretrained_weights(model, pretrained_weights, checkpoint_key):
+    if urlparse(pretrained_weights).scheme:  # If it looks like a URL
+ state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
+ else:
+ state_dict = torch.load(pretrained_weights, map_location="cpu")
+ if checkpoint_key is not None and checkpoint_key in state_dict:
+ logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
+ state_dict = state_dict[checkpoint_key]
+ # remove `module.` prefix
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
+ # remove `backbone.` prefix induced by multicrop wrapper
+ state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
+ msg = model.load_state_dict(state_dict, strict=False)
+ logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
+
+
+def fix_random_seeds(seed=31):
+ """
+ Fix random seeds.
+ """
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ np.random.seed(seed)
+ random.seed(seed)
+
+
+def get_sha():
+ cwd = os.path.dirname(os.path.abspath(__file__))
+
+ def _run(command):
+ return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()
+
+ sha = "N/A"
+ diff = "clean"
+ branch = "N/A"
+ try:
+ sha = _run(["git", "rev-parse", "HEAD"])
+ subprocess.check_output(["git", "diff"], cwd=cwd)
+ diff = _run(["git", "diff-index", "HEAD"])
+ diff = "has uncommitted changes" if diff else "clean"
+ branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
+ except Exception:
+ pass
+ message = f"sha: {sha}, status: {diff}, branch: {branch}"
+ return message
+
+
+class CosineScheduler(object):
+ def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0):
+ super().__init__()
+ self.final_value = final_value
+ self.total_iters = total_iters
+
+ freeze_schedule = np.zeros((freeze_iters))
+
+ warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
+
+ iters = np.arange(total_iters - warmup_iters - freeze_iters)
+ schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters)))
+ self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule))
+
+ assert len(self.schedule) == self.total_iters
+
+ def __getitem__(self, it):
+ if it >= self.total_iters:
+ return self.final_value
+ else:
+ return self.schedule[it]
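+
+# Illustrative schedule (assumed values): CosineScheduler(base_value=1.0, final_value=0.0,
+# total_iters=100, warmup_iters=10) ramps linearly from 0.0 at iteration 0 to 1.0 at
+# iteration 9, follows a half cosine afterwards (e.g. 0.5 at iteration 55), and clamps
+# to final_value for any iteration >= total_iters.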
+
+
+def has_batchnorms(model):
+ bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
+ for name, module in model.named_modules():
+ if isinstance(module, bn_types):
+ return True
+ return False
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/hubconf.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/hubconf.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1fc8e38f389d284df10b12141cf6f6bc37361b8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/hubconf.py
@@ -0,0 +1,162 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+
+from enum import Enum
+from typing import Union
+
+import torch
+
+_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
+
+
+def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
+ compact_arch_name = arch_name.replace("_", "")[:4]
+ registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
+ return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
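+
+# For reference, a couple of resulting names (illustrative inputs):
+#   _make_dinov2_model_name("vit_small", 14)     -> "dinov2_vits14"
+#   _make_dinov2_model_name("vit_giant2", 14, 4) -> "dinov2_vitg14_reg4"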
+
+
+class Weights(Enum):
+ LVD142M = "LVD142M"
+
+
+def _make_dinov2_model(
+ *,
+ arch_name: str = "vit_large",
+ img_size: int = 518,
+ patch_size: int = 14,
+ init_values: float = 1.0,
+ ffn_layer: str = "mlp",
+ block_chunks: int = 0,
+ num_register_tokens: int = 0,
+ interpolate_antialias: bool = False,
+ interpolate_offset: float = 0.1,
+ pretrained: bool = True,
+ weights: Union[Weights, str] = Weights.LVD142M,
+ **kwargs,
+):
+ import custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.vision_transformer as vits
+
+ if isinstance(weights, str):
+ try:
+ weights = Weights[weights]
+ except KeyError:
+ raise AssertionError(f"Unsupported weights: {weights}")
+
+ model_base_name = _make_dinov2_model_name(arch_name, patch_size)
+ vit_kwargs = dict(
+ img_size=img_size,
+ patch_size=patch_size,
+ init_values=init_values,
+ ffn_layer=ffn_layer,
+ block_chunks=block_chunks,
+ num_register_tokens=num_register_tokens,
+ interpolate_antialias=interpolate_antialias,
+ interpolate_offset=interpolate_offset,
+ )
+ vit_kwargs.update(**kwargs)
+ model = vits.__dict__[arch_name](**vit_kwargs)
+
+ if pretrained:
+ model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
+ url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
+ state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
+ model.load_state_dict(state_dict, strict=True)
+
+ return model
+
+
+def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)
+
+
+def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)
+
+
+def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)
+
+
+def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(
+ arch_name="vit_giant2",
+ ffn_layer="swiglufused",
+ weights=weights,
+ pretrained=pretrained,
+ **kwargs,
+ )
+
+
+def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(
+ arch_name="vit_small",
+ pretrained=pretrained,
+ weights=weights,
+ num_register_tokens=4,
+ interpolate_antialias=True,
+ interpolate_offset=0.0,
+ **kwargs,
+ )
+
+
+def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(
+ arch_name="vit_base",
+ pretrained=pretrained,
+ weights=weights,
+ num_register_tokens=4,
+ interpolate_antialias=True,
+ interpolate_offset=0.0,
+ **kwargs,
+ )
+
+
+def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(
+ arch_name="vit_large",
+ pretrained=pretrained,
+ weights=weights,
+ num_register_tokens=4,
+ interpolate_antialias=True,
+ interpolate_offset=0.0,
+ **kwargs,
+ )
+
+
+def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+ """
+ DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+ """
+ return _make_dinov2_model(
+ arch_name="vit_giant2",
+ ffn_layer="swiglufused",
+ weights=weights,
+ pretrained=pretrained,
+ num_register_tokens=4,
+ interpolate_antialias=True,
+ interpolate_offset=0.0,
+ **kwargs,
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/pyproject.toml b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..b724ba5be524e4e43349b11ca37f0dc556aa005d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/pyproject.toml
@@ -0,0 +1,29 @@
+[tool.black]
+line-length = 120
+
+[tool.pylint.master]
+persistent = false
+score = false
+
+[tool.pylint.messages_control]
+disable = "all"
+enable = [
+ "miscellaneous",
+ "similarities",
+]
+
+[tool.pylint.similarities]
+ignore-comments = true
+ignore-docstrings = true
+ignore-imports = true
+min-similarity-lines = 8
+
+[tool.pylint.reports]
+reports = false
+
+[tool.pylint.miscellaneous]
+notes = [
+ "FIXME",
+ "XXX",
+ "TODO",
+]
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/requirements-dev.txt b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/requirements-dev.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e8b34df6cedf2fea0d1103811471615c7b540f3e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/requirements-dev.txt
@@ -0,0 +1,3 @@
+black==22.6.0
+flake8==5.0.4
+pylint==2.15.0
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/requirements.txt b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f77a9677e9a102f6a8508276a154dfade5cd61be
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/requirements.txt
@@ -0,0 +1,11 @@
+--extra-index-url https://download.pytorch.org/whl/cu117
+torch==2.0.0
+torchvision==0.15.0
+omegaconf
+torchmetrics==0.10.3
+fvcore
+iopath
+xformers==0.0.18
+submitit
+--extra-index-url https://pypi.nvidia.com
+cuml-cu11
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/scripts/lint.sh b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/scripts/lint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1aa7e9e770b401d5156e72d8e09ed8f357411132
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/scripts/lint.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+if [ -n "$1" ]; then
+ echo "linting \"$1\""
+fi
+
+echo "running black"
+if [ -n "$1" ]; then
+ black "$1"
+else
+ black dinov2
+fi
+
+echo "running flake8"
+if [ -n "$1" ]; then
+ flake8 "$1"
+else
+ flake8
+fi
+
+echo "running pylint"
+if [ -n "$1" ]; then
+ pylint "$1"
+else
+ pylint dinov2
+fi
+
+exit 0
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/setup.cfg b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..228a970c2f8a163410160c2d305d31c36af7ab2b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/setup.cfg
@@ -0,0 +1,7 @@
+[flake8]
+max-line-length = 120
+ignore = E203,E501,W503
+per-file-ignores =
+ __init__.py:F401
+exclude =
+ venv
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/setup.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..d55e9094645915a566d07b8001dff25187ea8ede
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/setup.py
@@ -0,0 +1,87 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from pathlib import Path
+import re
+from typing import List, Tuple
+
+from setuptools import setup, find_packages
+
+
+NAME = "dinov2"
+DESCRIPTION = "PyTorch code and models for the DINOv2 self-supervised learning method."
+
+URL = "https://github.com/facebookresearch/dinov2"
+AUTHOR = "FAIR"
+REQUIRES_PYTHON = ">=3.9.0"
+HERE = Path(__file__).parent
+
+
+try:
+ with open(HERE / "README.md", encoding="utf-8") as f:
+ long_description = "\n" + f.read()
+except FileNotFoundError:
+ long_description = DESCRIPTION
+
+
+def get_requirements(path: str = HERE / "requirements.txt") -> Tuple[List[str], List[str]]:
+ requirements = []
+ extra_indices = []
+ with open(path) as f:
+ for line in f.readlines():
+ line = line.rstrip("\r\n")
+ if line.startswith("--extra-index-url "):
+ extra_indices.append(line[18:])
+ continue
+ requirements.append(line)
+ return requirements, extra_indices
+
+
+def get_package_version() -> str:
+ with open(HERE / "dinov2/__init__.py") as f:
+ result = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M)
+ if result:
+ return result.group(1)
+ raise RuntimeError("Can't get package version")
+
+
+requirements, extra_indices = get_requirements()
+version = get_package_version()
+dev_requirements, _ = get_requirements(HERE / "requirements-dev.txt")
+
+
+setup(
+ name=NAME,
+ version=version,
+ description=DESCRIPTION,
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ author=AUTHOR,
+ python_requires=REQUIRES_PYTHON,
+ url=URL,
+ packages=find_packages(),
+ package_data={
+ "": ["*.yaml"],
+ },
+ install_requires=requirements,
+ dependency_links=extra_indices,
+ extras_require={
+ "dev": dev_requirements,
+ },
+    include_package_data=True,
+ license="CC-BY-NC",
+ license_files=("LICENSE",),
+ classifiers=[
+ # Trove classifiers: https://github.com/pypa/trove-classifiers/blob/main/src/trove_classifiers/__init__.py
+ "Development Status :: 3 - Alpha",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Science/Research",
+ "License :: Other/Proprietary License",
+ "Programming Language :: Python :: 3.9",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+ ],
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7afea3273713518e891d1e6b8e86d58b4700fddc
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/utils.py
@@ -0,0 +1,39 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+
+import itertools
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
+
+
+def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
+ compact_arch_name = arch_name.replace("_", "")[:4]
+ registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
+ return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
+
+
+class CenterPadding(nn.Module):
+ def __init__(self, multiple):
+ super().__init__()
+ self.multiple = multiple
+
+ def _get_pad(self, size):
+ new_size = math.ceil(size / self.multiple) * self.multiple
+ pad_size = new_size - size
+ pad_size_left = pad_size // 2
+ pad_size_right = pad_size - pad_size_left
+ return pad_size_left, pad_size_right
+
+ @torch.inference_mode()
+ def forward(self, x):
+ pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
+ output = F.pad(x, pads)
+ return output
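+
+
+# Illustrative example (input shape assumed): with CenterPadding(multiple=14), a (1, 3, 500, 500)
+# tensor is padded by 2 pixels on each side of its last two dimensions to (1, 3, 504, 504),
+# since ceil(500 / 14) * 14 = 504.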
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/vision_transformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/vision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..48df632b9231ad28c57b37f1ac85ad54a437c3a7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything/torchhub/facebookresearch_dinov2_main/vision_transformer.py
@@ -0,0 +1,395 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn.init import trunc_normal_
+
+from custom_controlnet_aux.depth_anything.torchhub.facebookresearch_dinov2_main.dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+
+
+logger = logging.getLogger("dinov2")
+
+
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+ if not depth_first and include_root:
+ fn(module=module, name=name)
+ for child_name, child_module in module.named_children():
+ child_name = ".".join((name, child_name)) if name else child_name
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+ if depth_first and include_root:
+ fn(module=module, name=name)
+ return module
+
+
+class BlockChunk(nn.ModuleList):
+ def forward(self, x):
+ for b in self:
+ x = b(x)
+ return x
+
+
+class DinoVisionTransformer(nn.Module):
+ def __init__(
+ self,
+ img_size=224,
+ patch_size=16,
+ in_chans=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ ffn_bias=True,
+ proj_bias=True,
+ drop_path_rate=0.0,
+ drop_path_uniform=False,
+ init_values=None, # for layerscale: None or 0 => no layerscale
+ embed_layer=PatchEmbed,
+ act_layer=nn.GELU,
+ block_fn=Block,
+ ffn_layer="mlp",
+ block_chunks=1,
+ num_register_tokens=0,
+ interpolate_antialias=False,
+ interpolate_offset=0.1,
+ ):
+ """
+ Args:
+ img_size (int, tuple): input image size
+ patch_size (int, tuple): patch size
+ in_chans (int): number of input channels
+ embed_dim (int): embedding dimension
+ depth (int): depth of transformer
+ num_heads (int): number of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ proj_bias (bool): enable bias for proj in attn if True
+ ffn_bias (bool): enable bias for ffn if True
+ drop_path_rate (float): stochastic depth rate
+ drop_path_uniform (bool): apply uniform drop rate across blocks
+ weight_init (str): weight init scheme
+ init_values (float): layer-scale init values
+ embed_layer (nn.Module): patch embedding layer
+ act_layer (nn.Module): MLP activation layer
+ block_fn (nn.Module): transformer block class
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+            interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+ """
+ super().__init__()
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
+
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.num_tokens = 1
+ self.n_blocks = depth
+ self.num_heads = num_heads
+ self.patch_size = patch_size
+ self.num_register_tokens = num_register_tokens
+ self.interpolate_antialias = interpolate_antialias
+ self.interpolate_offset = interpolate_offset
+
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+ assert num_register_tokens >= 0
+ self.register_tokens = (
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
+ )
+
+ if drop_path_uniform is True:
+ dpr = [drop_path_rate] * depth
+ else:
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+
+ if ffn_layer == "mlp":
+ logger.info("using MLP layer as FFN")
+ ffn_layer = Mlp
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+ logger.info("using SwiGLU layer as FFN")
+ ffn_layer = SwiGLUFFNFused
+ elif ffn_layer == "identity":
+ logger.info("using Identity layer as FFN")
+
+ def f(*args, **kwargs):
+ return nn.Identity()
+
+ ffn_layer = f
+ else:
+ raise NotImplementedError
+
+ blocks_list = [
+ block_fn(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ ffn_bias=ffn_bias,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ act_layer=act_layer,
+ ffn_layer=ffn_layer,
+ init_values=init_values,
+ )
+ for i in range(depth)
+ ]
+ if block_chunks > 0:
+ self.chunked_blocks = True
+ chunked_blocks = []
+ chunksize = depth // block_chunks
+ for i in range(0, depth, chunksize):
+ # this is to keep the block index consistent if we chunk the block list
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+ else:
+ self.chunked_blocks = False
+ self.blocks = nn.ModuleList(blocks_list)
+
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Identity()
+
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+
+ self.init_weights()
+
+ def init_weights(self):
+ trunc_normal_(self.pos_embed, std=0.02)
+ nn.init.normal_(self.cls_token, std=1e-6)
+ if self.register_tokens is not None:
+ nn.init.normal_(self.register_tokens, std=1e-6)
+ named_apply(init_weights_vit_timm, self)
+
+ def interpolate_pos_encoding(self, x, w, h):
+ previous_dtype = x.dtype
+ npatch = x.shape[1] - 1
+ N = self.pos_embed.shape[1] - 1
+ if npatch == N and w == h:
+ return self.pos_embed
+ pos_embed = self.pos_embed.float()
+ class_pos_embed = pos_embed[:, 0]
+ patch_pos_embed = pos_embed[:, 1:]
+ dim = x.shape[-1]
+ w0 = w // self.patch_size
+ h0 = h // self.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+        # DINOv2 with registers changes interpolate_offset from 0.1 to 0.0
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
+ # w0, h0 = w0 + 0.1, h0 + 0.1
+
+ sqrt_N = math.sqrt(N)
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
+ scale_factor=(sx, sy),
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
+ mode="bicubic",
+ antialias=self.interpolate_antialias
+ )
+
+ assert int(w0) == patch_pos_embed.shape[-2]
+ assert int(h0) == patch_pos_embed.shape[-1]
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
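+
+    # Illustrative sizing (numbers assumed): for a 700x700 input with patch_size=14 and a model
+    # pretrained at 518x518 (a 37x37 grid of patch position embeddings), the grid above is
+    # bicubically resized to 50x50 before being concatenated back with the class token embedding.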
+
+ def prepare_tokens_with_masks(self, x, masks=None):
+ B, nc, w, h = x.shape
+ x = self.patch_embed(x)
+ if masks is not None:
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+ x = x + self.interpolate_pos_encoding(x, w, h)
+
+ if self.register_tokens is not None:
+ x = torch.cat(
+ (
+ x[:, :1],
+ self.register_tokens.expand(x.shape[0], -1, -1),
+ x[:, 1:],
+ ),
+ dim=1,
+ )
+
+ return x
+
+ def forward_features_list(self, x_list, masks_list):
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+ for blk in self.blocks:
+ x = blk(x)
+
+ all_x = x
+ output = []
+ for x, masks in zip(all_x, masks_list):
+ x_norm = self.norm(x)
+ output.append(
+ {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+ )
+ return output
+
+ def forward_features(self, x, masks=None):
+ if isinstance(x, list):
+ return self.forward_features_list(x, masks)
+
+ x = self.prepare_tokens_with_masks(x, masks)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x_norm = self.norm(x)
+ return {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ # If n is an int, take the n last blocks. If it's a list, take them
+ output, total_block_len = [], len(self.blocks)
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def _get_intermediate_layers_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
+ # If n is an int, take the n last blocks. If it's a list, take them
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for block_chunk in self.blocks:
+            for blk in block_chunk[i:]:  # skip the leading nn.Identity() placeholders
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ i += 1
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def get_intermediate_layers(
+ self,
+ x: torch.Tensor,
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
+ reshape: bool = False,
+ return_class_token: bool = False,
+ norm=True,
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+ if self.chunked_blocks:
+ outputs = self._get_intermediate_layers_chunked(x, n)
+ else:
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
+ if norm:
+ outputs = [self.norm(out) for out in outputs]
+ class_tokens = [out[:, 0] for out in outputs]
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
+ if reshape:
+ B, _, w, h = x.shape
+ outputs = [
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+ for out in outputs
+ ]
+ if return_class_token:
+ return tuple(zip(outputs, class_tokens))
+ return tuple(outputs)
+
+ def forward(self, *args, is_training=False, **kwargs):
+ ret = self.forward_features(*args, **kwargs)
+ if is_training:
+ return ret
+ else:
+ return self.head(ret["x_norm_clstoken"])
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+ """ViT weight initialization, original timm impl (for reproducibility)"""
+ if isinstance(module, nn.Linear):
+ trunc_normal_(module.weight, std=0.02)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
+
+
+def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=384,
+ depth=12,
+ num_heads=6,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+ """
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+ """
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1536,
+ depth=40,
+ num_heads=24,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..290329c0011f41e61913e73027397a6c7e6a63f3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/__init__.py
@@ -0,0 +1,56 @@
+import numpy as np
+import torch
+from einops import repeat
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, DEPTH_ANYTHING_V2_MODEL_NAME_DICT
+from custom_controlnet_aux.depth_anything_v2.dpt import DepthAnythingV2
+import cv2
+import torch.nn.functional as F
+
+
+# https://github.com/DepthAnything/Depth-Anything-V2/blob/main/app.py
+model_configs = {
+ 'depth_anything_v2_vits.pth': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+ 'depth_anything_v2_vitb.pth': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+ 'depth_anything_v2_vitl.pth': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+ 'depth_anything_v2_vitg.pth': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]},
+ 'depth_anything_v2_metric_vkitti_vitl.pth': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+ 'depth_anything_v2_metric_hypersim_vitl.pth': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+}
+
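+# Minimal usage sketch (illustrative; assumes an RGB uint8 numpy image and that the checkpoint
+# named by `filename` can be resolved via custom_hf_download):
+#   detector = DepthAnythingV2Detector.from_pretrained(filename="depth_anything_v2_vits.pth").to("cuda")
+#   depth_vis = detector(image_rgb, detect_resolution=512)  # HxWx3 uint8 depth visualization
+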
+class DepthAnythingV2Detector:
+ def __init__(self, model, filename):
+ self.model = model
+ self.device = "cpu"
+        self.filename = filename
+
+    @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=None, filename="depth_anything_v2_vits.pth"):
+ if pretrained_model_or_path is None:
+ pretrained_model_or_path = DEPTH_ANYTHING_V2_MODEL_NAME_DICT[filename]
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+ model = DepthAnythingV2(**model_configs[filename])
+ model.load_state_dict(torch.load(model_path, map_location="cpu"))
+ model = model.eval()
+ return cls(model, filename)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", max_depth=20.0, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+
+ depth = self.model.infer_image(cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR), input_size=518, max_depth=max_depth)
+ depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+ depth = depth.astype(np.uint8)
+ if 'metric' in self.filename:
+ depth = 255 - depth
+
+ detected_map = repeat(depth, "h w -> h w 3")
+ detected_map, remove_pad = resize_image_with_pad(detected_map, detect_resolution, upscale_method)
+ detected_map = remove_pad(detected_map)
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0f67b6cdd4125b624b38fc9f7fd96dc6436c4e2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2.py
@@ -0,0 +1,415 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn.init import trunc_normal_
+
+from custom_controlnet_aux.depth_anything_v2.dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+
+
+logger = logging.getLogger("dinov2")
+
+
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+ if not depth_first and include_root:
+ fn(module=module, name=name)
+ for child_name, child_module in module.named_children():
+ child_name = ".".join((name, child_name)) if name else child_name
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+ if depth_first and include_root:
+ fn(module=module, name=name)
+ return module
+
+
+class BlockChunk(nn.ModuleList):
+ def forward(self, x):
+ for b in self:
+ x = b(x)
+ return x
+
+
+class DinoVisionTransformer(nn.Module):
+ def __init__(
+ self,
+ img_size=224,
+ patch_size=16,
+ in_chans=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ ffn_bias=True,
+ proj_bias=True,
+ drop_path_rate=0.0,
+ drop_path_uniform=False,
+ init_values=None, # for layerscale: None or 0 => no layerscale
+ embed_layer=PatchEmbed,
+ act_layer=nn.GELU,
+ block_fn=Block,
+ ffn_layer="mlp",
+ block_chunks=1,
+ num_register_tokens=0,
+ interpolate_antialias=False,
+ interpolate_offset=0.1,
+ ):
+ """
+ Args:
+ img_size (int, tuple): input image size
+ patch_size (int, tuple): patch size
+ in_chans (int): number of input channels
+ embed_dim (int): embedding dimension
+ depth (int): depth of transformer
+ num_heads (int): number of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ proj_bias (bool): enable bias for proj in attn if True
+ ffn_bias (bool): enable bias for ffn if True
+ drop_path_rate (float): stochastic depth rate
+ drop_path_uniform (bool): apply uniform drop rate across blocks
+ weight_init (str): weight init scheme
+ init_values (float): layer-scale init values
+ embed_layer (nn.Module): patch embedding layer
+ act_layer (nn.Module): MLP activation layer
+ block_fn (nn.Module): transformer block class
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+            interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+ """
+ super().__init__()
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
+
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.num_tokens = 1
+ self.n_blocks = depth
+ self.num_heads = num_heads
+ self.patch_size = patch_size
+ self.num_register_tokens = num_register_tokens
+ self.interpolate_antialias = interpolate_antialias
+ self.interpolate_offset = interpolate_offset
+
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+ assert num_register_tokens >= 0
+ self.register_tokens = (
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
+ )
+
+ if drop_path_uniform is True:
+ dpr = [drop_path_rate] * depth
+ else:
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+
+ if ffn_layer == "mlp":
+ logger.info("using MLP layer as FFN")
+ ffn_layer = Mlp
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+ logger.info("using SwiGLU layer as FFN")
+ ffn_layer = SwiGLUFFNFused
+ elif ffn_layer == "identity":
+ logger.info("using Identity layer as FFN")
+
+ def f(*args, **kwargs):
+ return nn.Identity()
+
+ ffn_layer = f
+ else:
+ raise NotImplementedError
+
+ blocks_list = [
+ block_fn(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ ffn_bias=ffn_bias,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ act_layer=act_layer,
+ ffn_layer=ffn_layer,
+ init_values=init_values,
+ )
+ for i in range(depth)
+ ]
+ if block_chunks > 0:
+ self.chunked_blocks = True
+ chunked_blocks = []
+ chunksize = depth // block_chunks
+ for i in range(0, depth, chunksize):
+ # this is to keep the block index consistent if we chunk the block list
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+ else:
+ self.chunked_blocks = False
+ self.blocks = nn.ModuleList(blocks_list)
+
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Identity()
+
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+
+ self.init_weights()
+
+ def init_weights(self):
+ trunc_normal_(self.pos_embed, std=0.02)
+ nn.init.normal_(self.cls_token, std=1e-6)
+ if self.register_tokens is not None:
+ nn.init.normal_(self.register_tokens, std=1e-6)
+ named_apply(init_weights_vit_timm, self)
+
+ def interpolate_pos_encoding(self, x, w, h):
+ previous_dtype = x.dtype
+ npatch = x.shape[1] - 1
+ N = self.pos_embed.shape[1] - 1
+ if npatch == N and w == h:
+ return self.pos_embed
+ pos_embed = self.pos_embed.float()
+ class_pos_embed = pos_embed[:, 0]
+ patch_pos_embed = pos_embed[:, 1:]
+ dim = x.shape[-1]
+ w0 = w // self.patch_size
+ h0 = h // self.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+        # DINOv2 with registers changes interpolate_offset from 0.1 to 0.0
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
+ # w0, h0 = w0 + 0.1, h0 + 0.1
+
+ sqrt_N = math.sqrt(N)
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
+ scale_factor=(sx, sy),
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
+ mode="bicubic",
+ antialias=self.interpolate_antialias
+ )
+
+ assert int(w0) == patch_pos_embed.shape[-2]
+ assert int(h0) == patch_pos_embed.shape[-1]
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+
+ def prepare_tokens_with_masks(self, x, masks=None):
+ B, nc, w, h = x.shape
+ x = self.patch_embed(x)
+ if masks is not None:
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+ x = x + self.interpolate_pos_encoding(x, w, h)
+
+ if self.register_tokens is not None:
+ x = torch.cat(
+ (
+ x[:, :1],
+ self.register_tokens.expand(x.shape[0], -1, -1),
+ x[:, 1:],
+ ),
+ dim=1,
+ )
+
+ return x
+
+ def forward_features_list(self, x_list, masks_list):
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+ for blk in self.blocks:
+ x = blk(x)
+
+ all_x = x
+ output = []
+ for x, masks in zip(all_x, masks_list):
+ x_norm = self.norm(x)
+ output.append(
+ {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+ )
+ return output
+
+ def forward_features(self, x, masks=None):
+ if isinstance(x, list):
+ return self.forward_features_list(x, masks)
+
+ x = self.prepare_tokens_with_masks(x, masks)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x_norm = self.norm(x)
+ return {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+        # If n is an int, take the n last blocks; if it is a list, take the blocks at those indices
+ output, total_block_len = [], len(self.blocks)
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def _get_intermediate_layers_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
+        # If n is an int, take the n last blocks; if it is a list, take the blocks at those indices
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for block_chunk in self.blocks:
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ i += 1
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def get_intermediate_layers(
+ self,
+ x: torch.Tensor,
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
+ reshape: bool = False,
+ return_class_token: bool = False,
+ norm=True
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+ if self.chunked_blocks:
+ outputs = self._get_intermediate_layers_chunked(x, n)
+ else:
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
+ if norm:
+ outputs = [self.norm(out) for out in outputs]
+ class_tokens = [out[:, 0] for out in outputs]
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
+ if reshape:
+ B, _, w, h = x.shape
+ outputs = [
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+ for out in outputs
+ ]
+ if return_class_token:
+ return tuple(zip(outputs, class_tokens))
+ return tuple(outputs)
+
+ def forward(self, *args, is_training=False, **kwargs):
+ ret = self.forward_features(*args, **kwargs)
+ if is_training:
+ return ret
+ else:
+ return self.head(ret["x_norm_clstoken"])
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+ """ViT weight initialization, original timm impl (for reproducibility)"""
+ if isinstance(module, nn.Linear):
+ trunc_normal_(module.weight, std=0.02)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
+
+
+def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=384,
+ depth=12,
+ num_heads=6,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+ """
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+ """
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1536,
+ depth=40,
+ num_heads=24,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def DINOv2(model_name):
+ model_zoo = {
+ "vits": vit_small,
+ "vitb": vit_base,
+ "vitl": vit_large,
+ "vitg": vit_giant2
+ }
+
+ return model_zoo[model_name](
+ img_size=518,
+ patch_size=14,
+ init_values=1.0,
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
+ block_chunks=0,
+ num_register_tokens=0,
+ interpolate_antialias=False,
+ interpolate_offset=0.1
+ )
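+
+
+# Usage sketch (illustrative; nothing in this file calls it):
+#   encoder = DINOv2("vitl")   # ViT-L/14: embed_dim=1024, 24 blocks
+#   feats = encoder.get_intermediate_layers(x, n=[4, 11, 17, 23], return_class_token=True)
+# where x is a (B, 3, H, W) float tensor with H and W divisible by the patch size (14).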
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e59a83eb90512d763b03e4d38536b6ae07e87541
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .mlp import Mlp
+from .patch_embed import PatchEmbed
+from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+from .block import NestedTensorBlock
+from .attention import MemEffAttention
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/attention.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..dea0c82d55f052bf4bcb5896ad8c37158ef523d5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/attention.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+import logging
+
+from torch import Tensor
+from torch import nn
+
+
+logger = logging.getLogger("dinov2")
+
+
+try:
+ from xformers.ops import memory_efficient_attention, unbind, fmha
+
+ XFORMERS_AVAILABLE = True
+except ImportError:
+ logger.warning("xFormers not available")
+ XFORMERS_AVAILABLE = False
+
+
+class Attention(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int = 8,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ attn_drop: float = 0.0,
+ proj_drop: float = 0.0,
+ ) -> None:
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim**-0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x: Tensor) -> Tensor:
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+ attn = q @ k.transpose(-2, -1)
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class MemEffAttention(Attention):
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ if not XFORMERS_AVAILABLE:
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
+ return super().forward(x)
+
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+
+ q, k, v = unbind(qkv, 2)
+
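+        # q, k and v are laid out as (B, N, num_heads, head_dim), the layout
+        # memory_efficient_attention expects; the 1/sqrt(head_dim) softmax scaling is
+        # applied inside the xformers kernel (its default when no scale is given).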
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+ x = x.reshape([B, N, C])
+
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/block.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/block.py
new file mode 100644
index 0000000000000000000000000000000000000000..f91f3f07bd15fba91c67068c8dce2bb22d505bf7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/block.py
@@ -0,0 +1,252 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+
+import logging
+from typing import Callable, List, Any, Tuple, Dict
+
+import torch
+from torch import nn, Tensor
+
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+
+
+logger = logging.getLogger("dinov2")
+
+
+try:
+ from xformers.ops import fmha
+ from xformers.ops import scaled_index_add, index_select_cat
+
+ XFORMERS_AVAILABLE = True
+except ImportError:
+ logger.warning("xFormers not available")
+ XFORMERS_AVAILABLE = False
+
+
+class Block(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ ffn_bias: bool = True,
+ drop: float = 0.0,
+ attn_drop: float = 0.0,
+ init_values=None,
+ drop_path: float = 0.0,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+ attn_class: Callable[..., nn.Module] = Attention,
+ ffn_layer: Callable[..., nn.Module] = Mlp,
+ ) -> None:
+ super().__init__()
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
+ self.norm1 = norm_layer(dim)
+ self.attn = attn_class(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ )
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = ffn_layer(
+ in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop,
+ bias=ffn_bias,
+ )
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.sample_drop_ratio = drop_path
+
+ def forward(self, x: Tensor) -> Tensor:
+ def attn_residual_func(x: Tensor) -> Tensor:
+ return self.ls1(self.attn(self.norm1(x)))
+
+ def ffn_residual_func(x: Tensor) -> Tensor:
+ return self.ls2(self.mlp(self.norm2(x)))
+
+ if self.training and self.sample_drop_ratio > 0.1:
+ # the overhead is compensated only for a drop path rate larger than 0.1
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ )
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ )
+ elif self.training and self.sample_drop_ratio > 0.0:
+ x = x + self.drop_path1(attn_residual_func(x))
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
+ else:
+ x = x + attn_residual_func(x)
+ x = x + ffn_residual_func(x)
+ return x
+
+
+def drop_add_residual_stochastic_depth(
+ x: Tensor,
+ residual_func: Callable[[Tensor], Tensor],
+ sample_drop_ratio: float = 0.0,
+) -> Tensor:
+ # 1) extract subset using permutation
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ x_subset = x[brange]
+
+ # 2) apply residual_func to get residual
+ residual = residual_func(x_subset)
+
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+
+ residual_scale_factor = b / sample_subset_size
+
+ # 3) add the residual
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ return x_plus_residual.view_as(x)
+
+
+def get_branges_scales(x, sample_drop_ratio=0.0):
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ residual_scale_factor = b / sample_subset_size
+ return brange, residual_scale_factor
+
+
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+ if scaling_vector is None:
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ else:
+ x_plus_residual = scaled_index_add(
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+ )
+ return x_plus_residual
+
+
+attn_bias_cache: Dict[Tuple, Any] = {}
+
+
+def get_attn_bias_and_cat(x_list, branges=None):
+ """
+    Perform the index select, concatenate the tensors, and provide the attn_bias from the cache.
+ """
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+ if all_shapes not in attn_bias_cache.keys():
+ seqlens = []
+ for b, x in zip(batch_sizes, x_list):
+ for _ in range(b):
+ seqlens.append(x.shape[1])
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+ attn_bias._batch_sizes = batch_sizes
+ attn_bias_cache[all_shapes] = attn_bias
+
+ if branges is not None:
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
+ else:
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
+
+ return attn_bias_cache[all_shapes], cat_tensors
+
+
+def drop_add_residual_stochastic_depth_list(
+ x_list: List[Tensor],
+ residual_func: Callable[[Tensor, Any], Tensor],
+ sample_drop_ratio: float = 0.0,
+ scaling_vector=None,
+) -> Tensor:
+ # 1) generate random set of indices for dropping samples in the batch
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+ branges = [s[0] for s in branges_scales]
+ residual_scale_factors = [s[1] for s in branges_scales]
+
+ # 2) get attention bias and index+concat the tensors
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+
+ # 3) apply residual_func to get residual, and split the result
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
+
+ outputs = []
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
+ return outputs
+
+
+class NestedTensorBlock(Block):
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+ """
+ x_list contains a list of tensors to nest together and run
+ """
+ assert isinstance(self.attn, MemEffAttention)
+
+ if self.training and self.sample_drop_ratio > 0.0:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.mlp(self.norm2(x))
+
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
+ )
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
+ )
+ return x_list
+ else:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls2(self.mlp(self.norm2(x)))
+
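+            # attn_bias is a BlockDiagonalMask over the concatenated sequences, so each sample
+            # only attends within itself; attn_bias.split(x) undoes the concatenation afterwards.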
+ attn_bias, x = get_attn_bias_and_cat(x_list)
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
+ x = x + ffn_residual_func(x)
+ return attn_bias.split(x)
+
+ def forward(self, x_or_x_list):
+ if isinstance(x_or_x_list, Tensor):
+ return super().forward(x_or_x_list)
+ elif isinstance(x_or_x_list, list):
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
+ return self.forward_nested(x_or_x_list)
+ else:
+ raise AssertionError
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/drop_path.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/drop_path.py
new file mode 100644
index 0000000000000000000000000000000000000000..10c3bea8e40eec258bbe59087770d230a6375481
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/drop_path.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+
+
+from torch import nn
+
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+ if drop_prob == 0.0 or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0:
+ random_tensor.div_(keep_prob)
+ output = x * random_tensor
+ return output
+
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/layer_scale.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/layer_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a4d0eedb1dc974a45e06fbe77ff3d909e36e55
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/layer_scale.py
@@ -0,0 +1,28 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+
+from typing import Union
+
+import torch
+from torch import Tensor
+from torch import nn
+
+
+class LayerScale(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ init_values: Union[float, Tensor] = 1e-5,
+ inplace: bool = False,
+ ) -> None:
+ super().__init__()
+ self.inplace = inplace
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+ def forward(self, x: Tensor) -> Tensor:
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/mlp.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..504987b635c9cd582a352fb2381228c9e6cd043c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/mlp.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+
+
+from typing import Callable, Optional
+
+from torch import Tensor, nn
+
+
+class Mlp(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x: Tensor) -> Tensor:
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/patch_embed.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/patch_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..f880c042ee6a33ef520c6a8c8a686c1d065b8f49
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/patch_embed.py
@@ -0,0 +1,89 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+
+from typing import Callable, Optional, Tuple, Union
+
+from torch import Tensor
+import torch.nn as nn
+
+
+def make_2tuple(x):
+ if isinstance(x, tuple):
+ assert len(x) == 2
+ return x
+
+ assert isinstance(x, int)
+ return (x, x)
+
+
+class PatchEmbed(nn.Module):
+ """
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+
+ Args:
+ img_size: Image size.
+ patch_size: Patch token size.
+ in_chans: Number of input image channels.
+ embed_dim: Number of linear projection output channels.
+ norm_layer: Normalization layer.
+ """
+
+ def __init__(
+ self,
+ img_size: Union[int, Tuple[int, int]] = 224,
+ patch_size: Union[int, Tuple[int, int]] = 16,
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ norm_layer: Optional[Callable] = None,
+ flatten_embedding: bool = True,
+ ) -> None:
+ super().__init__()
+
+ image_HW = make_2tuple(img_size)
+ patch_HW = make_2tuple(patch_size)
+ patch_grid_size = (
+ image_HW[0] // patch_HW[0],
+ image_HW[1] // patch_HW[1],
+ )
+
+ self.img_size = image_HW
+ self.patch_size = patch_HW
+ self.patches_resolution = patch_grid_size
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+
+ self.flatten_embedding = flatten_embedding
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+ def forward(self, x: Tensor) -> Tensor:
+ _, _, H, W = x.shape
+ patch_H, patch_W = self.patch_size
+
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width {patch_W}"
+
+ x = self.proj(x) # B C H W
+ H, W = x.size(2), x.size(3)
+ x = x.flatten(2).transpose(1, 2) # B HW C
+ x = self.norm(x)
+ if not self.flatten_embedding:
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
+ return x
+
+ def flops(self) -> float:
+ Ho, Wo = self.patches_resolution
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+ if self.norm is not None:
+ flops += Ho * Wo * self.embed_dim
+ return flops
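+
+
+# Worked example (illustrative): with the 518x518 / patch 14 configuration used elsewhere in
+# this package, the patch grid is 37x37, so forward() returns (B, 1369, embed_dim) tensors
+# when flatten_embedding is True.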
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/swiglu_ffn.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/swiglu_ffn.py
new file mode 100644
index 0000000000000000000000000000000000000000..155a3dd9f6f1a7d0f7bdf9c8f1981e58acb3b19c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dinov2_layers/swiglu_ffn.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Optional
+
+from torch import Tensor, nn
+import torch.nn.functional as F
+
+
+class SwiGLUFFN(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+
+ def forward(self, x: Tensor) -> Tensor:
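+        # SwiGLU: w12 packs the two input projections into one linear layer; the returned
+        # value is w3(SiLU(x1) * x2) with x1, x2 = w12(x).chunk(2, dim=-1).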
+ x12 = self.w12(x)
+ x1, x2 = x12.chunk(2, dim=-1)
+ hidden = F.silu(x1) * x2
+ return self.w3(hidden)
+
+
+try:
+ from xformers.ops import SwiGLU
+
+ XFORMERS_AVAILABLE = True
+except ImportError:
+ SwiGLU = SwiGLUFFN
+ XFORMERS_AVAILABLE = False
+
+
+class SwiGLUFFNFused(SwiGLU):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+ super().__init__(
+ in_features=in_features,
+ hidden_features=hidden_features,
+ out_features=out_features,
+ bias=bias,
+ )
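+
+
+# Worked example (illustrative, assuming the "swiglufused" setting in DINOv2() maps to this
+# class): vit_giant2 uses embed_dim=1536 and mlp_ratio=4, so the Block requests
+# hidden_features=6144; 6144 * 2 / 3 = 4096, already a multiple of 8, giving a 4096-wide
+# fused SwiGLU hidden layer.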
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dpt.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a96e8743743cb957a6ff12a9b4646a7c53ae4a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/dpt.py
@@ -0,0 +1,220 @@
+import cv2
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.transforms import Compose
+
+from custom_controlnet_aux.depth_anything_v2.dinov2 import DINOv2
+from custom_controlnet_aux.depth_anything_v2.util.blocks import FeatureFusionBlock, _make_scratch
+from custom_controlnet_aux.depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet
+
+
+def _make_fusion_block(features, use_bn, size=None):
+ return FeatureFusionBlock(
+ features,
+ nn.ReLU(False),
+ deconv=False,
+ bn=use_bn,
+ expand=False,
+ align_corners=True,
+ size=size,
+ )
+
+
+class ConvBlock(nn.Module):
+ def __init__(self, in_feature, out_feature):
+ super().__init__()
+
+ self.conv_block = nn.Sequential(
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
+ nn.BatchNorm2d(out_feature),
+ nn.ReLU(True)
+ )
+
+ def forward(self, x):
+ return self.conv_block(x)
+
+
+class DPTHead(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ features=256,
+ use_bn=False,
+ out_channels=[256, 512, 1024, 1024],
+ use_clstoken=False
+ ):
+ super(DPTHead, self).__init__()
+
+ self.use_clstoken = use_clstoken
+
+ self.projects = nn.ModuleList([
+ nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_channel,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ) for out_channel in out_channels
+ ])
+
+ self.resize_layers = nn.ModuleList([
+ nn.ConvTranspose2d(
+ in_channels=out_channels[0],
+ out_channels=out_channels[0],
+ kernel_size=4,
+ stride=4,
+ padding=0),
+ nn.ConvTranspose2d(
+ in_channels=out_channels[1],
+ out_channels=out_channels[1],
+ kernel_size=2,
+ stride=2,
+ padding=0),
+ nn.Identity(),
+ nn.Conv2d(
+ in_channels=out_channels[3],
+ out_channels=out_channels[3],
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ ])
+
+ if use_clstoken:
+ self.readout_projects = nn.ModuleList()
+ for _ in range(len(self.projects)):
+ self.readout_projects.append(
+ nn.Sequential(
+ nn.Linear(2 * in_channels, in_channels),
+ nn.GELU()))
+
+ self.scratch = _make_scratch(
+ out_channels,
+ features,
+ groups=1,
+ expand=False,
+ )
+
+ self.scratch.stem_transpose = None
+
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
+
+ head_features_1 = features
+ head_features_2 = 32
+
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
+ self.scratch.output_conv2 = nn.Sequential(
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
+ nn.ReLU(True),
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
+ nn.ReLU(True),
+ nn.Identity(),
+ )
+
+ def forward(self, out_features, patch_h, patch_w):
+ out = []
+ for i, x in enumerate(out_features):
+ if self.use_clstoken:
+ x, cls_token = x[0], x[1]
+ readout = cls_token.unsqueeze(1).expand_as(x)
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
+ else:
+ x = x[0]
+
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
+
+ x = self.projects[i](x)
+ x = self.resize_layers[i](x)
+
+ out.append(x)
+
+ layer_1, layer_2, layer_3, layer_4 = out
+
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
+
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+
+ out = self.scratch.output_conv1(path_1)
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
+ out = self.scratch.output_conv2(out)
+
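+        # out has shape (B, 1, 14 * patch_h, 14 * patch_w); DepthAnythingV2.forward scales it
+        # by max_depth and squeezes the channel dimension.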
+ return out
+
+
+class DepthAnythingV2(nn.Module):
+ def __init__(
+ self,
+ encoder='vitl',
+ features=256,
+ out_channels=[256, 512, 1024, 1024],
+ use_bn=False,
+ use_clstoken=False
+ ):
+ super(DepthAnythingV2, self).__init__()
+
+ self.intermediate_layer_idx = {
+ 'vits': [2, 5, 8, 11],
+ 'vitb': [2, 5, 8, 11],
+ 'vitl': [4, 11, 17, 23],
+ 'vitg': [9, 19, 29, 39]
+ }
+
+ self.encoder = encoder
+ self.pretrained = DINOv2(model_name=encoder)
+
+ self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
+
+ def forward(self, x, max_depth):
+ patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
+
+ features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True)
+
+ depth = self.depth_head(features, patch_h, patch_w) * max_depth
+
+ return depth.squeeze(1)
+
+ @torch.no_grad()
+ def infer_image(self, raw_image, input_size=518, max_depth=20.0):
+ image, (h, w) = self.image2tensor(raw_image, input_size)
+
+ depth = self.forward(image, max_depth)
+
+ depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
+
+ return depth.cpu().numpy()
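+    # Usage sketch (illustrative; obtaining and loading the checkpoint happens elsewhere):
+    #   model = DepthAnythingV2(encoder='vitl')
+    #   model.load_state_dict(state_dict)            # weights provided by the caller
+    #   model = model.to(device).eval()              # device should match what image2tensor picks
+    #   depth = model.infer_image(bgr_uint8_image)   # (H, W) numpy depth map
+    # image2tensor converts BGR (cv2.imread order) to RGB and normalizes with ImageNet stats.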
+
+ def image2tensor(self, raw_image, input_size=518):
+ transform = Compose([
+ Resize(
+ width=input_size,
+ height=input_size,
+ resize_target=False,
+ keep_aspect_ratio=True,
+ ensure_multiple_of=14,
+ resize_method='lower_bound',
+ image_interpolation_method=cv2.INTER_CUBIC,
+ ),
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+ PrepareForNet(),
+ ])
+
+ h, w = raw_image.shape[:2]
+
+ image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
+
+ image = transform({'image': image})['image']
+ image = torch.from_numpy(image).unsqueeze(0)
+
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
+ image = image.to(DEVICE)
+
+ return image, (h, w)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/util/blocks.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/util/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fb66c03702d653f411c59ab9966916c348c7c6e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/util/blocks.py
@@ -0,0 +1,148 @@
+import torch.nn as nn
+
+
+def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+ scratch = nn.Module()
+
+ out_shape1 = out_shape
+ out_shape2 = out_shape
+ out_shape3 = out_shape
+ if len(in_shape) >= 4:
+ out_shape4 = out_shape
+
+ if expand:
+ out_shape1 = out_shape
+ out_shape2 = out_shape * 2
+ out_shape3 = out_shape * 4
+ if len(in_shape) >= 4:
+ out_shape4 = out_shape * 8
+
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
+ if len(in_shape) >= 4:
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
+
+ return scratch
+
+
+class ResidualConvUnit(nn.Module):
+ """Residual convolution module.
+ """
+
+ def __init__(self, features, activation, bn):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.bn = bn
+
+        self.groups = 1
+
+        self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+        self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+        if self.bn:
+ self.bn1 = nn.BatchNorm2d(features)
+ self.bn2 = nn.BatchNorm2d(features)
+
+ self.activation = activation
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ """Forward pass.
+
+ Args:
+ x (tensor): input
+
+ Returns:
+ tensor: output
+ """
+
+ out = self.activation(x)
+ out = self.conv1(out)
+        if self.bn:
+            out = self.bn1(out)
+
+        out = self.activation(out)
+        out = self.conv2(out)
+        if self.bn:
+            out = self.bn2(out)
+
+ if self.groups > 1:
+ out = self.conv_merge(out)
+
+ return self.skip_add.add(out, x)
+
+
+class FeatureFusionBlock(nn.Module):
+ """Feature fusion block.
+ """
+
+ def __init__(
+ self,
+ features,
+ activation,
+ deconv=False,
+ bn=False,
+ expand=False,
+ align_corners=True,
+ size=None
+ ):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock, self).__init__()
+
+ self.deconv = deconv
+ self.align_corners = align_corners
+
+        self.groups = 1
+
+        self.expand = expand
+        out_features = features
+        if self.expand:
+            out_features = features // 2
+
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+        self.size = size
+
+ def forward(self, *xs, size=None):
+ """Forward pass.
+
+ Returns:
+ tensor: output
+ """
+ output = xs[0]
+
+ if len(xs) == 2:
+ res = self.resConfUnit1(xs[1])
+ output = self.skip_add.add(output, res)
+
+ output = self.resConfUnit2(output)
+
+ if (size is None) and (self.size is None):
+ modifier = {"scale_factor": 2}
+ elif size is None:
+ modifier = {"size": self.size}
+ else:
+ modifier = {"size": size}
+
+ output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
+
+ output = self.out_conv(output)
+
+ return output
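+
+
+# Behaviour summary (illustrative): FeatureFusionBlock.forward refines xs[0] with resConfUnit2
+# (after adding resConfUnit1(xs[1]) when a second input is given), upsamples to an explicit
+# size when one is provided (argument or self.size) or by 2x otherwise, then projects the
+# result with out_conv.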
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/util/transform.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/util/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cce234c86177e1ad5c84c81c7c1afb16877c9da
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/depth_anything_v2/util/transform.py
@@ -0,0 +1,158 @@
+import numpy as np
+import cv2
+
+
+class Resize(object):
+ """Resize sample to given size (width, height).
+ """
+
+ def __init__(
+ self,
+ width,
+ height,
+ resize_target=True,
+ keep_aspect_ratio=False,
+ ensure_multiple_of=1,
+ resize_method="lower_bound",
+ image_interpolation_method=cv2.INTER_AREA,
+ ):
+ """Init.
+
+ Args:
+ width (int): desired output width
+ height (int): desired output height
+ resize_target (bool, optional):
+ True: Resize the full sample (image, mask, target).
+ False: Resize image only.
+ Defaults to True.
+ keep_aspect_ratio (bool, optional):
+ True: Keep the aspect ratio of the input sample.
+ Output sample might not have the given width and height, and
+ resize behaviour depends on the parameter 'resize_method'.
+ Defaults to False.
+ ensure_multiple_of (int, optional):
+                Output width and height are constrained to be multiples of this parameter.
+                Defaults to 1.
+            resize_method (str, optional):
+                "lower_bound": Output will be at least as large as the given size.
+                "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than the given size.)
+                "minimal": Scale as little as possible. (Output size might be smaller than the given size.)
+                Defaults to "lower_bound".
+ """
+ self.__width = width
+ self.__height = height
+
+ self.__resize_target = resize_target
+ self.__keep_aspect_ratio = keep_aspect_ratio
+ self.__multiple_of = ensure_multiple_of
+ self.__resize_method = resize_method
+ self.__image_interpolation_method = image_interpolation_method
+
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ if max_val is not None and y > max_val:
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ if y < min_val:
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ return y
+
+ def get_size(self, width, height):
+ # determine new height and width
+ scale_height = self.__height / height
+ scale_width = self.__width / width
+
+ if self.__keep_aspect_ratio:
+ if self.__resize_method == "lower_bound":
+ # scale such that output size is lower bound
+ if scale_width > scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "upper_bound":
+ # scale such that output size is upper bound
+ if scale_width < scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "minimal":
+                # scale as little as possible
+ if abs(1 - scale_width) < abs(1 - scale_height):
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ else:
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
+
+ if self.__resize_method == "lower_bound":
+ new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
+ new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
+ elif self.__resize_method == "upper_bound":
+ new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
+ new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
+ elif self.__resize_method == "minimal":
+ new_height = self.constrain_to_multiple_of(scale_height * height)
+ new_width = self.constrain_to_multiple_of(scale_width * width)
+ else:
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
+
+ return (new_width, new_height)
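+        # Worked example (illustrative): width=height=518, keep_aspect_ratio=True,
+        # ensure_multiple_of=14, resize_method="lower_bound" and a 640x480 (WxH) input:
+        # the height scale 518/480 ~= 1.079 is used for both axes, giving ~690.7x518,
+        # which constrain_to_multiple_of rounds to an output size of (686, 518).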
+
+ def __call__(self, sample):
+ width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
+
+ # resize sample
+ sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
+
+ if self.__resize_target:
+ if "depth" in sample:
+ sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
+
+ if "mask" in sample:
+ sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)
+
+ return sample
+
+
+class NormalizeImage(object):
+    """Normalize image by given mean and std.
+ """
+
+ def __init__(self, mean, std):
+ self.__mean = mean
+ self.__std = std
+
+ def __call__(self, sample):
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
+
+ return sample
+
+
+class PrepareForNet(object):
+ """Prepare sample for usage as network input.
+ """
+
+ def __init__(self):
+ pass
+
+ def __call__(self, sample):
+ image = np.transpose(sample["image"], (2, 0, 1))
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
+
+ if "depth" in sample:
+ depth = sample["depth"].astype(np.float32)
+ sample["depth"] = np.ascontiguousarray(depth)
+
+ if "mask" in sample:
+ sample["mask"] = sample["mask"].astype(np.float32)
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
+
+ return sample
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..89343eefc1dd579f4b0ee1fc9399d9244dbc07dd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/__init__.py
@@ -0,0 +1,40 @@
+from custom_controlnet_aux.diffusion_edge.model import DiffusionEdge, prepare_args
+import numpy as np
+import torch
+from einops import rearrange
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, DIFFUSION_EDGE_MODEL_NAME
+
+class DiffusionEdgeDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=DIFFUSION_EDGE_MODEL_NAME, filename="diffusion_edge_indoor.pt"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+ model = DiffusionEdge(prepare_args(model_path))
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, patch_batch_size=8, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ with torch.no_grad():
+ input_image = rearrange(torch.from_numpy(input_image), "h w c -> 1 c h w")
+ input_image = input_image.float() / 255.
+ line = self.model(input_image, patch_batch_size)
+ line = rearrange(line, "1 c h w -> h w c")
+
+        detected_map = (line.cpu().numpy() * 255.0).astype(np.uint8)
+        detected_map = remove_pad(HWC3(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
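+
+    # Usage sketch (illustrative):
+    #   detector = DiffusionEdgeDetector.from_pretrained()   # default filename: diffusion_edge_indoor.pt
+    #   edge_map = detector(image, detect_resolution=512, output_type="pil")
+    # `image` is whatever common_input_validate accepts (typically an HxWx3 uint8 array).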
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/default.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/default.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fd1df70885171e4fb220fa41f1aae1d7028a1e0b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/default.yaml
@@ -0,0 +1,74 @@
+model:
+ model_type: const_sde
+ model_name: cond_unet
+ image_size: [320, 320]
+ input_keys: ['image', 'cond']
+ ckpt_path:
+ ignore_keys: [ ]
+ only_model: False
+ timesteps: 1000
+ train_sample: -1
+ sampling_timesteps: 1
+ loss_type: l2
+ objective: pred_noise
+ start_dist: normal
+ perceptual_weight: 0
+ scale_factor: 0.3
+ scale_by_std: True
+ default_scale: True
+ scale_by_softsign: False
+ eps: !!float 1e-4
+ weighting_loss: False
+ first_stage:
+ embed_dim: 3
+ lossconfig:
+ disc_start: 50001
+ kl_weight: 0.000001
+ disc_weight: 0.5
+ disc_in_channels: 1
+ ddconfig:
+ double_z: True
+ z_channels: 3
+ resolution: [ 320, 320 ]
+ in_channels: 1
+ out_ch: 1
+ ch: 128
+ ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
+ num_res_blocks: 2
+ attn_resolutions: [ ]
+ dropout: 0.0
+ ckpt_path:
+ unet:
+ dim: 128
+ cond_net: swin
+ without_pretrain: False
+ channels: 3
+ out_mul: 1
+ dim_mults: [ 1, 2, 4, 4, ] # num_down = len(dim_mults)
+ cond_in_dim: 3
+ cond_dim: 128
+ cond_dim_mults: [ 2, 4 ] # num_down = len(cond_dim_mults)
+ # window_sizes1: [ [4, 4], [2, 2], [1, 1], [1, 1] ]
+ # window_sizes2: [ [4, 4], [2, 2], [1, 1], [1, 1] ]
+ window_sizes1: [ [ 8, 8 ], [ 4, 4 ], [ 2, 2 ], [ 1, 1 ] ]
+ window_sizes2: [ [ 8, 8 ], [ 4, 4 ], [ 2, 2 ], [ 1, 1 ] ]
+ fourier_scale: 16
+ cond_pe: False
+ num_pos_feats: 128
+ cond_feature_size: [ 80, 80 ]
+
+data:
+ name: edge
+ img_folder: '/data/yeyunfan/edge_detection_datasets/datasets/BSDS_test'
+ augment_horizontal_flip: True
+ batch_size: 8
+ num_workers: 4
+
+sampler:
+ sample_type: "slide"
+ stride: [240, 240]
+ batch_size: 1
+ sample_num: 300
+ use_ema: True
+ save_folder:
+ ckpt_path:
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..092fbec839b13e33dffb4b2120e795b8b9a19763
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/__init__.py
@@ -0,0 +1 @@
+# from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.denoising_diffusion_pytorch import GaussianDiffusion, Unet, Trainer
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/data.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..7de0bc3e00b118b5242e8e04cfa90e799e0c456b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/data.py
@@ -0,0 +1,598 @@
+import torch
+import torchvision.transforms as T
+import torch.utils.data as data
+import torch.nn as nn
+from pathlib import Path
+from functools import partial
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.utils import exists, convert_image_to_fn, normalize_to_neg_one_to_one
+from PIL import Image, ImageDraw
+import torch.nn.functional as F
+import math
+import torchvision.transforms.functional as F2
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+from typing import Any, Callable, Optional, Tuple
+import os
+import pickle
+import numpy as np
+import copy
+import custom_albumentations as albumentations
+from torchvision.transforms.functional import InterpolationMode
+
+def get_imgs_list(imgs_dir):
+ imgs_list = os.listdir(imgs_dir)
+ imgs_list.sort()
+    return [os.path.join(imgs_dir, f) for f in imgs_list if f.endswith(('.jpg', '.JPG', '.png', '.pgm', '.ppm'))]
+
+
+def fit_img_postfix(img_path):
+ if not os.path.exists(img_path) and img_path.endswith(".jpg"):
+ img_path = img_path[:-4] + ".png"
+ if not os.path.exists(img_path) and img_path.endswith(".png"):
+ img_path = img_path[:-4] + ".jpg"
+ return img_path
+
+
+class AdaptEdgeDataset(data.Dataset):
+ def __init__(
+ self,
+ data_root,
+ # mask_folder,
+ image_size,
+ exts = ['png', 'jpg'],
+ augment_horizontal_flip = False,
+ convert_image_to = None,
+ normalize_to_neg_one_to_one=True,
+ split='train',
+ # inter_type='bicubic',
+ # down=4,
+ threshold=0.3, use_uncertainty=False
+ ):
+ super().__init__()
+ # self.img_folder = Path(img_folder)
+ # self.edge_folder = Path(os.path.join(data_root, f'gt_imgs'))
+ # self.img_folder = Path(os.path.join(data_root, f'imgs'))
+ # self.edge_folder = Path(os.path.join(data_root, "edge", "aug"))
+ # self.img_folder = Path(os.path.join(data_root, "image", "aug"))
+ self.data_root = data_root
+ self.image_size = image_size
+
+ # self.edge_paths = [p for ext in exts for p in self.edge_folder.rglob(f'*.{ext}')]
+ # self.img_paths = [(self.img_folder / item.parent.name / f'{item.stem}.jpg') for item in self.edge_paths]
+ # self.img_paths = [(self.img_folder / f'{item.stem}.jpg') for item in self.edge_paths]
+
+ self.threshold = threshold * 256
+ self.use_uncertainty = use_uncertainty
+ self.normalize_to_neg_one_to_one = normalize_to_neg_one_to_one
+
+ maybe_convert_fn = partial(convert_image_to_fn, convert_image_to) if exists(convert_image_to) else Identity()
+
+ # self.normalize_to_neg_one_to_one = normalize_to_neg_one_to_one
+ # self.random_crop = RandomCrop(size=image_size)
+ # self.transform = Compose([
+ # # Lambda(maybe_convert_fn),
+ # # Resize(image_size, interpolation=3, interpolation2=0),
+ # Resize(image_size, interpolation=InterpolationMode.BILINEAR, interpolation2=InterpolationMode.NEAREST),
+ # RandomHorizontalFlip() if augment_horizontal_flip else Identity(),
+ # # RandomCrop(image_size),
+ # ToTensor()
+ # ])
+ self.data_list = self.build_list()
+
+ self.transform = transforms.Compose([
+ # Resize(self.image_size, interpolation=InterpolationMode.BILINEAR, interpolation2=InterpolationMode.NEAREST),
+ transforms.ToTensor()])
+
+ def __len__(self):
+ return len(self.data_list)
+
+
+ def read_img(self, image_path):
+ with open(image_path, 'rb') as f:
+ img = Image.open(f)
+ img = img.convert('RGB')
+
+ raw_width, raw_height = img.size
+ # width = int(raw_width / 32) * 32
+ # height = int(raw_height / 32) * 32
+ # img = img.resize((width, height), Image.Resampling.BILINEAR)
+ # # print("img.size:", img.size)
+ # img = self.transform(img)
+
+ return img, (raw_width, raw_height)
+
+ def read_lb(self, lb_path):
+ lb_data = Image.open(lb_path)
+
+ width, height = lb_data.size
+ width = int(width / 32) * 32
+ height = int(height / 32) * 32
+ lb_data = lb_data.resize((width, height), Image.Resampling.BILINEAR)
+ # print("lb_data.size:", lb_data.size)
+ lb = np.array(lb_data, dtype=np.float32)
+ if lb.ndim == 3:
+ lb = np.squeeze(lb[:, :, 0])
+ assert lb.ndim == 2
+ threshold = self.threshold
+ lb = lb[np.newaxis, :, :]
+
+ lb[lb == 0] = 0
+
+ # ---------- important ----------
+ if self.use_uncertainty:
+ lb[np.logical_and(lb > 0, lb < threshold)] = 2
+ else:
+ lb[np.logical_and(lb > 0, lb < threshold)] /= 255.
+
+ lb[lb >= threshold] = 1
+ return lb
+
+ def build_list(self):
+ data_root = os.path.abspath(self.data_root)
+ images_path = os.path.join(data_root, 'image', "raw")
+ labels_path = os.path.join(data_root, 'edge', "raw")
+
+ samples = []
+ for directory_name in os.listdir(images_path):
+ image_directories = os.path.join(images_path, directory_name)
+ for file_name_ext in os.listdir(image_directories):
+ file_name = os.path.basename(file_name_ext)
+ image_path = fit_img_postfix(os.path.join(images_path, directory_name, file_name))
+ lb_path = fit_img_postfix(os.path.join(labels_path, directory_name, file_name))
+ samples.append((image_path, lb_path))
+ return samples
+
+ def __getitem__(self, index):
+ img_path, edge_path = self.data_list[index]
+ # edge_path = self.edge_paths[index]
+ # img_path = self.img_paths[index]
+ img_name = os.path.basename(img_path)
+
+ img, raw_size = self.read_img(img_path)
+ edge = self.read_lb(edge_path)
+
+ # print("-------hhhhhhhhhhhhh--------:", img.shape, edge.shape)
+ # edge = Image.open(edge_path).convert('L')
+ # # default to score-sde preprocessing
+ # mask = Image.open(img_path).convert('RGB')
+ # edge, img = self.transform(edge, mask)
+ if self.normalize_to_neg_one_to_one: # transform to [-1, 1]
+ edge = normalize_to_neg_one_to_one(edge)
+ img = normalize_to_neg_one_to_one(img)
+ return {'image': edge, 'cond': img, 'raw_size': raw_size, 'img_name': img_name}
+
+class EdgeDataset(data.Dataset):
+ def __init__(
+ self,
+ data_root,
+ # mask_folder,
+ image_size,
+ exts = ['png', 'jpg'],
+ augment_horizontal_flip = True,
+ convert_image_to = None,
+ normalize_to_neg_one_to_one=True,
+ split='train',
+ # inter_type='bicubic',
+ # down=4,
+ threshold=0.3, use_uncertainty=False, cfg={}
+ ):
+ super().__init__()
+ # self.img_folder = Path(img_folder)
+ # self.edge_folder = Path(os.path.join(data_root, f'gt_imgs'))
+ # self.img_folder = Path(os.path.join(data_root, f'imgs'))
+ # self.edge_folder = Path(os.path.join(data_root, "edge", "aug"))
+ # self.img_folder = Path(os.path.join(data_root, "image", "aug"))
+ self.data_root = data_root
+ self.image_size = image_size
+
+ # self.edge_paths = [p for ext in exts for p in self.edge_folder.rglob(f'*.{ext}')]
+ # self.img_paths = [(self.img_folder / item.parent.name / f'{item.stem}.jpg') for item in self.edge_paths]
+ # self.img_paths = [(self.img_folder / f'{item.stem}.jpg') for item in self.edge_paths]
+
+ self.threshold = threshold * 255
+ self.use_uncertainty = use_uncertainty
+ self.normalize_to_neg_one_to_one = normalize_to_neg_one_to_one
+
+ maybe_convert_fn = partial(convert_image_to_fn, convert_image_to) if exists(convert_image_to) else Identity()
+
+ self.data_list = self.build_list()
+
+ # self.transform = Compose([
+ # Resize(image_size),
+ # RandomHorizontalFlip() if augment_horizontal_flip else Identity(),
+ # ToTensor()
+ # ])
+ crop_type = cfg.get('crop_type') if 'crop_type' in cfg else 'rand_crop'
+ if crop_type == 'rand_crop':
+ self.transform = Compose([
+ RandomCrop(image_size),
+ RandomHorizontalFlip() if augment_horizontal_flip else Identity(),
+ ToTensor()
+ ])
+ elif crop_type == 'rand_resize_crop':
+ self.transform = Compose([
+ RandomResizeCrop(image_size),
+ RandomHorizontalFlip() if augment_horizontal_flip else Identity(),
+ ToTensor()
+ ])
+ print("crop_type:", crop_type)
+
+ def __len__(self):
+ return len(self.data_list)
+
+
+ def read_img(self, image_path):
+ with open(image_path, 'rb') as f:
+ img = Image.open(f)
+ img = img.convert('RGB')
+
+ raw_width, raw_height = img.size
+ # width = int(raw_width / 32) * 32
+ # height = int(raw_height / 32) * 32
+ # img = img.resize((width, height), Image.Resampling.BILINEAR)
+ # # print("img.size:", img.size)
+ # img = self.transform(img)
+
+ return img, (raw_width, raw_height)
+
+ def read_lb(self, lb_path):
+ lb_data = Image.open(lb_path).convert('L')
+ lb = np.array(lb_data).astype(np.float32)
+ # width, height = lb_data.size
+ # width = int(width / 32) * 32
+ # height = int(height / 32) * 32
+ # lb_data = lb_data.resize((width, height), Image.Resampling.BILINEAR)
+ # print("lb_data.size:", lb_data.size)
+ # lb = np.array(lb_data, dtype=np.float32)
+ # if lb.ndim == 3:
+ # lb = np.squeeze(lb[:, :, 0])
+ # assert lb.ndim == 2
+ threshold = self.threshold
+ # lb = lb[np.newaxis, :, :]
+ # lb[lb == 0] = 0
+
+ # ---------- important ----------
+ # if self.use_uncertainty:
+ # lb[np.logical_and(lb > 0, lb < threshold)] = 2
+ # else:
+ # lb[np.logical_and(lb > 0, lb < threshold)] /= 255.
+
+ lb[lb >= threshold] = 255
+ lb = Image.fromarray(lb.astype(np.uint8))
+ return lb
+
+ def build_list(self):
+ data_root = os.path.abspath(self.data_root)
+ images_path = os.path.join(data_root, 'image')
+ labels_path = os.path.join(data_root, 'edge')
+
+ samples = []
+ for directory_name in os.listdir(images_path):
+ image_directories = os.path.join(images_path, directory_name)
+ for file_name_ext in os.listdir(image_directories):
+ file_name = os.path.basename(file_name_ext)
+ image_path = fit_img_postfix(os.path.join(images_path, directory_name, file_name))
+ lb_path = fit_img_postfix(os.path.join(labels_path, directory_name, file_name))
+ samples.append((image_path, lb_path))
+ return samples
+
+ def __getitem__(self, index):
+ img_path, edge_path = self.data_list[index]
+ # edge_path = self.edge_paths[index]
+ # img_path = self.img_paths[index]
+ img_name = os.path.basename(img_path)
+
+ img, raw_size = self.read_img(img_path)
+ edge = self.read_lb(edge_path)
+ img, edge = self.transform(img, edge)
+
+ # print("-------hhhhhhhhhhhhh--------:", img.shape, edge.shape)
+ # edge = Image.open(edge_path).convert('L')
+ # # default to score-sde preprocessing
+ # mask = Image.open(img_path).convert('RGB')
+ # edge, img = self.transform(edge, mask)
+ if self.normalize_to_neg_one_to_one: # transform to [-1, 1]
+ edge = normalize_to_neg_one_to_one(edge)
+ img = normalize_to_neg_one_to_one(img)
+ return {'image': edge, 'cond': img, 'raw_size': raw_size, 'img_name': img_name}
+
+class EdgeDatasetTest(data.Dataset):
+ def __init__(
+ self,
+ data_root,
+ # mask_folder,
+ image_size,
+ exts = ['png', 'jpg'],
+ convert_image_to = None,
+ normalize_to_neg_one_to_one=True,
+ ):
+ super().__init__()
+
+ self.data_root = data_root
+ self.image_size = image_size
+ self.normalize_to_neg_one_to_one = normalize_to_neg_one_to_one
+
+ maybe_convert_fn = partial(convert_image_to_fn, convert_image_to) if exists(convert_image_to) else Identity()
+
+ self.data_list = self.build_list()
+
+ self.transform = Compose([
+ ToTensor()
+ ])
+
+ def __len__(self):
+ return len(self.data_list)
+
+
+ def read_img(self, image_path):
+ with open(image_path, 'rb') as f:
+ img = Image.open(f)
+ img = img.convert('RGB')
+
+ raw_width, raw_height = img.size
+
+
+ return img, (raw_width, raw_height)
+
+ def read_lb(self, lb_path):
+ lb_data = Image.open(lb_path).convert('L')
+ lb = np.array(lb_data).astype(np.float32)
+
+        # NOTE: self.threshold is never set in EdgeDatasetTest, and __getitem__ below never
+        # calls read_lb; calling this method directly would raise AttributeError.
+        threshold = self.threshold
+
+
+ lb[lb >= threshold] = 255
+ lb = Image.fromarray(lb.astype(np.uint8))
+ return lb
+
+ def build_list(self):
+ data_root = os.path.abspath(self.data_root)
+ # images_path = os.path.join(data_root)
+ images_path = data_root
+ samples = get_imgs_list(images_path)
+ return samples
+
+ def __getitem__(self, index):
+ img_path = self.data_list[index]
+ # edge_path = self.edge_paths[index]
+ # img_path = self.img_paths[index]
+ img_name = os.path.basename(img_path)
+
+ img, raw_size = self.read_img(img_path)
+
+ img = self.transform(img)
+ if self.normalize_to_neg_one_to_one: # transform to [-1, 1]
+ img = normalize_to_neg_one_to_one(img)
+ return {'cond': img, 'raw_size': raw_size, 'img_name': img_name}
+
+
+class Identity(nn.Identity):
+ r"""A placeholder identity operator that is argument-insensitive.
+
+ Args:
+ args: any argument (unused)
+ kwargs: any keyword argument (unused)
+
+ Shape:
+ - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+ - Output: :math:`(*)`, same shape as the input.
+
+ Examples::
+
+ >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
+ >>> input = torch.randn(128, 20)
+ >>> output = m(input)
+ >>> print(output.size())
+ torch.Size([128, 20])
+
+ """
+ def __init__(self, *args, **kwargs):
+ super(Identity, self).__init__(*args, **kwargs)
+
+    def forward(self, input, target=None):
+        if target is None:
+            return input
+        return input, target
+
+class Resize(T.Resize):
+ def __init__(self, size, interpolation2=None, **kwargs):
+ super().__init__(size, **kwargs)
+ if interpolation2 is None:
+ self.interpolation2 = self.interpolation
+ else:
+ self.interpolation2 = interpolation2
+
+ def forward(self, img, target=None):
+ if target is None:
+ img = F2.resize(img, self.size, self.interpolation, self.max_size, self.antialias)
+ return img
+ else:
+ img = F2.resize(img, self.size, self.interpolation, self.max_size, self.antialias)
+ target = F2.resize(target, self.size, self.interpolation2, self.max_size, self.antialias)
+ return img, target
+
+class RandomHorizontalFlip(T.RandomHorizontalFlip):
+ def __init__(self, p=0.5):
+ super().__init__(p)
+
+ def forward(self, img, target=None):
+ if target is None:
+ if torch.rand(1) < self.p:
+ img = F2.hflip(img)
+ return img
+ else:
+ if torch.rand(1) < self.p:
+ img = F2.hflip(img)
+ target = F2.hflip(target)
+ return img, target
+
+class CenterCrop(T.CenterCrop):
+ def __init__(self, size):
+ super().__init__(size)
+
+ def forward(self, img, target=None):
+ if target is None:
+ img = F2.center_crop(img, self.size)
+ return img
+ else:
+ img = F2.center_crop(img, self.size)
+ target = F2.center_crop(target, self.size)
+ return img, target
+
+class RandomCrop(T.RandomCrop):
+ def __init__(self, size, **kwargs):
+ super().__init__(size, **kwargs)
+
+ def single_forward(self, img, i, j, h, w):
+ if self.padding is not None:
+ img = F2.pad(img, self.padding, self.fill, self.padding_mode)
+ width, height = F2.get_image_size(img)
+ # pad the width if needed
+ if self.pad_if_needed and width < self.size[1]:
+ padding = [self.size[1] - width, 0]
+ img = F2.pad(img, padding, self.fill, self.padding_mode)
+ # pad the height if needed
+ if self.pad_if_needed and height < self.size[0]:
+ padding = [0, self.size[0] - height]
+ img = F2.pad(img, padding, self.fill, self.padding_mode)
+
+ return F2.crop(img, i, j, h, w)
+
+ def forward(self, img, target=None):
+ i, j, h, w = self.get_params(img, self.size)
+ if target is None:
+ img = self.single_forward(img, i, j, h, w)
+ return img
+ else:
+ img = self.single_forward(img, i, j, h, w)
+ target = self.single_forward(target, i, j, h, w)
+ return img, target
+
+class RandomResizeCrop(T.RandomResizedCrop):
+ def __init__(self, size, scale=(0.25, 1.0), **kwargs):
+ super().__init__(size, scale, **kwargs)
+
+ # def single_forward(self, img, i, j, h, w):
+ # if self.padding is not None:
+ # img = F2.pad(img, self.padding, self.fill, self.padding_mode)
+ # width, height = F2.get_image_size(img)
+ # # pad the width if needed
+ # if self.pad_if_needed and width < self.size[1]:
+ # padding = [self.size[1] - width, 0]
+ # img = F2.pad(img, padding, self.fill, self.padding_mode)
+ # # pad the height if needed
+ # if self.pad_if_needed and height < self.size[0]:
+ # padding = [0, self.size[0] - height]
+ # img = F2.pad(img, padding, self.fill, self.padding_mode)
+ #
+ # return F2.crop(img, i, j, h, w)
+
+ def single_forward(self, img, i, j, h, w, interpolation=InterpolationMode.BILINEAR):
+ """
+ Args:
+ img (PIL Image or Tensor): Image to be cropped and resized.
+
+ Returns:
+ PIL Image or Tensor: Randomly cropped and resized image.
+ """
+ # i, j, h, w = self.get_params(img, self.scale, self.ratio)
+ return F2.resized_crop(img, i, j, h, w, self.size, interpolation)
+
+ def forward(self, img, target=None):
+ i, j, h, w = self.get_params(img, self.scale, self.ratio)
+ if target is None:
+ img = self.single_forward(img, i, j, h, w)
+ return img
+ else:
+ img = self.single_forward(img, i, j, h, w)
+ target = self.single_forward(target, i, j, h, w, interpolation=InterpolationMode.NEAREST)
+ return img, target
+
+class ToTensor(T.ToTensor):
+ def __init__(self):
+ super().__init__()
+
+ def __call__(self, img, target=None):
+ if target is None:
+ img = F2.to_tensor(img)
+ return img
+ else:
+ img = F2.to_tensor(img)
+ target = F2.to_tensor(target)
+ return img, target
+
+class Lambda(T.Lambda):
+ """Apply a user-defined lambda as a transform. This transform does not support torchscript.
+
+ Args:
+ lambd (function): Lambda/function to be used for transform.
+ """
+
+ def __init__(self, lambd):
+ super().__init__(lambd)
+
+ def __call__(self, img, target=None):
+ if target is None:
+ return self.lambd(img)
+ else:
+ return self.lambd(img), self.lambd(target)
+
+class Compose(T.Compose):
+ def __init__(self, transforms):
+ super().__init__(transforms)
+
+ def __call__(self, img, target=None):
+ if target is None:
+ for t in self.transforms:
+ img = t(img)
+ return img
+ else:
+ for t in self.transforms:
+ img, target = t(img, target)
+ return img, target
+
+
+if __name__ == '__main__':
+ dataset = CIFAR10(
+ img_folder='/media/huang/2da18d46-7cba-4259-9abd-0df819bb104c/data/cifar-10-python',
+ augment_horizontal_flip=False
+ )
+ # dataset = CityscapesDataset(
+ # # img_folder='/media/huang/2da18d46-7cba-4259-9abd-0df819bb104c/data/CelebAHQ/celeba_hq_256',
+ # data_root='/media/huang/2da18d46-7cba-4259-9abd-0df819bb104c/data/Cityscapes/',
+ # # data_root='/media/huang/2da18d46-7cba-4259-9abd-0df819bb104c/data/ADEChallengeData2016/',
+ # image_size=[512, 1024],
+ # exts = ['png'],
+ # augment_horizontal_flip = False,
+ # convert_image_to = None,
+ # normalize_to_neg_one_to_one=True,
+ # )
+ # dataset = SRDataset(
+ # img_folder='/media/huang/ZX3 512G/data/DIV2K/DIV2K_train_HR',
+ # image_size=[512, 512],
+ # )
+ # dataset = InpaintDataset(
+ # img_folder='/media/huang/2da18d46-7cba-4259-9abd-0df819bb104c/data/CelebAHQ/celeba_hq_256',
+ # image_size=[256, 256],
+ # augment_horizontal_flip = True
+ # )
+ dataset = EdgeDataset(
+ data_root='/media/huang/2da18d46-7cba-4259-9abd-0df819bb104c/data/BSDS',
+ image_size=[320, 320],
+ )
+ for i in range(len(dataset)):
+ d = dataset[i]
+ mask = d['cond']
+ print(mask.max())
+ dl = data.DataLoader(dataset, batch_size=2, shuffle=False, pin_memory=True, num_workers=0)
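+
+    # Illustrative sketch of the paired transforms defined above: image and label go
+    # through the same random crop/flip, so they stay geometrically aligned. The inputs
+    # and names here (_img, _lb, _paired_tf) are placeholders local to this example.
+    _img = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))
+    _lb = Image.fromarray(np.random.randint(0, 255, (480, 640), dtype=np.uint8))
+    _paired_tf = Compose([
+        RandomCrop([320, 320], pad_if_needed=True),
+        RandomHorizontalFlip(),
+        ToTensor()
+    ])
+    _x, _y = _paired_tf(_img, _lb)
+    print(_x.shape, _y.shape)  # torch.Size([3, 320, 320]) torch.Size([1, 320, 320])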
+
+
+    # the block below additionally requires tensorflow (tf) and tensorflow_datasets (tfds)
+    dataset_builder = tfds.builder('cifar10')
+ split = 'train'
+ dataset_options = tf.data.Options()
+ dataset_options.experimental_optimization.map_parallelization = True
+ dataset_options.experimental_threading.private_threadpool_size = 48
+ dataset_options.experimental_threading.max_intra_op_parallelism = 1
+ read_config = tfds.ReadConfig(options=dataset_options)
+ dataset_builder.download_and_prepare()
+ ds = dataset_builder.as_dataset(
+ split=split, shuffle_files=True, read_config=read_config)
+ pause = 0
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/ddm_const_sde.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/ddm_const_sde.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b78d1a79681b88ba272a54e3241b9217cf13e43
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/ddm_const_sde.py
@@ -0,0 +1,992 @@
+import torch
+import torch.nn as nn
+from torch.cuda.amp import custom_bwd, custom_fwd
+import math
+import torch.nn.functional as F
+# import torchvision.transforms.functional as F2
+from .utils import default, identity, normalize_to_neg_one_to_one, unnormalize_to_zero_to_one
+from tqdm.auto import tqdm
+from einops import rearrange, reduce
+from functools import partial
+from collections import namedtuple
+from random import random, randint, sample, choice
+from .encoder_decoder import DiagonalGaussianDistribution
+import random
+from custom_controlnet_aux.diffusion_edge.taming.modules.losses.vqperceptual import *
+
+# gaussian diffusion trainer class
+ModelPrediction = namedtuple('ModelPrediction', ['pred_noise', 'pred_x_start'])
+
+def extract(a, t, x_shape):
+ b, *_ = t.shape
+ out = a.gather(-1, t)
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
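+    # e.g. (illustrative): for a of shape (1000,), t = tensor([10, 500]) and
+    # x_shape = (2, 3, 64, 64), this returns a tensor of shape (2, 1, 1, 1) holding
+    # a[10] and a[500], ready to broadcast against a batch of images.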
+
+def linear_beta_schedule(timesteps):
+ scale = 1000 / timesteps
+ beta_start = scale * 0.0001
+ beta_end = scale * 0.02
+ return torch.linspace(beta_start, beta_end, timesteps, dtype = torch.float64)
+
+def cosine_beta_schedule(timesteps, s = 0.008):
+ """
+ cosine schedule
+ as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
+ """
+ steps = timesteps + 1
+ x = torch.linspace(0, timesteps, steps, dtype = torch.float64)
+ alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * math.pi * 0.5) ** 2
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+ return torch.clip(betas, 0, 0.999)
+
+class DDPM(nn.Module):
+ def __init__(
+ self,
+ model,
+ *,
+ image_size,
+ timesteps = 1000,
+ sampling_timesteps = None,
+ loss_type = 'l2',
+ objective = 'pred_noise',
+ beta_schedule = 'cosine',
+ p2_loss_weight_gamma = 0., # p2 loss weight, from https://arxiv.org/abs/2204.00227 - 0 is equivalent to weight of 1 across time - 1. is recommended
+ p2_loss_weight_k = 1,
+ original_elbo_weight=0.,
+ ddim_sampling_eta = 1.,
+ clip_x_start=True,
+ train_sample=-1,
+ input_keys=['image'],
+ start_dist='normal',
+ sample_type='ddim',
+ perceptual_weight=1.,
+ use_l1=False,
+ **kwargs
+ ):
+ ckpt_path = kwargs.pop("ckpt_path", None)
+ ignore_keys = kwargs.pop("ignore_keys", [])
+ only_model = kwargs.pop("only_model", False)
+ cfg = kwargs.pop("cfg", None)
+ super().__init__(**kwargs)
+ # assert not (type(self) == DDPM and model.channels != model.out_dim)
+ # assert not model.random_or_learned_sinusoidal_cond
+
+ self.model = model
+ self.channels = self.model.channels
+ self.self_condition = self.model.self_condition
+ self.input_keys = input_keys
+ self.cfg = cfg
+ self.eps = cfg.get('eps', 1e-4) if cfg is not None else 1e-4
+ self.weighting_loss = cfg.get("weighting_loss", False) if cfg is not None else False
+ if self.weighting_loss:
+ print('#### WEIGHTING LOSS ####')
+
+ self.clip_x_start = clip_x_start
+ self.image_size = image_size
+ self.train_sample = train_sample
+ self.objective = objective
+ self.start_dist = start_dist
+ assert start_dist in ['normal', 'uniform']
+
+ assert objective in {'pred_noise', 'pred_x0', 'pred_v', 'pred_delta', 'pred_KC'}, 'objective must be either pred_noise (predict noise) or pred_x0 (predict image start) or pred_v (predict v [v-parameterization as defined in appendix D of progressive distillation paper, used in imagen-video successfully])'
+
+ if beta_schedule == 'linear':
+ betas = linear_beta_schedule(timesteps)
+ elif beta_schedule == 'cosine':
+ betas = cosine_beta_schedule(timesteps, s=1e-4)
+ else:
+ raise ValueError(f'unknown beta schedule {beta_schedule}')
+ # betas[0] = 2e-3 * betas[0]
+ alphas = 1. - betas
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
+ alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value = 1.)
+
+ timesteps, = betas.shape
+ self.num_timesteps = int(timesteps)
+ self.time_range = list(range(self.num_timesteps + 1))
+ self.loss_type = loss_type
+ self.original_elbo_weight = original_elbo_weight
+
+ # sampling related parameters
+
+ self.sampling_timesteps = default(sampling_timesteps, timesteps) # default num sampling timesteps to number of timesteps at training
+
+ # assert self.sampling_timesteps <= timesteps
+ self.is_ddim_sampling = self.sampling_timesteps < timesteps
+ self.ddim_sampling_eta = ddim_sampling_eta
+
+ # helper function to register buffer from float64 to float32
+
+ register_buffer = lambda name, val: self.register_buffer(name, val.to(torch.float32))
+
+ register_buffer('betas', betas)
+ register_buffer('alphas_cumprod', alphas_cumprod)
+ register_buffer('alphas_cumprod_prev', alphas_cumprod_prev)
+
+ # calculations for diffusion q(x_t | x_{t-1}) and others
+
+ register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
+ register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1. - alphas_cumprod))
+ register_buffer('log_one_minus_alphas_cumprod', torch.log(1. - alphas_cumprod))
+ register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1. / alphas_cumprod))
+ register_buffer('sqrt_recipm1_alphas_cumprod', torch.sqrt(1. / alphas_cumprod - 1))
+
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
+
+ posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
+
+ # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
+
+ register_buffer('posterior_variance', posterior_variance)
+
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+
+ register_buffer('posterior_log_variance_clipped', torch.log(posterior_variance.clamp(min =1e-20)))
+ register_buffer('posterior_mean_coef1', betas * torch.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))
+ register_buffer('posterior_mean_coef2', (1. - alphas_cumprod_prev) * torch.sqrt(alphas) / (1. - alphas_cumprod))
+
+ # calculate p2 reweighting
+
+ register_buffer('p2_loss_weight', (p2_loss_weight_k + alphas_cumprod / (1 - alphas_cumprod)) ** -p2_loss_weight_gamma)
+ assert not torch.isnan(self.p2_loss_weight).all()
+ if self.objective == "pred_noise":
+ lvlb_weights = self.betas ** 2 / (
+ 2 * (self.posterior_variance+1e-5) * alphas * (1 - self.alphas_cumprod))
+ elif self.objective == "pred_x0":
+ lvlb_weights = 0.5 * torch.sqrt(alphas_cumprod) / (2. * 1 - alphas_cumprod)
+ elif self.objective == "pred_delta":
+ lvlb_weights = 0.5 * torch.sqrt(alphas_cumprod) / (2. * 1 - alphas_cumprod)
+ elif self.objective == "pred_KC":
+ lvlb_weights = 0.5 * torch.sqrt(alphas_cumprod) / (2. * 1 - alphas_cumprod)
+ elif self.objective == "pred_v":
+ lvlb_weights = 0.5 * torch.sqrt(alphas_cumprod) / (2. * 1 - alphas_cumprod)
+ self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
+ assert not torch.isnan(self.lvlb_weights).all()
+ self.use_l1 = use_l1
+
+ self.perceptual_weight = perceptual_weight
+ if self.perceptual_weight > 0:
+ self.perceptual_loss = LPIPS().eval()
+
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys, only_model)
+
+ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False, use_ema=False):
+ sd = torch.load(path, map_location="cpu")
+ if 'ema' in list(sd.keys()) and use_ema:
+ sd = sd['ema']
+ new_sd = {}
+ for k in sd.keys():
+ if k.startswith("ema_model."):
+ new_k = k[10:] # remove ema_model.
+ new_sd[new_k] = sd[k]
+ sd = new_sd
+ else:
+ if "model" in list(sd.keys()):
+ sd = sd["model"]
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
+ sd, strict=False)
+ print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+ if len(missing) > 0:
+ print(f"Missing Keys: {missing}")
+ if len(unexpected) > 0:
+ print(f"Unexpected Keys: {unexpected}")
+
+ @torch.no_grad()
+ def p_sample(self, x, mask, t: int, x_self_cond = None, clip_denoised = True):
+ b, *_, device = *x.shape, x.device
+ batched_times = torch.full((x.shape[0],), t, device = x.device, dtype = torch.long)
+ model_mean, _, model_log_variance, x_start = self.p_mean_variance(x = x, mask=mask, t = batched_times, x_self_cond = x_self_cond, clip_denoised = clip_denoised)
+ noise = torch.randn_like(x) if t > 0 else 0. # no noise if t == 0
+ pred_img = model_mean + (0.5 * model_log_variance).exp() * noise
+ return pred_img, x_start
+
+ @torch.no_grad()
+ def p_sample_loop(self, shape, mask, up_scale=1, unnormalize=True):
+ batch, device = shape[0], self.betas.device
+
+ img = torch.randn(shape, device=device)
+ img = F.interpolate(img, scale_factor=up_scale, mode='bilinear', align_corners=True)
+
+ x_start = None
+
+ for t in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps):
+ self_cond = x_start if self.self_condition else None
+ img, x_start = self.p_sample(img, mask, t, self_cond)
+ if unnormalize:
+ img = unnormalize_to_zero_to_one(img)
+ return img
+
+ @torch.no_grad()
+ def ddim_sample(self, shape, mask, up_scale=1, unnormalize=True):
+ batch, device, total_timesteps, sampling_timesteps, eta, objective = shape[0], self.betas.device, self.num_timesteps, self.sampling_timesteps, self.ddim_sampling_eta, self.objective
+
+ times = torch.linspace(-1, total_timesteps - 1, steps=sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
+ times = list(reversed(times.int().tolist()))
+ time_pairs = list(zip(times[:-1], times[1:])) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]
+
+ img = torch.randn(shape, device = device)
+ img = F.interpolate(img, scale_factor=up_scale, mode='bilinear', align_corners=True)
+
+ x_start = None
+
+ for time, time_next in tqdm(time_pairs, desc = 'sampling loop time step', total=len(time_pairs)):
+ time_cond = torch.full((batch,), time, device=device, dtype=torch.long)
+ self_cond = x_start if self.self_condition else None
+ pred_noise, x_start, *_ = self.model_predictions(img, time_cond, mask, self_cond)
+
+ if time_next < 0:
+ img = x_start
+ continue
+
+ alpha = self.alphas_cumprod[time]
+ alpha_next = self.alphas_cumprod[time_next]
+
+ sigma = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
+ c = (1 - alpha_next - sigma ** 2).sqrt()
+
+ noise = torch.randn_like(img)
+
+ img = x_start * alpha_next.sqrt() + \
+ c * pred_noise + \
+ sigma * noise
+ if unnormalize:
+ img = unnormalize_to_zero_to_one(img)
+ return img
+
+
+ @torch.no_grad()
+ def interpolate(self, x1, x2, mask, t = None, lam = 0.5):
+ b, *_, device = *x1.shape, x1.device
+ t = default(t, self.num_timesteps - 1)
+
+ assert x1.shape == x2.shape
+
+ t_batched = torch.stack([torch.tensor(t, device = device)] * b)
+ xt1, xt2 = map(lambda x: self.q_sample(x, t = t_batched), (x1, x2))
+
+ img = (1 - lam) * xt1 + lam * xt2
+ for i in tqdm(reversed(range(0, t)), desc = 'interpolation sample time step', total = t):
+ img = self.p_sample(img, mask, torch.full((b,), i, device=device, dtype=torch.long))
+ return img
+
+ def get_input(self, batch, return_first_stage_outputs=False, return_original_cond=False):
+        assert 'image' in self.input_keys
+ if len(self.input_keys) > len(batch.keys()):
+ x, *_ = batch.values()
+ else:
+ x = batch.values()
+ return x
+
+ def training_step(self, batch):
+ z, *_ = self.get_input(batch)
+ cond = batch['cond'] if 'cond' in batch else None
+ loss, loss_dict = self(z, cond)
+ return loss, loss_dict
+
+ def forward(self, x, *args, **kwargs):
+ # continuous time, t in [0, 1]
+ # t = []
+ # for _ in range(x.shape[0]):
+ # if self.train_sample <= 0:
+ # t.append(torch.tensor(sample(self.time_range, 2), device=x.device).long())
+ # else:
+ # sl = choice(self.time_range)
+ # sl_range = list(range(sl - self.train_sample, sl + self.train_sample))
+ # sl_range = list(set(sl_range) & set(self.time_range))
+ # sl_range.pop(sl_range.index(sl))
+ # sl2 = choice(sl_range)
+ # t.append(torch.tensor([sl, sl2], device=x.device).long())
+ # t = torch.stack(t, dim=0)
+ # t = torch.randint(0, self.num_timesteps+1, (x.shape[0],), device=x.device).long()
+ eps = self.eps # smallest time step
+ # t = torch.rand((x.shape[0],), device=x.device) * (self.num_timesteps / eps)
+ # t = t.round() * eps
+ # t[t < eps] = eps
+ t = torch.rand(x.shape[0], device=x.device) * (1. - eps) + eps
+ return self.p_losses(x, t, *args, **kwargs)
+
+    # NOTE: overridden by the second q_sample2 defined further below; effectively dead code.
+    def q_sample2(self, x_start, t, noise=None):
+ b, c, h, w = x_start.shape
+ noise = default(noise, lambda: torch.randn_like(x_start))
+ _, nt = t.shape
+ param_x = self.sqrt_alphas_cumprod.repeat(b, 1).gather(-1, t) # (b, nt)
+ x = x_start.expand(nt, b, c, h, w).transpose(1, 0) * param_x.reshape(b, nt, 1, 1, 1).repeat(1, 1, c, h, w)
+ param_noise = self.sqrt_one_minus_alphas_cumprod.repeat(b, 1).gather(-1, t)
+ n = noise.expand(nt, b, c, h, w).transpose(1, 0) * param_noise.reshape(b, nt, 1, 1, 1).repeat(1, 1, c, h, w)
+ return x + n # (b, nt, c, h, w)
+
+ def q_sample3(self, x_start, t, C):
+ b, c, h, w = x_start.shape
+ _, nt = t.shape
+ # K_ = K.unsqueeze(1).repeat(1, nt, 1, 1, 1)
+ C_ = C.unsqueeze(1).repeat(1, nt, 1, 1, 1)
+        x_noisy = x_start.expand(nt, b, c, h, w).transpose(1, 0) + \
+            C_ * t.reshape(b, nt, 1, 1, 1).repeat(1, 1, c, h, w) / self.num_timesteps
+ return x_noisy # (b, nt, c, h, w)
+
+ # def q_sample(self, x_start, t, C):
+ # x_noisy = x_start + C * t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1))) / self.num_timesteps
+ # return x_noisy
+ def q_sample(self, x_start, noise, t, C):
+ time = t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ x_noisy = x_start + C * time + torch.sqrt(time) * noise
+ return x_noisy
+
+ def q_sample2(self, x_start, noise, t, C):
+ time = t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ x_noisy = x_start + C / 2 * time ** 2 + torch.sqrt(time) * noise
+ return x_noisy
+
+ def pred_x0_from_xt(self, xt, noise, C, t):
+ time = t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ x0 = xt - C * time - torch.sqrt(time) * noise
+ return x0
+
+ def pred_x0_from_xt2(self, xt, noise, C, t):
+ time = t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ x0 = xt - C / 2 * time ** 2 - torch.sqrt(time) * noise
+ return x0
+
+ def pred_xtms_from_xt(self, xt, noise, C, t, s):
+ # noise = noise / noise.std(dim=[1, 2, 3]).reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ time = t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ s = s.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ mean = xt + C * (time-s) - C * time - s / torch.sqrt(time) * noise
+ epsilon = torch.randn_like(mean, device=xt.device)
+ sigma = torch.sqrt(s * (time-s) / time)
+ xtms = mean + sigma * epsilon
+ return xtms
+
+ def pred_xtms_from_xt2(self, xt, noise, C, t, s):
+ time = t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ s = s.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ mean = xt + C / 2 * (time-s) ** 2 - C / 2 * time ** 2 - s / torch.sqrt(time) * noise
+ epsilon = torch.randn_like(mean, device=xt.device)
+ sigma = torch.sqrt(s * (time-s) / time)
+ xtms = mean + sigma * epsilon
+ return xtms
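+
+    # Sanity-check sketch for the constant-speed formulation used above,
+    # x_t = x_0 + C*t + sqrt(t)*eps with C = -x_0 (illustrative, e.g. from inside a method
+    # of this class):
+    #
+    #   x0  = torch.randn(2, 3, 8, 8)
+    #   eps = torch.randn_like(x0)
+    #   t   = torch.full((2,), 0.7)
+    #   C   = -x0
+    #   xt  = self.q_sample(x0, eps, t, C)
+    #   assert torch.allclose(self.pred_x0_from_xt(xt, eps, C, t), x0, atol=1e-6)
+    #
+    # pred_xtms_from_xt(xt, eps, C, t, s) then steps from time t to t - s with mean
+    # xt - C*s - s/sqrt(t)*eps and std sqrt(s*(t-s)/t); when s == t the std is zero and
+    # the step returns exactly pred_x0_from_xt(xt, eps, C, t).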
+
+ def WCE_loss(self, prediction, labelf, beta=1.1):
+ label = labelf.long()
+ mask = labelf.clone()
+
+ num_positive = torch.sum(label == 1).float()
+ num_negative = torch.sum(label == 0).float()
+
+ mask[label == 1] = 1.0 * num_negative / (num_positive + num_negative)
+ mask[label == 0] = beta * num_positive / (num_positive + num_negative)
+ mask[label == 2] = 0
+ cost = F.binary_cross_entropy(prediction, labelf, weight=mask, reduction='sum')
+
+ return cost
+
+ def Dice_Loss(self, pred, label):
+ # pred = torch.sigmoid(pred)
+ smooth = 1
+ pred_flat = pred.view(-1)
+ label_flat = label.view(-1)
+
+ intersecion = pred_flat * label_flat
+ unionsection = pred_flat.pow(2).sum() + label_flat.pow(2).sum() + smooth
+ loss = unionsection / (2 * intersecion.sum() + smooth)
+ loss = loss.sum()
+ return loss
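+        # Note: this is the reciprocal of the usual soft-Dice score,
+        # (|p|^2 + |y|^2 + 1) / (2*<p, y> + 1); it is minimized (value 1) when pred == label.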
+
+ def p_losses(self, x_start, t, *args, **kwargs):
+ if self.start_dist == 'normal':
+ noise = torch.randn_like(x_start)
+ elif self.start_dist == 'uniform':
+ noise = 2 * torch.rand_like(x_start) - 1.
+ else:
+ raise NotImplementedError(f'{self.start_dist} is not supported !')
+ # K = -1. * torch.ones_like(x_start)
+ # C = noise - x_start # t = 1000 / 1000
+ C = -1 * x_start # U(t) = Ct, U(1) = -x0
+ # C = -2 * x_start # U(t) = 1/2 * C * t**2, U(1) = 1/2 * C = -x0
+ x_noisy = self.q_sample(x_start=x_start, noise=noise, t=t, C=C) # (b, 2, c, h, w)
+ C_pred, noise_pred = self.model(x_noisy, t, **kwargs)
+ # C_pred = C_pred / torch.sqrt(t)
+ # noise_pred = noise_pred / torch.sqrt(1 - t)
+ x_rec = self.pred_x0_from_xt(x_noisy, noise_pred, C_pred, t) # x_rec:(B, 1, H, W)
+ loss_dict = {}
+ prefix = 'train'
+
+ # elif self.objective == 'pred_KC':
+ # target1 = C
+ # target2 = noise
+ # target3 = x_start
+
+ target1 = C
+ target2 = noise
+ target3 = x_start
+
+ loss_simple = 0.
+ loss_vlb = 0.
+ # use l1 + l2
+ if self.weighting_loss:
+ simple_weight1 = 2*torch.exp(1-t)
+ simple_weight2 = torch.exp(torch.sqrt(t))
+ if self.cfg.model_name == 'ncsnpp9':
+ simple_weight1 = (t + 1) / t.sqrt()
+ simple_weight2 = (2 - t).sqrt() / (1 - t + self.eps).sqrt()
+ else:
+ simple_weight1 = 1
+ simple_weight2 = 1
+
+ loss_simple += simple_weight1 * self.get_loss(C_pred, target1, mean=False).mean([1, 2, 3]) + \
+ simple_weight2 * self.get_loss(noise_pred, target2, mean=False).mean([1, 2, 3])
+ if self.use_l1:
+ loss_simple += simple_weight1 * (C_pred - target1).abs().mean([1, 2, 3]) + \
+ simple_weight2 * (noise_pred - target2).abs().mean([1, 2, 3])
+ loss_simple = loss_simple / 2
+ # rec_weight = (1 - t.reshape(C.shape[0], 1)) ** 2
+ rec_weight = 1 - t.reshape(C.shape[0], 1) # (B, 1)
+ loss_simple = loss_simple.mean()
+ loss_dict.update({f'{prefix}/loss_simple': loss_simple})
+
+ # loss_vlb += torch.abs(x_rec - target3).mean([1, 2, 3]) * rec_weight: (B, 1)
+ loss_vlb += self.Dice_Loss(x_rec, target3)
+ loss_vlb = loss_vlb.mean()
+ loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
+
+ loss = loss_simple + loss_vlb
+ loss_dict.update({f'{prefix}/loss': loss})
+
+ return loss, loss_dict
+
+ def get_loss(self, pred, target, mean=True):
+ if self.loss_type == 'l1':
+ loss = (target - pred).abs()
+ if mean:
+ loss = loss.mean()
+ elif self.loss_type == 'l2':
+ if mean:
+ loss = torch.nn.functional.mse_loss(target, pred)
+ else:
+ loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
+ else:
+            raise NotImplementedError(f"unknown loss type '{self.loss_type}'")
+
+ return loss
+
+ @torch.no_grad()
+ def sample(self, batch_size=16, up_scale=1, cond=None, denoise=True):
+ image_size, channels = self.image_size, self.channels
+ if cond is not None:
+ batch_size = cond.shape[0]
+ return self.sample_fn((batch_size, channels, image_size[0], image_size[1]),
+ up_scale=up_scale, unnormalize=True, cond=cond, denoise=denoise)
+
+ @torch.no_grad()
+ def sample_fn(self, shape, up_scale=1, unnormalize=True, cond=None, denoise=False):
+ batch, device, total_timesteps, sampling_timesteps, eta, objective = shape[0], \
+ self.betas.device, self.num_timesteps, self.sampling_timesteps, self.ddim_sampling_eta, self.objective
+
+ # times = torch.linspace(-1, total_timesteps, steps=self.sampling_timesteps + 1).int()
+ # times = list(reversed(times.int().tolist()))
+ # time_pairs = list(zip(times[:-1], times[1:]))
+ # time_steps = torch.tensor([0.25, 0.15, 0.1, 0.1, 0.1, 0.09, 0.075, 0.06, 0.045, 0.03])
+ step = 1. / self.sampling_timesteps
+ # time_steps = torch.tensor([0.1]).repeat(10)
+ time_steps = torch.tensor([step]).repeat(self.sampling_timesteps)
+ if denoise:
+ eps = self.eps
+ time_steps = torch.cat((time_steps[:-1], torch.tensor([step - eps]), torch.tensor([eps])), dim=0)
+
+ if self.start_dist == 'normal':
+ img = torch.randn(shape, device=device)
+ elif self.start_dist == 'uniform':
+ img = 2 * torch.rand(shape, device=device) - 1.
+ else:
+ raise NotImplementedError(f'{self.start_dist} is not supported !')
+ img = F.interpolate(img, scale_factor=up_scale, mode='bilinear', align_corners=True)
+ # K = -1 * torch.ones_like(img)
+ cur_time = torch.ones((batch,), device=device)
+ for i, time_step in enumerate(time_steps):
+ s = torch.full((batch,), time_step, device=device)
+ if i == time_steps.shape[0] - 1:
+ s = cur_time
+ if cond is not None:
+ pred = self.model(img, cur_time, cond)
+ else:
+ pred = self.model(img, cur_time)
+ # C, noise = pred.chunk(2, dim=1)
+ C, noise = pred[:2]
+ # correct C
+ x0 = self.pred_x0_from_xt(img, noise, C, cur_time)
+ if self.clip_x_start:
+ x0.clamp_(-1., 1.)
+ # C.clamp_(-2., 2.)
+ C = -1 * x0
+ img = self.pred_xtms_from_xt(img, noise, C, cur_time, s)
+ # img = self.pred_xtms_from_xt2(img, noise, C, cur_time, s)
+ cur_time = cur_time - s
+ img.clamp_(-1., 1.)
+ if unnormalize:
+ img = unnormalize_to_zero_to_one(img)
+ return img
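+
+    # Worked example of the uniform step schedule built above (illustrative): with
+    # sampling_timesteps = 5 and denoise = True, step = 0.2 and
+    # time_steps = [0.2, 0.2, 0.2, 0.2, 0.2 - eps, eps], which sums to 1. cur_time starts
+    # at 1 and is reduced by each entry; the last iteration sets s = cur_time, so the
+    # trajectory lands exactly at t = 0.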
+
+
+
+class LatentDiffusion(DDPM):
+ def __init__(self,
+ auto_encoder,
+ scale_factor=1.0,
+ scale_by_std=True,
+ scale_by_softsign=False,
+ input_keys=['image'],
+ sample_type='ddim',
+ num_timesteps_cond=1,
+ train_sample=-1,
+ default_scale=False,
+ *args,
+ **kwargs
+ ):
+ self.scale_by_std = scale_by_std
+ self.scale_by_softsign = scale_by_softsign
+ self.default_scale = default_scale
+ self.num_timesteps_cond = num_timesteps_cond
+ self.train_sample = train_sample
+ self.perceptual_weight = 0
+ ckpt_path = kwargs.pop("ckpt_path", None)
+ ignore_keys = kwargs.pop("ignore_keys", [])
+ only_model = kwargs.pop("only_model", False)
+ super().__init__(*args, **kwargs)
+ assert self.num_timesteps_cond <= self.num_timesteps
+ if not scale_by_std:
+ self.scale_factor = scale_factor
+ else:
+ self.register_buffer('scale_factor', torch.tensor(scale_factor))
+ if self.scale_by_softsign:
+ self.scale_by_std = False
+ print('### USING SOFTSIGN RESCALING')
+        assert not (self.scale_by_std and self.scale_by_softsign)
+
+ self.init_first_stage(auto_encoder)
+ # self.instantiate_cond_stage(cond_stage_config)
+ self.input_keys = input_keys
+ self.clip_denoised = False
+        assert sample_type in ['p_loop', 'ddim', 'dpm', 'transformer']  ### 'dpm' is not available yet; 'ddim' is recommended
+ self.sample_type = sample_type
+
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys, only_model)
+
+ def init_first_stage(self, first_stage_model):
+ self.first_stage_model = first_stage_model.eval()
+ # self.first_stage_model.train = disabled_train
+ for param in self.first_stage_model.parameters():
+ param.requires_grad = False
+
+ '''
+ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+ sd = torch.load(path, map_location="cpu")
+ if "state_dict" in list(sd.keys()):
+ sd = sd["state_dict"]
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
+ sd, strict=False)
+ print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+ if len(missing) > 0:
+ print(f"Missing Keys: {missing}")
+ if len(unexpected) > 0:
+ print(f"Unexpected Keys: {unexpected}")
+ '''
+
+ def get_first_stage_encoding(self, encoder_posterior):
+ if isinstance(encoder_posterior, DiagonalGaussianDistribution):
+ z = encoder_posterior.sample()
+ elif isinstance(encoder_posterior, torch.Tensor):
+ z = encoder_posterior
+ else:
+ raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
+ # return self.scale_factor * z.detach() + self.scale_bias
+ return z.detach()
+
+ @torch.no_grad()
+ def on_train_batch_start(self, batch):
+ # only for the first batch
+ if self.scale_by_std and (not self.scale_by_softsign):
+ if not self.default_scale:
+ assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
+ # set rescale weight to 1./std of encodings
+ print("### USING STD-RESCALING ###")
+ x, *_ = batch.values()
+ encoder_posterior = self.first_stage_model.encode(x)
+ z = self.get_first_stage_encoding(encoder_posterior)
+ del self.scale_factor
+ self.register_buffer('scale_factor', 1. / z.flatten().std())
+ print(f"setting self.scale_factor to {self.scale_factor}")
+ # print("### USING STD-RESCALING ###")
+ else:
+ print(f'### USING DEFAULT SCALE {self.scale_factor}')
+ else:
+ print(f'### USING SOFTSIGN SCALE !')
+
+ @torch.no_grad()
+ def get_input(self, batch, return_first_stage_outputs=False, return_original_cond=False):
+        assert 'image' in self.input_keys
+ # if len(self.input_keys) > len(batch.keys()):
+ # x, cond, *_ = batch.values()
+ # else:
+ # x, cond = batch.values()
+ x = batch['image']
+ cond = batch['cond'] if 'cond' in batch else None
+ z = self.first_stage_model.encode(x)
+ # print('zzzz', z.shape)
+ z = self.get_first_stage_encoding(z)
+ out = [z, cond, x]
+ if return_first_stage_outputs:
+ xrec = self.first_stage_model.decode(z)
+ out.extend([x, xrec])
+ if return_original_cond:
+ out.append(cond)
+ return out
+
+ def training_step(self, batch):
+ z, c, *_ = self.get_input(batch)
+ # print(_[0].shape)
+ if self.scale_by_softsign:
+ z = F.softsign(z)
+ elif self.scale_by_std:
+ z = self.scale_factor * z
+ # print('grad', self.scale_bias.grad)
+ loss, loss_dict = self(z, c, edge=_[0])
+ return loss, loss_dict
+
+ def q_sample3(self, x_start, t, K, C):
+ b, c, h, w = x_start.shape
+ _, nt = t.shape
+ K_ = K.unsqueeze(1).repeat(1, nt, 1, 1, 1)
+ C_ = C.unsqueeze(1).repeat(1, nt, 1, 1, 1)
+ x_noisy = x_start.expand(nt, b, c, h, w).transpose(1, 0) + K_ / 2 * (t.reshape(b, nt, 1, 1, 1).repeat(1, 1, c, h, w) / self.num_timesteps) ** 2 \
+ + C_ * t.reshape(b, nt, 1, 1, 1).repeat(1, 1, c, h, w) / self.num_timesteps
+ return x_noisy # (b, nt, c, h, w)
+
+ def pred_xtms_from_xt(self, xt, noise, C, t, s):
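+        # Algebraically identical to DDPM.pred_xtms_from_xt above:
+        # xt + C*(t - s) - C*t simplifies to xt - C*s.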
+ # noise = noise / noise.std(dim=[1, 2, 3]).reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ time = t.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ s = s.reshape(C.shape[0], *((1,) * (len(C.shape) - 1)))
+ mean = xt - C * s - s / torch.sqrt(time) * noise
+ epsilon = torch.randn_like(mean, device=xt.device)
+ sigma = torch.sqrt(s * (time-s) / time)
+ xtms = mean + sigma * epsilon
+ return xtms
+
+ def WCE_loss(self, prediction, labelf, beta=1.1):
+ label = labelf.long()
+ mask = labelf.clone()
+
+ num_positive = torch.sum(label == 1).float()
+ num_negative = torch.sum(label == 0).float()
+
+ mask[label == 1] = 1.0 * num_negative / (num_positive + num_negative)
+ mask[label == 0] = beta * num_positive / (num_positive + num_negative)
+ mask[label == 2] = 0
+ cost = F.binary_cross_entropy(prediction, labelf, weight=mask, reduction='sum')
+
+ return cost
+
+ def Dice_Loss(self, pred, label):
+ # pred = torch.sigmoid(pred)
+ B = pred.shape[0]
+ smooth = 1
+ pred_flat = pred.view(B, -1)
+ label_flat = label.view(B, -1)
+
+ intersecion = pred_flat * label_flat
+ unionsection = pred_flat.pow(2).sum(dim=-1) + label_flat.pow(2).sum(dim=-1) + smooth
+ loss = unionsection / (2 * intersecion.sum(dim=-1) + smooth)
+ loss = loss.reshape(B, 1)
+ return loss
+
+ def p_losses(self, x_start, t, *args, **kwargs):
+ if self.start_dist == 'normal':
+ noise = torch.randn_like(x_start)
+ elif self.start_dist == 'uniform':
+ noise = 2 * torch.rand_like(x_start) - 1.
+ else:
+ raise NotImplementedError(f'{self.start_dist} is not supported !')
+ # K = -1. * torch.ones_like(x_start)
+ # C = noise - x_start # t = 1000 / 1000
+ C = -1 * x_start # U(t) = Ct, U(1) = -x0
+ # C = -2 * x_start # U(t) = 1/2 * C * t**2, U(1) = 1/2 * C = -x0
+ x_noisy = self.q_sample(x_start=x_start, noise=noise, t=t, C=C) # (b, 2, c, h, w)
+ if self.cfg.model_name == 'cond_unet8':
+ C_pred, noise_pred, (e1, e2) = self.model(x_noisy, t, *args, **kwargs)
+        elif self.cfg.model_name == 'cond_unet13':
+ C_pred, noise_pred, aux_C = self.model(x_noisy, t, *args, **kwargs)
+ else:
+ C_pred, noise_pred = self.model(x_noisy, t, *args, **kwargs)
+ # C_pred = C_pred / torch.sqrt(t)
+ # noise_pred = noise_pred / torch.sqrt(1 - t)
+ x_rec = self.pred_x0_from_xt(x_noisy, noise_pred, C_pred, t) # x_rec:(B, C, H, W)
+ loss_dict = {}
+ prefix = 'train'
+
+ # elif self.objective == 'pred_KC':
+ # target1 = C
+ # target2 = noise
+ # target3 = x_start
+
+ target1 = C
+ target2 = noise
+ target3 = x_start
+
+ loss_simple = 0.
+ loss_vlb = 0.
+
+ simple_weight1 = (t + 1) / t.sqrt()
+ simple_weight2 = (2 - t).sqrt() / (1 - t + self.eps).sqrt()
+
+ # if self.weighting_loss:
+ # simple_weight1 = 2 * torch.exp(1 - t)
+ # simple_weight2 = torch.exp(torch.sqrt(t))
+ # if self.cfg.model_name == 'ncsnpp9':
+ # simple_weight1 = (t + 1) / t.sqrt()
+ # simple_weight2 = (2 - t).sqrt() / (1 - t + self.eps).sqrt()
+ # else:
+ # simple_weight1 = 1
+ # simple_weight2 = 1
+
+ loss_simple += simple_weight1 * self.get_loss(C_pred, target1, mean=False).mean([1, 2, 3]) + \
+ simple_weight2 * self.get_loss(noise_pred, target2, mean=False).mean([1, 2, 3])
+
+ # loss_simple += self.Dice_Loss(C_pred, target1) * simple_weight1
+
+ if self.use_l1:
+ loss_simple += simple_weight1 * (C_pred - target1).abs().mean([1, 2, 3]) + \
+ simple_weight2 * (noise_pred - target2).abs().mean([1, 2, 3])
+ loss_simple = loss_simple / 2
+
+ if self.cfg.model_name == 'cond_unet8':
+ loss_simple += 0.05*(self.Dice_Loss(e1, (kwargs['edge'] + 1)/2) + self.Dice_Loss(e2, (kwargs['edge'] + 1)/2))
+ elif self.cfg.model_name == 'cond_unet13':
+ loss_simple += 0.5 * (simple_weight1 * self.get_loss(aux_C, target1, mean=False).mean([1, 2, 3]) + \
+ simple_weight1 * (aux_C - target1).abs().mean([1, 2, 3]))
+
+ rec_weight = (1 - t.reshape(C.shape[0], 1)) ** 2
+ # rec_weight = 1 - t.reshape(C.shape[0], 1) # (B, 1)
+ loss_simple = loss_simple.mean()
+ loss_dict.update({f'{prefix}/loss_simple': loss_simple})
+
+ loss_vlb += torch.abs(x_rec - target3).mean([1, 2, 3]) * rec_weight # : (B, 1)
+ # loss_vlb += self.Dice_Loss(x_rec, target3) * rec_weight
+
+ # loss_vlb = loss_vlb
+ loss_vlb = loss_vlb.mean()
+
+ if self.cfg.get('use_disloss', False):
+ with torch.no_grad():
+ edge_rec = self.first_stage_model.decode(x_rec / self.scale_factor)
+ edge_rec = unnormalize_to_zero_to_one(edge_rec)
+ edge_rec = torch.clamp(edge_rec, min=0., max=1.) # B, 1, 320, 320
+ loss_tmp = self.cross_entropy_loss_RCF(edge_rec, (kwargs['edge'] + 1)/2) * rec_weight # B, 1
+ loss_ce = SpecifyGradient.apply(x_rec, loss_tmp.mean())
+ # print(loss_ce.shape)
+ # print(loss_vlb.shape)
+ loss_vlb += loss_ce.mean()
+ loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
+
+ loss = loss_simple + loss_vlb
+ loss_dict.update({f'{prefix}/loss': loss})
+
+ return loss, loss_dict
+
+ def get_loss(self, pred, target, mean=True):
+ if self.loss_type == 'l1':
+ loss = (target - pred).abs()
+ if mean:
+ loss = loss.mean()
+ elif self.loss_type == 'l2':
+ if mean:
+ loss = torch.nn.functional.mse_loss(target, pred)
+ else:
+ loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
+ else:
+            raise NotImplementedError(f"unknown loss type '{self.loss_type}'")
+
+ return loss
+
+ def cross_entropy_loss_RCF(self, prediction, labelf, beta=1.1):
+ # label = labelf.long()
+ label = labelf
+ mask = labelf.clone()
+
+ num_positive = torch.sum(label == 1).float()
+ num_negative = torch.sum(label == 0).float()
+
+ mask_temp = (label > 0) & (label <= 0.3)
+ mask[mask_temp] = 0.
+
+ mask[label == 1] = 1.0 * num_negative / (num_positive + num_negative)
+ mask[label == 0] = beta * num_positive / (num_positive + num_negative)
+
+ # mask[label == 2] = 0
+ cost = F.binary_cross_entropy(prediction, labelf, weight=mask, reduction='none')
+ return cost.mean([1, 2, 3])
+
+ @torch.no_grad()
+ def sample(self, batch_size=16, up_scale=1, cond=None, mask=None, denoise=True):
+ # image_size, channels = self.image_size, self.channels
+ channels = self.channels
+ image_size = cond.shape[-2:]
+ if cond is not None:
+ batch_size = cond.shape[0]
+ down_ratio = self.first_stage_model.down_ratio
+ if self.cfg.model_name == 'cond_unet8' or self.cfg.model_name == 'cond_unet13':
+ z, aux_out = self.sample_fn((batch_size, channels, image_size[0] // down_ratio, image_size[1] // down_ratio),
+ up_scale=up_scale, unnormalize=False, cond=cond, denoise=denoise)
+ else:
+ z = self.sample_fn((batch_size, channels, image_size[0]//down_ratio, image_size[1]//down_ratio),
+ up_scale=up_scale, unnormalize=False, cond=cond, denoise=denoise)
+ aux_out = None
+
+ if self.scale_by_std:
+ z = 1. / self.scale_factor * z.detach()
+ if self.cfg.model_name == 'cond_unet13':
+ aux_out = 1. / self.scale_factor * aux_out.detach()
+ elif self.scale_by_softsign:
+ z = z / (1 - z.abs())
+ z = z.detach()
+ #print(z.shape)
+ x_rec = self.first_stage_model.decode(z)
+ x_rec = unnormalize_to_zero_to_one(x_rec)
+ x_rec = torch.clamp(x_rec, min=0., max=1.)
+ if self.cfg.model_name == 'cond_unet13':
+ aux_out = self.first_stage_model.decode(aux_out)
+ aux_out = unnormalize_to_zero_to_one(aux_out)
+ aux_out = torch.clamp(aux_out, min=0., max=1.)
+ if mask is not None:
+ x_rec = mask * unnormalize_to_zero_to_one(cond) + (1 - mask) * x_rec
+ if aux_out is not None:
+ return x_rec, aux_out
+ return x_rec
+
+ @torch.no_grad()
+ def sample_fn(self, shape, up_scale=1, unnormalize=True, cond=None, denoise=False):
+ batch, device, total_timesteps, sampling_timesteps, eta, objective = shape[0], \
+ self.betas.device, self.num_timesteps, self.sampling_timesteps, self.ddim_sampling_eta, self.objective
+
+ # times = torch.linspace(-1, total_timesteps, steps=self.sampling_timesteps + 1).int()
+ # times = list(reversed(times.int().tolist()))
+ # time_pairs = list(zip(times[:-1], times[1:]))
+ # time_steps = torch.tensor([0.25, 0.15, 0.1, 0.1, 0.1, 0.09, 0.075, 0.06, 0.045, 0.03])
+ step = 1. / self.sampling_timesteps
+ # time_steps = torch.tensor([0.1]).repeat(10)
+ time_steps = torch.tensor([step]).repeat(self.sampling_timesteps)
+ if denoise:
+ eps = self.eps
+ time_steps = torch.cat((time_steps[:-1], torch.tensor([step - eps]), torch.tensor([eps])), dim=0)
+
+ if self.start_dist == 'normal':
+ img = torch.randn(shape, device=device)
+ elif self.start_dist == 'uniform':
+ img = 2 * torch.rand(shape, device=device) - 1.
+ else:
+ raise NotImplementedError(f'{self.start_dist} is not supported !')
+ img = F.interpolate(img, scale_factor=up_scale, mode='bilinear', align_corners=True)
+ img_aux = F.interpolate(img.clone(), scale_factor=up_scale, mode='bilinear', align_corners=True)
+ # img_aux = img.clone()
+ # K = -1 * torch.ones_like(img)
+ cur_time = torch.ones((batch,), device=device)
+ for i, time_step in enumerate(time_steps):
+ s = torch.full((batch,), time_step, device=device)
+ if i == time_steps.shape[0] - 1:
+ s = cur_time
+ if cond is not None:
+ pred = self.model(img, cur_time, cond)
+ else:
+ pred = self.model(img, cur_time)
+ # C, noise = pred.chunk(2, dim=1)
+ C, noise = pred[:2]
+ if self.cfg.model_name == 'cond_unet8' or self.cfg.model_name == 'cond_unet13':
+ aux_out = pred[-1]
+ else:
+ aux_out = None
+ # if self.scale_by_softsign:
+ # # correct the C for softsign
+ # x0 = self.pred_x0_from_xt(img, noise, C, cur_time)
+ # x0 = torch.clamp(x0, min=-0.987654321, max=0.987654321)
+ # C = -x0
+ # correct C
+ x0 = self.pred_x0_from_xt(img, noise, C, cur_time)
+ C = -1 * x0
+ img = self.pred_xtms_from_xt(img, noise, C, cur_time, s)
+ # if self.cfg.model_name == 'cond_unet13' and i == len(time_steps) - 2:
+ # img_aux = img
+ # if self.cfg.model_name == 'cond_unet13' and i in [len(time_steps)-2, len(time_steps)-1]:
+ # x0_aux = self.pred_x0_from_xt(img_aux, noise, aux_out, cur_time)
+ # C_aux = -1 * x0_aux
+ # img_aux = self.pred_xtms_from_xt(img_aux, noise, C_aux, cur_time, s)
+ if self.cfg.model_name == 'cond_unet13':
+ for _ in range(1):
+ x0_aux = self.pred_x0_from_xt(img_aux, noise, aux_out, cur_time)
+ C_aux = -1 * x0_aux
+ img_aux = self.pred_xtms_from_xt(img_aux, noise, C_aux, cur_time, s)
+ cur_time = cur_time - s
+ if self.scale_by_softsign:
+ img.clamp_(-0.987654321, 0.987654321)
+ if unnormalize:
+ img = unnormalize_to_zero_to_one(img)
+ if self.cfg.model_name == 'cond_unet13':
+ aux_out = img_aux
+ if aux_out is not None:
+ return img, aux_out
+ return img
+
+class SpecifyGradient(torch.autograd.Function):
+ @staticmethod
+ @custom_fwd
+ def forward(ctx, input_tensor, gt_grad):
+ ctx.save_for_backward(gt_grad)
+ # we return a dummy value 1, which will be scaled by amp's scaler so we get the scale in backward.
+ return torch.ones(input_tensor.shape, device=input_tensor.device, dtype=input_tensor.dtype)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, grad_scale):
+ (gt_grad,) = ctx.saved_tensors
+ gt_grad = gt_grad * grad_scale
+ return gt_grad, None
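+
+# Usage note (commentary): SpecifyGradient injects a value computed outside the autograd
+# graph back into it. forward() returns a dummy all-ones tensor so the op participates in
+# autograd, and backward() returns the saved gt_grad scaled by the incoming gradient
+# (which carries AMP's loss scale, as noted in the forward comment). In
+# LatentDiffusion.p_losses above it is called as
+#   loss_ce = SpecifyGradient.apply(x_rec, loss_tmp.mean())
+# so the RCF cross-entropy term, computed under torch.no_grad(), still produces a
+# gradient signal on x_rec.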
+
+if __name__ == "__main__":
+ ddconfig = {'double_z': True,
+ 'z_channels': 4,
+ 'resolution': (240, 960),
+ 'in_channels': 3,
+ 'out_ch': 3,
+ 'ch': 128,
+ 'ch_mult': [1, 2, 4, 4], # num_down = len(ch_mult)-1
+ 'num_res_blocks': 2,
+ 'attn_resolutions': [],
+ 'dropout': 0.0}
+ lossconfig = {'disc_start': 50001,
+ 'kl_weight': 0.000001,
+ 'disc_weight': 0.5}
+ from encoder_decoder import AutoencoderKL
+ auto_encoder = AutoencoderKL(ddconfig, lossconfig, embed_dim=4,
+ )
+ from mask_cond_unet import Unet
+ unet = Unet(dim=64, dim_mults=(1, 2, 4, 8), channels=4, cond_in_dim=1,)
+ ldm = LatentDiffusion(auto_encoder=auto_encoder, model=unet, image_size=ddconfig['resolution'])
+ image = torch.rand(1, 3, 128, 128)
+ mask = torch.rand(1, 1, 128, 128)
+ input = {'image': image, 'cond': mask}
+ time = torch.tensor([1])
+ with torch.no_grad():
+ y = ldm.training_step(input)
+ pass
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/efficientnet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cfb955fb758abce1a0145ee103ef6b672f21694
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/efficientnet.py
@@ -0,0 +1,1130 @@
+import copy
+import math
+import warnings
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Callable, Dict, Optional, List, Sequence, Tuple, Union
+
+import torch
+from torch import nn, Tensor
+from torchvision.ops import StochasticDepth
+
+from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation
+from torchvision.transforms._presets import ImageClassification, InterpolationMode
+from torchvision.utils import _log_api_usage_once
+from torchvision.models._api import WeightsEnum, Weights
+from torchvision.models._meta import _IMAGENET_CATEGORIES
+from torchvision.models._utils import handle_legacy_interface, _ovewrite_named_param, _make_divisible
+
+
+__all__ = [
+ "EfficientNet",
+ "EfficientNet_B0_Weights",
+ "EfficientNet_B1_Weights",
+ "EfficientNet_B2_Weights",
+ "EfficientNet_B3_Weights",
+ "EfficientNet_B4_Weights",
+ "EfficientNet_B5_Weights",
+ "EfficientNet_B6_Weights",
+ "EfficientNet_B7_Weights",
+ "EfficientNet_V2_S_Weights",
+ "EfficientNet_V2_M_Weights",
+ "EfficientNet_V2_L_Weights",
+ "efficientnet_b0",
+ "efficientnet_b1",
+ "efficientnet_b2",
+ "efficientnet_b3",
+ "efficientnet_b4",
+ "efficientnet_b5",
+ "efficientnet_b6",
+ "efficientnet_b7",
+ "efficientnet_v2_s",
+ "efficientnet_v2_m",
+ "efficientnet_v2_l",
+]
+
+
+@dataclass
+class _MBConvConfig:
+ expand_ratio: float
+ kernel: int
+ stride: int
+ input_channels: int
+ out_channels: int
+ num_layers: int
+ block: Callable[..., nn.Module]
+
+ @staticmethod
+ def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int:
+ return _make_divisible(channels * width_mult, 8, min_value)
+
+
+class MBConvConfig(_MBConvConfig):
+ # Stores information listed at Table 1 of the EfficientNet paper & Table 4 of the EfficientNetV2 paper
+ def __init__(
+ self,
+ expand_ratio: float,
+ kernel: int,
+ stride: int,
+ input_channels: int,
+ out_channels: int,
+ num_layers: int,
+ width_mult: float = 1.0,
+ depth_mult: float = 1.0,
+ block: Optional[Callable[..., nn.Module]] = None,
+ ) -> None:
+ input_channels = self.adjust_channels(input_channels, width_mult)
+ out_channels = self.adjust_channels(out_channels, width_mult)
+ num_layers = self.adjust_depth(num_layers, depth_mult)
+ if block is None:
+ block = MBConv
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
+
+ @staticmethod
+ def adjust_depth(num_layers: int, depth_mult: float):
+ return int(math.ceil(num_layers * depth_mult))
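+
+    # Worked example (illustrative, with hypothetical multipliers): width_mult = 1.1 and
+    # depth_mult = 1.2 give adjust_channels(32, 1.1) = _make_divisible(35.2, 8) = 32
+    # (rounded to a multiple of 8, never below 90% of the target) and
+    # adjust_depth(3, 1.2) = ceil(3.6) = 4.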
+
+
+class FusedMBConvConfig(_MBConvConfig):
+ # Stores information listed at Table 4 of the EfficientNetV2 paper
+ def __init__(
+ self,
+ expand_ratio: float,
+ kernel: int,
+ stride: int,
+ input_channels: int,
+ out_channels: int,
+ num_layers: int,
+ block: Optional[Callable[..., nn.Module]] = None,
+ ) -> None:
+ if block is None:
+ block = FusedMBConv
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
+
+
+class MBConv(nn.Module):
+ def __init__(
+ self,
+ cnf: MBConvConfig,
+ stochastic_depth_prob: float,
+ norm_layer: Callable[..., nn.Module],
+ se_layer: Callable[..., nn.Module] = SqueezeExcitation,
+ ) -> None:
+ super().__init__()
+
+ if not (1 <= cnf.stride <= 2):
+ raise ValueError("illegal stride value")
+
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+
+ layers: List[nn.Module] = []
+ activation_layer = nn.SiLU
+
+ # expand
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+ if expanded_channels != cnf.input_channels:
+ layers.append(
+ Conv2dNormActivation(
+ cnf.input_channels,
+ expanded_channels,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+
+ # depthwise
+ layers.append(
+ Conv2dNormActivation(
+ expanded_channels,
+ expanded_channels,
+ kernel_size=cnf.kernel,
+ stride=cnf.stride,
+ groups=expanded_channels,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+
+ # squeeze and excitation
+ squeeze_channels = max(1, cnf.input_channels // 4)
+ layers.append(se_layer(expanded_channels, squeeze_channels, activation=partial(nn.SiLU, inplace=True)))
+
+ # project
+ layers.append(
+ Conv2dNormActivation(
+ expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
+ )
+ )
+
+ self.block = nn.Sequential(*layers)
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+ self.out_channels = cnf.out_channels
+
+ def forward(self, input: Tensor) -> Tensor:
+ result = self.block(input)
+ if self.use_res_connect:
+ result = self.stochastic_depth(result)
+ result += input
+ return result
+
+
+class FusedMBConv(nn.Module):
+ def __init__(
+ self,
+ cnf: FusedMBConvConfig,
+ stochastic_depth_prob: float,
+ norm_layer: Callable[..., nn.Module],
+ ) -> None:
+ super().__init__()
+
+ if not (1 <= cnf.stride <= 2):
+ raise ValueError("illegal stride value")
+
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+
+ layers: List[nn.Module] = []
+ activation_layer = nn.SiLU
+
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+ if expanded_channels != cnf.input_channels:
+ # fused expand
+ layers.append(
+ Conv2dNormActivation(
+ cnf.input_channels,
+ expanded_channels,
+ kernel_size=cnf.kernel,
+ stride=cnf.stride,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+
+ # project
+ layers.append(
+ Conv2dNormActivation(
+ expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
+ )
+ )
+ else:
+ layers.append(
+ Conv2dNormActivation(
+ cnf.input_channels,
+ cnf.out_channels,
+ kernel_size=cnf.kernel,
+ stride=cnf.stride,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+
+ self.block = nn.Sequential(*layers)
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+ self.out_channels = cnf.out_channels
+
+ def forward(self, input: Tensor) -> Tensor:
+ result = self.block(input)
+ if self.use_res_connect:
+ result = self.stochastic_depth(result)
+ result += input
+ return result
+
+
+class EfficientNet(nn.Module):
+ def __init__(
+ self,
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+ dropout: float,
+ stochastic_depth_prob: float = 0.2,
+ num_classes: int = 1000,
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
+ last_channel: Optional[int] = None,
+ **kwargs: Any,
+ ) -> None:
+ """
+ EfficientNet V1 and V2 main class
+
+ Args:
+ inverted_residual_setting (Sequence[Union[MBConvConfig, FusedMBConvConfig]]): Network structure
+            dropout (float): The dropout probability
+ stochastic_depth_prob (float): The stochastic depth probability
+ num_classes (int): Number of classes
+ norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
+ last_channel (int): The number of channels on the penultimate layer
+ """
+ super().__init__()
+ _log_api_usage_once(self)
+
+ if not inverted_residual_setting:
+ raise ValueError("The inverted_residual_setting should not be empty")
+ elif not (
+ isinstance(inverted_residual_setting, Sequence)
+ and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting])
+ ):
+ raise TypeError("The inverted_residual_setting should be List[MBConvConfig]")
+
+ if "block" in kwargs:
+ warnings.warn(
+ "The parameter 'block' is deprecated since 0.13 and will be removed 0.15. "
+ "Please pass this information on 'MBConvConfig.block' instead."
+ )
+ if kwargs["block"] is not None:
+ for s in inverted_residual_setting:
+ if isinstance(s, MBConvConfig):
+ s.block = kwargs["block"]
+
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+
+ layers: List[nn.Module] = []
+
+ # building first layer
+ firstconv_output_channels = inverted_residual_setting[0].input_channels
+ # layers.append(
+ # Conv2dNormActivation(
+ # 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=nn.SiLU
+ # )
+ # )
+        # attribute name 'first_coonv' (sic) is kept as-is so state_dict keys in existing
+        # checkpoints continue to match
+        self.first_coonv = Conv2dNormActivation(
+ 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=nn.SiLU
+ )
+
+ # building inverted residual blocks
+ total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
+ stage_block_id = 0
+ for cnf in inverted_residual_setting:
+ stage: List[nn.Module] = []
+ for _ in range(cnf.num_layers):
+ # copy to avoid modifications. shallow copy is enough
+ block_cnf = copy.copy(cnf)
+
+ # overwrite info if not the first conv in the stage
+ if stage:
+ block_cnf.input_channels = block_cnf.out_channels
+ block_cnf.stride = 1
+
+ # adjust stochastic depth probability based on the depth of the stage block
+ sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks
+
+ stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer))
+ stage_block_id += 1
+
+ layers.append(nn.Sequential(*stage))
+
+ # building last several layers
+ lastconv_input_channels = inverted_residual_setting[-1].out_channels
+ lastconv_output_channels = last_channel if last_channel is not None else 4 * lastconv_input_channels
+ layers.append(
+ Conv2dNormActivation(
+ lastconv_input_channels,
+ lastconv_output_channels,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=nn.SiLU,
+ )
+ )
+ # self.last_conv = Conv2dNormActivation(
+ # lastconv_input_channels,
+ # lastconv_output_channels,
+ # kernel_size=1,
+ # norm_layer=norm_layer,
+ # activation_layer=nn.SiLU,
+ # )
+
+ # self.features = nn.Sequential(*layers)
+ self.features = nn.ModuleList(layers)
+ # self.avgpool = nn.AdaptiveAvgPool2d(1)
+ # self.classifier = nn.Sequential(
+ # nn.Dropout(p=dropout, inplace=True),
+ # nn.Linear(lastconv_output_channels, num_classes),
+ # )
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out")
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+ nn.init.ones_(m.weight)
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.Linear):
+ init_range = 1.0 / math.sqrt(m.out_features)
+ nn.init.uniform_(m.weight, -init_range, init_range)
+ nn.init.zeros_(m.bias)
+
+    def _forward_impl(self, x: Tensor) -> List[Tensor]:
+ x = self.first_coonv(x)
+ # x = self.features(x)
+ feats = []
+ for i, layer in enumerate(self.features):
+ x = layer(x)
+ if i in [1, 2, 4, 6]:
+ feats.append(x)
+
+ # x = self.avgpool(x)
+ # x = torch.flatten(x, 1)
+ #
+ # x = self.classifier(x)
+
+ return feats
+
+    def forward(self, x: Tensor) -> List[Tensor]:
+ return self._forward_impl(x)
+
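+# The classification head of the stock torchvision model is removed above, so this
+# EfficientNet acts as a multi-scale feature extractor. A minimal usage sketch
+# (illustrative only; with a B-series config the seven stages place indices 1, 2, 4
+# and 6 at strides 4, 8, 16 and 32 of the input):
+#
+#     backbone = efficientnet_b5(weights=EfficientNet_B5_Weights.DEFAULT)
+#     feats = backbone(torch.randn(1, 3, 320, 320))   # list of 4 feature maps
+#     print([f.shape for f in feats])
+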
+
+def _efficientnet(
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+ dropout: float,
+ last_channel: Optional[int],
+ weights: Optional[WeightsEnum],
+ progress: bool,
+ **kwargs: Any,
+) -> EfficientNet:
+ if weights is not None:
+ _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+ model = EfficientNet(inverted_residual_setting, dropout, last_channel=last_channel, **kwargs)
+
+ if weights is not None:
+ ckpt1 = weights.get_state_dict(progress=progress)
+ ckpt2 = model.state_dict()
+ kl1 = list(ckpt1.keys())
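+        # The pretrained keys (torchvision's stem/classifier layout) no longer match this
+        # trimmed module tree, so tensors are copied by position below; this assumes both
+        # state dicts enumerate the shared backbone tensors in the same order (the
+        # classifier entries at the tail of the pretrained dict are simply left unused).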
+ for i, k in enumerate(list(ckpt2.keys())):
+ ckpt2[k] = ckpt1[kl1[i]]
+ msg = model.load_state_dict(ckpt2, strict=False)
+ print(f'Load EfficientNet: {msg}')
+ else:
+ print('No pretrained weight loaded!')
+ return model
+
+
+def _efficientnet_conf(
+ arch: str,
+ **kwargs: Any,
+) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]:
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]]
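+    # Each config row below reads (expand_ratio, kernel, stride, input_channels, out_channels, num_layers).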
+ if arch.startswith("efficientnet_b"):
+ bneck_conf = partial(MBConvConfig, width_mult=kwargs.pop("width_mult"), depth_mult=kwargs.pop("depth_mult"))
+ inverted_residual_setting = [
+ bneck_conf(1, 3, 1, 32, 16, 1),
+ bneck_conf(6, 3, 2, 16, 24, 2),
+ bneck_conf(6, 5, 2, 24, 40, 2),
+ bneck_conf(6, 3, 2, 40, 80, 3),
+ bneck_conf(6, 5, 1, 80, 112, 3),
+ bneck_conf(6, 5, 2, 112, 192, 4),
+ bneck_conf(6, 3, 1, 192, 320, 1),
+ ]
+ last_channel = None
+ elif arch.startswith("efficientnet_v2_s"):
+ inverted_residual_setting = [
+ FusedMBConvConfig(1, 3, 1, 24, 24, 2),
+ FusedMBConvConfig(4, 3, 2, 24, 48, 4),
+ FusedMBConvConfig(4, 3, 2, 48, 64, 4),
+ MBConvConfig(4, 3, 2, 64, 128, 6),
+ MBConvConfig(6, 3, 1, 128, 160, 9),
+ MBConvConfig(6, 3, 2, 160, 256, 15),
+ ]
+ last_channel = 1280
+ elif arch.startswith("efficientnet_v2_m"):
+ inverted_residual_setting = [
+ FusedMBConvConfig(1, 3, 1, 24, 24, 3),
+ FusedMBConvConfig(4, 3, 2, 24, 48, 5),
+ FusedMBConvConfig(4, 3, 2, 48, 80, 5),
+ MBConvConfig(4, 3, 2, 80, 160, 7),
+ MBConvConfig(6, 3, 1, 160, 176, 14),
+ MBConvConfig(6, 3, 2, 176, 304, 18),
+ MBConvConfig(6, 3, 1, 304, 512, 5),
+ ]
+ last_channel = 1280
+ elif arch.startswith("efficientnet_v2_l"):
+ inverted_residual_setting = [
+ FusedMBConvConfig(1, 3, 1, 32, 32, 4),
+ FusedMBConvConfig(4, 3, 2, 32, 64, 7),
+ FusedMBConvConfig(4, 3, 2, 64, 96, 7),
+ MBConvConfig(4, 3, 2, 96, 192, 10),
+ MBConvConfig(6, 3, 1, 192, 224, 19),
+ MBConvConfig(6, 3, 2, 224, 384, 25),
+ MBConvConfig(6, 3, 1, 384, 640, 7),
+ ]
+ last_channel = 1280
+ else:
+ raise ValueError(f"Unsupported model type {arch}")
+
+ return inverted_residual_setting, last_channel
+
+
+_COMMON_META: Dict[str, Any] = {
+ "categories": _IMAGENET_CATEGORIES,
+}
+
+
+_COMMON_META_V1 = {
+ **_COMMON_META,
+ "min_size": (1, 1),
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#efficientnet-v1",
+}
+
+
+_COMMON_META_V2 = {
+ **_COMMON_META,
+ "min_size": (33, 33),
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#efficientnet-v2",
+}
+
+
+class EfficientNet_B0_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/rwightman/pytorch-image-models/
+ url="https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth",
+ transforms=partial(
+ ImageClassification, crop_size=224, resize_size=256, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 5288548,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 77.692,
+ "acc@5": 93.532,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B1_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/rwightman/pytorch-image-models/
+ url="https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth",
+ transforms=partial(
+ ImageClassification, crop_size=240, resize_size=256, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 7794184,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 78.642,
+ "acc@5": 94.186,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/efficientnet_b1-c27df63c.pth",
+ transforms=partial(
+ ImageClassification, crop_size=240, resize_size=255, interpolation=InterpolationMode.BILINEAR
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 7794184,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-lr-wd-crop-tuning",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 79.838,
+ "acc@5": 94.934,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using a modified version of TorchVision's
+ `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+class EfficientNet_B2_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/rwightman/pytorch-image-models/
+ url="https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth",
+ transforms=partial(
+ ImageClassification, crop_size=288, resize_size=288, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 9109994,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 80.608,
+ "acc@5": 95.310,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B3_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/rwightman/pytorch-image-models/
+ url="https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth",
+ transforms=partial(
+ ImageClassification, crop_size=300, resize_size=320, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 12233232,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 82.008,
+ "acc@5": 96.054,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B4_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/rwightman/pytorch-image-models/
+ url="https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth",
+ transforms=partial(
+ ImageClassification, crop_size=380, resize_size=384, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 19341616,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 83.384,
+ "acc@5": 96.594,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B5_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/
+ url="https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth",
+ transforms=partial(
+ ImageClassification, crop_size=456, resize_size=456, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 30389784,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 83.444,
+ "acc@5": 96.628,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B6_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/
+ url="https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth",
+ transforms=partial(
+ ImageClassification, crop_size=528, resize_size=528, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 43040704,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 84.008,
+ "acc@5": 96.916,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_B7_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/
+ url="https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth",
+ transforms=partial(
+ ImageClassification, crop_size=600, resize_size=600, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META_V1,
+ "num_params": 66347960,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 84.122,
+ "acc@5": 96.908,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_V2_S_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth",
+ transforms=partial(
+ ImageClassification,
+ crop_size=384,
+ resize_size=384,
+ interpolation=InterpolationMode.BILINEAR,
+ ),
+ meta={
+ **_COMMON_META_V2,
+ "num_params": 21458488,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 84.228,
+ "acc@5": 96.878,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using a modified version of TorchVision's
+ `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_V2_M_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/efficientnet_v2_m-dc08266a.pth",
+ transforms=partial(
+ ImageClassification,
+ crop_size=480,
+ resize_size=480,
+ interpolation=InterpolationMode.BILINEAR,
+ ),
+ meta={
+ **_COMMON_META_V2,
+ "num_params": 54139356,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 85.112,
+ "acc@5": 97.156,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using a modified version of TorchVision's
+ `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class EfficientNet_V2_L_Weights(WeightsEnum):
+ # Weights ported from https://github.com/google/automl/tree/master/efficientnetv2
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/efficientnet_v2_l-59c71312.pth",
+ transforms=partial(
+ ImageClassification,
+ crop_size=480,
+ resize_size=480,
+ interpolation=InterpolationMode.BICUBIC,
+ mean=(0.5, 0.5, 0.5),
+ std=(0.5, 0.5, 0.5),
+ ),
+ meta={
+ **_COMMON_META_V2,
+ "num_params": 118515272,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 85.808,
+ "acc@5": 97.788,
+ }
+ },
+ "_docs": """These weights are ported from the original paper.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B0_Weights.IMAGENET1K_V1))
+def efficientnet_b0(
+ *, weights: Optional[EfficientNet_B0_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B0 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B0_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B0_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B0_Weights
+ :members:
+ """
+ weights = EfficientNet_B0_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b0", width_mult=1.0, depth_mult=1.0)
+ return _efficientnet(inverted_residual_setting, 0.2, last_channel, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B1_Weights.IMAGENET1K_V1))
+def efficientnet_b1(
+ *, weights: Optional[EfficientNet_B1_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B1 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B1_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B1_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B1_Weights
+ :members:
+ """
+ weights = EfficientNet_B1_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b1", width_mult=1.0, depth_mult=1.1)
+ return _efficientnet(inverted_residual_setting, 0.2, last_channel, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B2_Weights.IMAGENET1K_V1))
+def efficientnet_b2(
+ *, weights: Optional[EfficientNet_B2_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B2 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B2_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B2_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B2_Weights
+ :members:
+ """
+ weights = EfficientNet_B2_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b2", width_mult=1.1, depth_mult=1.2)
+ return _efficientnet(inverted_residual_setting, 0.3, last_channel, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B3_Weights.IMAGENET1K_V1))
+def efficientnet_b3(
+ *, weights: Optional[EfficientNet_B3_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B3 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B3_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B3_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B3_Weights
+ :members:
+ """
+ weights = EfficientNet_B3_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b3", width_mult=1.2, depth_mult=1.4)
+ return _efficientnet(inverted_residual_setting, 0.3, last_channel, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B4_Weights.IMAGENET1K_V1))
+def efficientnet_b4(
+ *, weights: Optional[EfficientNet_B4_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B4 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B4_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B4_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B4_Weights
+ :members:
+ """
+ weights = EfficientNet_B4_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b4", width_mult=1.4, depth_mult=1.8)
+ return _efficientnet(inverted_residual_setting, 0.4, last_channel, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B5_Weights.IMAGENET1K_V1))
+def efficientnet_b5(
+ *, weights: Optional[EfficientNet_B5_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B5 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B5_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B5_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B5_Weights
+ :members:
+ """
+ weights = EfficientNet_B5_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b5", width_mult=1.6, depth_mult=2.2)
+ return _efficientnet(
+ inverted_residual_setting,
+ 0.4,
+ last_channel,
+ weights,
+ progress,
+ norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01),
+ **kwargs,
+ )
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B6_Weights.IMAGENET1K_V1))
+def efficientnet_b6(
+ *, weights: Optional[EfficientNet_B6_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B6 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B6_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B6_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B6_Weights
+ :members:
+ """
+ weights = EfficientNet_B6_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b6", width_mult=1.8, depth_mult=2.6)
+ return _efficientnet(
+ inverted_residual_setting,
+ 0.5,
+ last_channel,
+ weights,
+ progress,
+ norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01),
+ **kwargs,
+ )
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_B7_Weights.IMAGENET1K_V1))
+def efficientnet_b7(
+ *, weights: Optional[EfficientNet_B7_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """EfficientNet B7 model architecture from the `EfficientNet: Rethinking Model Scaling for Convolutional
+ Neural Networks `_ paper.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_B7_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_B7_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_B7_Weights
+ :members:
+ """
+ weights = EfficientNet_B7_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b7", width_mult=2.0, depth_mult=3.1)
+ return _efficientnet(
+ inverted_residual_setting,
+ 0.5,
+ last_channel,
+ weights,
+ progress,
+ norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01),
+ **kwargs,
+ )
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_V2_S_Weights.IMAGENET1K_V1))
+def efficientnet_v2_s(
+ *, weights: Optional[EfficientNet_V2_S_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """
+ Constructs an EfficientNetV2-S architecture from
+ `EfficientNetV2: Smaller Models and Faster Training `_.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_V2_S_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_V2_S_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_V2_S_Weights
+ :members:
+ """
+ weights = EfficientNet_V2_S_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_s")
+ return _efficientnet(
+ inverted_residual_setting,
+ 0.2,
+ last_channel,
+ weights,
+ progress,
+ norm_layer=partial(nn.BatchNorm2d, eps=1e-03),
+ **kwargs,
+ )
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_V2_M_Weights.IMAGENET1K_V1))
+def efficientnet_v2_m(
+ *, weights: Optional[EfficientNet_V2_M_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """
+ Constructs an EfficientNetV2-M architecture from
+ `EfficientNetV2: Smaller Models and Faster Training `_.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_V2_M_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_V2_M_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_V2_M_Weights
+ :members:
+ """
+ weights = EfficientNet_V2_M_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_m")
+ return _efficientnet(
+ inverted_residual_setting,
+ 0.3,
+ last_channel,
+ weights,
+ progress,
+ norm_layer=partial(nn.BatchNorm2d, eps=1e-03),
+ **kwargs,
+ )
+
+
+@handle_legacy_interface(weights=("pretrained", EfficientNet_V2_L_Weights.IMAGENET1K_V1))
+def efficientnet_v2_l(
+ *, weights: Optional[EfficientNet_V2_L_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ """
+ Constructs an EfficientNetV2-L architecture from
+ `EfficientNetV2: Smaller Models and Faster Training `_.
+
+ Args:
+ weights (:class:`~torchvision.models.EfficientNet_V2_L_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.EfficientNet_V2_L_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.efficientnet.EfficientNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.EfficientNet_V2_L_Weights
+ :members:
+ """
+ weights = EfficientNet_V2_L_Weights.verify(weights)
+
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_l")
+ return _efficientnet(
+ inverted_residual_setting,
+ 0.4,
+ last_channel,
+ weights,
+ progress,
+ norm_layer=partial(nn.BatchNorm2d, eps=1e-03),
+ **kwargs,
+ )
+
+
+# The dictionary below is an internal implementation detail and will be removed in v0.15
+from torchvision.models._utils import _ModelURLs
+
+
+model_urls = _ModelURLs(
+ {
+ "efficientnet_b0": EfficientNet_B0_Weights.IMAGENET1K_V1.url,
+ "efficientnet_b1": EfficientNet_B1_Weights.IMAGENET1K_V1.url,
+ "efficientnet_b2": EfficientNet_B2_Weights.IMAGENET1K_V1.url,
+ "efficientnet_b3": EfficientNet_B3_Weights.IMAGENET1K_V1.url,
+ "efficientnet_b4": EfficientNet_B4_Weights.IMAGENET1K_V1.url,
+ "efficientnet_b5": EfficientNet_B5_Weights.IMAGENET1K_V1.url,
+ "efficientnet_b6": EfficientNet_B6_Weights.IMAGENET1K_V1.url,
+ "efficientnet_b7": EfficientNet_B7_Weights.IMAGENET1K_V1.url,
+ }
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/ema.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..131aca0e5f1190facb168dd34fda59a0abfdd68f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/ema.py
@@ -0,0 +1,191 @@
+import copy
+import torch
+from torch import nn
+
+
+def exists(val):
+ return val is not None
+
+
+def clamp(value, min_value=None, max_value=None):
+ assert exists(min_value) or exists(max_value)
+ if exists(min_value):
+ value = max(value, min_value)
+
+ if exists(max_value):
+ value = min(value, max_value)
+
+ return value
+
+
+class EMA(nn.Module):
+ """
+ Implements exponential moving average shadowing for your model.
+
+ Utilizes an inverse decay schedule to manage longer term training runs.
+ By adjusting the power, you can control how fast EMA will ramp up to your specified beta.
+
+ @crowsonkb's notes on EMA Warmup:
+
+    If inv_gamma=1 and power=1, this implements a simple average. inv_gamma=1, power=2/3
+    are good values for models you plan to train for a million or more steps (the decay
+    factor reaches 0.999 at 31.6K steps and 0.9999 at 1M steps); inv_gamma=1, power=3/4
+    works for models you plan to train for fewer steps (0.999 at 10K steps, 0.9999 at
+    215.4K steps).
+
+ Args:
+ inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
+ power (float): Exponential factor of EMA warmup. Default: 1.
+ min_value (float): The minimum EMA decay rate. Default: 0.
+ """
+
+ def __init__(
+ self,
+ model,
+ ema_model=None,
+ # if your model has lazylinears or other types of non-deepcopyable modules, you can pass in your own ema model
+ beta=0.9999,
+ update_after_step=100,
+ update_every=10,
+ inv_gamma=1.0,
+ power=2 / 3,
+ min_value=0.0,
+ param_or_buffer_names_no_ema=set(),
+ ignore_names=set(),
+ ignore_startswith_names=set(),
+ include_online_model=True
+ # set this to False if you do not wish for the online model to be saved along with the ema model (managed externally)
+ ):
+ super().__init__()
+ self.beta = beta
+
+ # whether to include the online model within the module tree, so that state_dict also saves it
+
+ self.include_online_model = include_online_model
+
+ if include_online_model:
+ self.online_model = model
+ else:
+ self.online_model = [model] # hack
+
+ # ema model
+
+ self.ema_model = ema_model
+
+ if not exists(self.ema_model):
+ try:
+ self.ema_model = copy.deepcopy(model)
+ except:
+ print('Your model was not copyable. Please make sure you are not using any LazyLinear')
+ exit()
+
+ self.ema_model.requires_grad_(False)
+
+ self.parameter_names = {name for name, param in self.ema_model.named_parameters() if param.dtype == torch.float}
+ self.buffer_names = {name for name, buffer in self.ema_model.named_buffers() if buffer.dtype == torch.float}
+
+ self.update_every = update_every
+ self.update_after_step = update_after_step
+
+ self.inv_gamma = inv_gamma
+ self.power = power
+ self.min_value = min_value
+
+ assert isinstance(param_or_buffer_names_no_ema, (set, list))
+ self.param_or_buffer_names_no_ema = param_or_buffer_names_no_ema # parameter or buffer
+
+ self.ignore_names = ignore_names
+ self.ignore_startswith_names = ignore_startswith_names
+
+ self.register_buffer('initted', torch.Tensor([False]))
+ self.register_buffer('step', torch.tensor([0]))
+
+ @property
+ def model(self):
+ return self.online_model if self.include_online_model else self.online_model[0]
+
+ def restore_ema_model_device(self):
+ device = self.initted.device
+ self.ema_model.to(device)
+
+ def get_params_iter(self, model):
+ for name, param in model.named_parameters():
+ if name not in self.parameter_names:
+ continue
+ yield name, param
+
+ def get_buffers_iter(self, model):
+ for name, buffer in model.named_buffers():
+ if name not in self.buffer_names:
+ continue
+ yield name, buffer
+
+ def copy_params_from_model_to_ema(self):
+ for (_, ma_params), (_, current_params) in zip(self.get_params_iter(self.ema_model),
+ self.get_params_iter(self.model)):
+ ma_params.data.copy_(current_params.data)
+
+ for (_, ma_buffers), (_, current_buffers) in zip(self.get_buffers_iter(self.ema_model),
+ self.get_buffers_iter(self.model)):
+ ma_buffers.data.copy_(current_buffers.data)
+
+ def get_current_decay(self):
+ epoch = clamp(self.step.item() - self.update_after_step - 1, min_value=0.)
+ value = 1 - (1 + epoch / self.inv_gamma) ** - self.power
+
+ if epoch <= 0:
+ return 0.
+
+ return clamp(value, min_value=self.min_value, max_value=self.beta)
+
+ def update(self):
+ step = self.step.item()
+ self.step += 1
+
+ if (step % self.update_every) != 0:
+ return
+
+ if step <= self.update_after_step:
+ self.copy_params_from_model_to_ema()
+ return
+
+ if not self.initted.item():
+ self.copy_params_from_model_to_ema()
+ self.initted.data.copy_(torch.Tensor([True]))
+
+ self.update_moving_average(self.ema_model, self.model)
+
+ @torch.no_grad()
+ def update_moving_average(self, ma_model, current_model):
+ current_decay = self.get_current_decay()
+
+ for (name, current_params), (_, ma_params) in zip(self.get_params_iter(current_model),
+ self.get_params_iter(ma_model)):
+ if name in self.ignore_names:
+ continue
+
+ if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
+ continue
+
+ if name in self.param_or_buffer_names_no_ema:
+ ma_params.data.copy_(current_params.data)
+ continue
+
+ ma_params.data.lerp_(current_params.data, 1. - current_decay)
+
+ for (name, current_buffer), (_, ma_buffer) in zip(self.get_buffers_iter(current_model),
+ self.get_buffers_iter(ma_model)):
+ if name in self.ignore_names:
+ continue
+
+ if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
+ continue
+
+ if name in self.param_or_buffer_names_no_ema:
+ ma_buffer.data.copy_(current_buffer.data)
+ continue
+
+ ma_buffer.data.lerp_(current_buffer.data, 1. - current_decay)
+
+ def __call__(self, *args, **kwargs):
+ return self.ema_model(*args, **kwargs)
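+
+
+# Minimal usage sketch (names such as `model`, `opt` and `loader` are illustrative):
+#
+#     ema = EMA(model, beta=0.9999, update_after_step=100, update_every=10)
+#     for batch in loader:
+#         loss = model(batch).mean()
+#         loss.backward(); opt.step(); opt.zero_grad()
+#         ema.update()        # bumps the step counter; shadow weights refresh every `update_every` steps
+#     y = ema(x)              # __call__ forwards through the EMA copy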
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/encoder_decoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fded5b07b956155ec548e1e78f015ee388634cf0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/encoder_decoder.py
@@ -0,0 +1,1086 @@
+# pytorch_diffusion + derived encoder decoder
+import math
+import torch
+import torch.nn as nn
+import numpy as np
+from einops import rearrange
+from .loss import LPIPSWithDiscriminator
+
+# from ldm.util import instantiate_from_config
+# from ldm.modules.attention import LinearAttention
+
+class LinearAttention(nn.Module):
+ def __init__(self, dim, heads=4, dim_head=32):
+ super().__init__()
+ self.heads = heads
+ hidden_dim = dim_head * heads
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ qkv = self.to_qkv(x)
+ q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
+ k = k.softmax(dim=-1)
+ context = torch.einsum('bhdn,bhen->bhde', k, v)
+ out = torch.einsum('bhde,bhdn->bhen', context, q)
+ out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
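+        # The two einsums above implement "linear" attention: keys are softmax-normalised
+        # over the h*w positions, a (dim_head x dim_head) context is built from k and v,
+        # and queries read from it, so the cost grows linearly in h*w rather than quadratically.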
+ return self.to_out(out)
+
+def get_timestep_embedding(timesteps, embedding_dim):
+ """
+ This matches the implementation in Denoising Diffusion Probabilistic Models:
+ From Fairseq.
+ Build sinusoidal embeddings.
+ This matches the implementation in tensor2tensor, but differs slightly
+ from the description in Section 3.5 of "Attention Is All You Need".
+ """
+ assert len(timesteps.shape) == 1
+
+ half_dim = embedding_dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
+ emb = emb.to(device=timesteps.device)
+ emb = timesteps.float()[:, None] * emb[None, :]
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+ if embedding_dim % 2 == 1: # zero pad
+ emb = torch.nn.functional.pad(emb, (0,1,0,0))
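+    # resulting shape: (len(timesteps), embedding_dim); sine terms fill the first half, cosine the second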
+ return emb
+
+
+def nonlinearity(x):
+ # swish
+ return x*torch.sigmoid(x)
+
+
+def Normalize(in_channels, num_groups=32):
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class Upsample(nn.Module):
+ def __init__(self, in_channels, with_conv):
+ super().__init__()
+ self.with_conv = with_conv
+ if self.with_conv:
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+ if self.with_conv:
+ x = self.conv(x)
+ return x
+
+
+class Downsample(nn.Module):
+ def __init__(self, in_channels, with_conv):
+ super().__init__()
+ self.with_conv = with_conv
+ if self.with_conv:
+ # no asymmetric padding in torch conv, must do it ourselves
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=2,
+ padding=0)
+
+ def forward(self, x):
+ if self.with_conv:
+ pad = (0,1,0,1)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ else:
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+ return x
+
+
+class ResnetBlock(nn.Module):
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+ dropout, temb_channels=512):
+ super().__init__()
+ self.in_channels = in_channels
+ out_channels = in_channels if out_channels is None else out_channels
+ self.out_channels = out_channels
+ self.use_conv_shortcut = conv_shortcut
+
+ self.norm1 = Normalize(in_channels)
+ self.conv1 = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ if temb_channels > 0:
+ self.temb_proj = torch.nn.Linear(temb_channels,
+ out_channels)
+ self.norm2 = Normalize(out_channels)
+ self.dropout = torch.nn.Dropout(dropout)
+ self.conv2 = torch.nn.Conv2d(out_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ else:
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ def forward(self, x, temb):
+ h = x
+ h = self.norm1(h)
+ h = nonlinearity(h)
+ h = self.conv1(h)
+
+ if temb is not None:
+ h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+
+ h = self.norm2(h)
+ h = nonlinearity(h)
+ h = self.dropout(h)
+ h = self.conv2(h)
+
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ x = self.conv_shortcut(x)
+ else:
+ x = self.nin_shortcut(x)
+
+ return x+h
+
+
+class LinAttnBlock(LinearAttention):
+ """to match AttnBlock usage"""
+ def __init__(self, in_channels):
+ super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
+
+
+class AttnBlock(nn.Module):
+ def __init__(self, in_channels):
+ super().__init__()
+ self.in_channels = in_channels
+
+ self.norm = Normalize(in_channels)
+ self.q = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.k = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.v = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.proj_out = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+
+ def forward(self, x):
+ h_ = x
+ h_ = self.norm(h_)
+ q = self.q(h_)
+ k = self.k(h_)
+ v = self.v(h_)
+
+ # compute attention
+ b,c,h,w = q.shape
+ q = q.reshape(b,c,h*w)
+ q = q.permute(0,2,1) # b,hw,c
+ k = k.reshape(b,c,h*w) # b,c,hw
+ w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+ w_ = w_ * (int(c)**(-0.5))
+ w_ = torch.nn.functional.softmax(w_, dim=2)
+
+ # attend to values
+ v = v.reshape(b,c,h*w)
+ w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
+ h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+ h_ = h_.reshape(b,c,h,w)
+
+ h_ = self.proj_out(h_)
+
+ return x+h_
+
+
+def make_attn(in_channels, attn_type="vanilla"):
+ assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
+ print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
+ if attn_type == "vanilla":
+ return AttnBlock(in_channels)
+ elif attn_type == "none":
+ return nn.Identity(in_channels)
+ else:
+ return LinAttnBlock(in_channels)
+
+
+class Model(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
+ super().__init__()
+ if use_linear_attn: attn_type = "linear"
+ self.ch = ch
+ self.temb_ch = self.ch*4
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+
+ self.use_timestep = use_timestep
+ if self.use_timestep:
+ # timestep embedding
+ self.temb = nn.Module()
+ self.temb.dense = nn.ModuleList([
+ torch.nn.Linear(self.ch,
+ self.temb_ch),
+ torch.nn.Linear(self.temb_ch,
+ self.temb_ch),
+ ])
+
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(in_channels,
+ self.ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ curr_res = resolution
+ in_ch_mult = (1,)+tuple(ch_mult)
+ self.down = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ down = nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions-1:
+ down.downsample = Downsample(block_in, resamp_with_conv)
+ curr_res = curr_res // 2
+ self.down.append(down)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # upsampling
+ self.up = nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_out = ch*ch_mult[i_level]
+ skip_in = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks+1):
+ if i_block == self.num_res_blocks:
+ skip_in = ch*in_ch_mult[i_level]
+ block.append(ResnetBlock(in_channels=block_in+skip_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ up = nn.Module()
+ up.block = block
+ up.attn = attn
+ if i_level != 0:
+ up.upsample = Upsample(block_in, resamp_with_conv)
+ curr_res = curr_res * 2
+ self.up.insert(0, up) # prepend to get consistent order
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x, t=None, context=None):
+ #assert x.shape[2] == x.shape[3] == self.resolution
+ if context is not None:
+ # assume aligned context, cat along channel axis
+ x = torch.cat((x, context), dim=1)
+ if self.use_timestep:
+ # timestep embedding
+ assert t is not None
+ temb = get_timestep_embedding(t, self.ch)
+ temb = self.temb.dense[0](temb)
+ temb = nonlinearity(temb)
+ temb = self.temb.dense[1](temb)
+ else:
+ temb = None
+
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1], temb)
+ if len(self.down[i_level].attn) > 0:
+ h = self.down[i_level].attn[i_block](h)
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+
+ # middle
+ h = hs[-1]
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h = self.up[i_level].block[i_block](
+ torch.cat([h, hs.pop()], dim=1), temb)
+ if len(self.up[i_level].attn) > 0:
+ h = self.up[i_level].attn[i_block](h)
+ if i_level != 0:
+ h = self.up[i_level].upsample(h)
+
+ # end
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+ def get_last_layer(self):
+ return self.conv_out.weight
+
+
+class Encoder(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
+ **ignore_kwargs):
+ super().__init__()
+ if use_linear_attn: attn_type = "linear"
+ self.ch = ch
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(in_channels,
+ self.ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ curr_res = resolution
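+        # Note: this vendored copy expects `resolution` to be an (H, W) pair; curr_res is
+        # halved per axis in the downsampling loop below.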
+ in_ch_mult = (1,)+tuple(ch_mult)
+ self.in_ch_mult = in_ch_mult
+ self.down = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ down = nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions-1:
+ down.downsample = Downsample(block_in, resamp_with_conv)
+ curr_res = (curr_res[0] // 2, curr_res[1] // 2)
+ self.down.append(down)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ 2*z_channels if double_z else z_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ # timestep embedding
+ temb = None
+
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1], temb)
+ if len(self.down[i_level].attn) > 0:
+ h = self.down[i_level].attn[i_block](h)
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+
+ # middle
+ h = hs[-1]
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # end
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+
+class Decoder(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
+ attn_type="vanilla", **ignorekwargs):
+ super().__init__()
+ if use_linear_attn: attn_type = "linear"
+ self.ch = ch
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+ self.give_pre_end = give_pre_end
+ self.tanh_out = tanh_out
+
+ # compute in_ch_mult, block_in and curr_res at lowest res
+ in_ch_mult = (1,)+tuple(ch_mult)
+ block_in = ch*ch_mult[self.num_resolutions-1]
+ curr_res = (resolution[0] // 2**(self.num_resolutions-1), resolution[1] // 2**(self.num_resolutions-1))
+ self.z_shape = (1,z_channels,curr_res[0],curr_res[1])
+ print("Working with z of shape {} = {} dimensions.".format(
+ self.z_shape, np.prod(self.z_shape)))
+
+ # z to block_in
+ self.conv_in = torch.nn.Conv2d(z_channels,
+ block_in,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # upsampling
+ self.up = nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks+1):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ up = nn.Module()
+ up.block = block
+ up.attn = attn
+ if i_level != 0:
+ up.upsample = Upsample(block_in, resamp_with_conv)
+ curr_res = (curr_res[0] * 2, curr_res[1] * 2)
+ self.up.insert(0, up) # prepend to get consistent order
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, z):
+ #assert z.shape[1:] == self.z_shape[1:]
+ self.last_z_shape = z.shape
+
+ # timestep embedding
+ temb = None
+
+ # z to block_in
+ h = self.conv_in(z)
+
+ # middle
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h = self.up[i_level].block[i_block](h, temb)
+ if len(self.up[i_level].attn) > 0:
+ h = self.up[i_level].attn[i_block](h)
+ if i_level != 0:
+ h = self.up[i_level].upsample(h)
+
+ # end
+ if self.give_pre_end:
+ return h
+
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ if self.tanh_out:
+ h = torch.tanh(h)
+ return h
+
+
+class SimpleDecoder(nn.Module):
+ def __init__(self, in_channels, out_channels, *args, **kwargs):
+ super().__init__()
+ self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
+ ResnetBlock(in_channels=in_channels,
+ out_channels=2 * in_channels,
+ temb_channels=0, dropout=0.0),
+ ResnetBlock(in_channels=2 * in_channels,
+ out_channels=4 * in_channels,
+ temb_channels=0, dropout=0.0),
+ ResnetBlock(in_channels=4 * in_channels,
+ out_channels=2 * in_channels,
+ temb_channels=0, dropout=0.0),
+ nn.Conv2d(2*in_channels, in_channels, 1),
+ Upsample(in_channels, with_conv=True)])
+ # end
+ self.norm_out = Normalize(in_channels)
+ self.conv_out = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ for i, layer in enumerate(self.model):
+ if i in [1,2,3]:
+ x = layer(x, None)
+ else:
+ x = layer(x)
+
+ h = self.norm_out(x)
+ h = nonlinearity(h)
+ x = self.conv_out(h)
+ return x
+
+
+class UpsampleDecoder(nn.Module):
+ def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
+ ch_mult=(2,2), dropout=0.0):
+ super().__init__()
+ # upsampling
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ block_in = in_channels
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
+ self.res_blocks = nn.ModuleList()
+ self.upsample_blocks = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ res_block = []
+ block_out = ch * ch_mult[i_level]
+ for i_block in range(self.num_res_blocks + 1):
+ res_block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ self.res_blocks.append(nn.ModuleList(res_block))
+ if i_level != self.num_resolutions - 1:
+ self.upsample_blocks.append(Upsample(block_in, True))
+ curr_res = curr_res * 2
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ # upsampling
+ h = x
+ for k, i_level in enumerate(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks + 1):
+ h = self.res_blocks[i_level][i_block](h, None)
+ if i_level != self.num_resolutions - 1:
+ h = self.upsample_blocks[k](h)
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+
+class LatentRescaler(nn.Module):
+ def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
+ super().__init__()
+ # residual block, interpolate, residual block
+ self.factor = factor
+ self.conv_in = nn.Conv2d(in_channels,
+ mid_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
+ out_channels=mid_channels,
+ temb_channels=0,
+ dropout=0.0) for _ in range(depth)])
+ self.attn = AttnBlock(mid_channels)
+ self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
+ out_channels=mid_channels,
+ temb_channels=0,
+ dropout=0.0) for _ in range(depth)])
+
+ self.conv_out = nn.Conv2d(mid_channels,
+ out_channels,
+ kernel_size=1,
+ )
+
+ def forward(self, x):
+ x = self.conv_in(x)
+ for block in self.res_block1:
+ x = block(x, None)
+ x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor))))
+ x = self.attn(x)
+ for block in self.res_block2:
+ x = block(x, None)
+ x = self.conv_out(x)
+ return x
+
+
+class MergedRescaleEncoder(nn.Module):
+ def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True,
+ ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
+ super().__init__()
+ intermediate_chn = ch * ch_mult[-1]
+ self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
+ z_channels=intermediate_chn, double_z=False, resolution=resolution,
+ attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
+ out_ch=None)
+ self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn,
+ mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)
+
+ def forward(self, x):
+ x = self.encoder(x)
+ x = self.rescaler(x)
+ return x
+
+
+class MergedRescaleDecoder(nn.Module):
+ def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
+ dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
+ super().__init__()
+ tmp_chn = z_channels*ch_mult[-1]
+ self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
+ resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
+ ch_mult=ch_mult, resolution=resolution, ch=ch)
+ self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
+ out_channels=tmp_chn, depth=rescale_module_depth)
+
+ def forward(self, x):
+ x = self.rescaler(x)
+ x = self.decoder(x)
+ return x
+
+
+class Upsampler(nn.Module):
+ def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
+ super().__init__()
+ assert out_size >= in_size
+ num_blocks = int(np.log2(out_size//in_size))+1
+ factor_up = 1.+ (out_size % in_size)
+ print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
+ self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels,
+ out_channels=in_channels)
+ self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2,
+ attn_resolutions=[], in_channels=None, ch=in_channels,
+ ch_mult=[ch_mult for _ in range(num_blocks)])
+
+ def forward(self, x):
+ x = self.rescaler(x)
+ x = self.decoder(x)
+ return x
+
+
+class Resize(nn.Module):
+ def __init__(self, in_channels=None, learned=False, mode="bilinear"):
+ super().__init__()
+ self.with_conv = learned
+ self.mode = mode
+ if self.with_conv:
+ print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
+ raise NotImplementedError()
+ assert in_channels is not None
+ # no asymmetric padding in torch conv, must do it ourselves
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=4,
+ stride=2,
+ padding=1)
+
+ def forward(self, x, scale_factor=1.0):
+ if scale_factor==1.0:
+ return x
+ else:
+ x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
+ return x
+
+class FirstStagePostProcessor(nn.Module):
+
+ def __init__(self, ch_mult:list, in_channels,
+ pretrained_model:nn.Module=None,
+ reshape=False,
+ n_channels=None,
+ dropout=0.,
+ pretrained_config=None):
+ super().__init__()
+ if pretrained_config is None:
+ assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
+ self.pretrained_model = pretrained_model
+ else:
+ assert pretrained_config is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
+ self.instantiate_pretrained(pretrained_config)
+
+ self.do_reshape = reshape
+
+ if n_channels is None:
+ n_channels = self.pretrained_model.encoder.ch
+
+ self.proj_norm = Normalize(in_channels,num_groups=in_channels//2)
+ self.proj = nn.Conv2d(in_channels,n_channels,kernel_size=3,
+ stride=1,padding=1)
+
+ blocks = []
+ downs = []
+ ch_in = n_channels
+ for m in ch_mult:
+ blocks.append(ResnetBlock(in_channels=ch_in,out_channels=m*n_channels,dropout=dropout))
+ ch_in = m * n_channels
+ downs.append(Downsample(ch_in, with_conv=False))
+
+ self.model = nn.ModuleList(blocks)
+ self.downsampler = nn.ModuleList(downs)
+
+
+ def instantiate_pretrained(self, config):
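+        # `instantiate_from_config` lives in ldm.util, whose import is commented out at the
+        # top of this vendored file; this path therefore assumes `pretrained_model` is passed
+        # in directly rather than built from `pretrained_config`.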
+ model = instantiate_from_config(config)
+ self.pretrained_model = model.eval()
+ # self.pretrained_model.train = False
+ for param in self.pretrained_model.parameters():
+ param.requires_grad = False
+
+
+ @torch.no_grad()
+ def encode_with_pretrained(self,x):
+ c = self.pretrained_model.encode(x)
+ if isinstance(c, DiagonalGaussianDistribution):
+ c = c.mode()
+ return c
+
+ def forward(self,x):
+ z_fs = self.encode_with_pretrained(x)
+ z = self.proj_norm(z_fs)
+ z = self.proj(z)
+ z = nonlinearity(z)
+
+ for submodel, downmodel in zip(self.model,self.downsampler):
+ z = submodel(z,temb=None)
+ z = downmodel(z)
+
+ if self.do_reshape:
+ z = rearrange(z,'b c h w -> b (h w) c')
+ return z
+
+class DiagonalGaussianDistribution(object):
+ def __init__(self, parameters, deterministic=False):
+ self.parameters = parameters
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+ self.deterministic = deterministic
+ self.std = torch.exp(0.5 * self.logvar)
+ self.var = torch.exp(self.logvar)
+ if self.deterministic:
+ self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
+
+ def sample(self):
+ x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
+ return x
+
+ def kl(self, other=None):
+ if self.deterministic:
+ return torch.Tensor([0.])
+ else:
+ if other is None:
+ return 0.5 * torch.sum(torch.pow(self.mean, 2)
+ + self.var - 1.0 - self.logvar,
+ dim=[1, 2, 3])
+ else:
+ return 0.5 * torch.sum(
+ torch.pow(self.mean - other.mean, 2) / other.var
+ + self.var / other.var - 1.0 - self.logvar + other.logvar,
+ dim=[1, 2, 3])
+
+ def nll(self, sample, dims=[1,2,3]):
+ if self.deterministic:
+ return torch.Tensor([0.])
+ logtwopi = np.log(2.0 * np.pi)
+ return 0.5 * torch.sum(
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+ dim=dims)
+
+ def mode(self):
+ return self.mean
+
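+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# DiagonalGaussianDistribution.kl() is the closed-form KL of N(mu, sigma^2) against the
+# standard normal, 0.5 * sum(mu^2 + sigma^2 - 1 - log sigma^2). The uncalled helper below
+# (name is ours) is a minimal sanity check with zero mean and unit variance.
+def _diag_gaussian_kl_sanity_check():
+    # parameters are [mean, logvar] concatenated along the channel axis
+    params = torch.cat([torch.zeros(2, 4, 8, 8), torch.zeros(2, 4, 8, 8)], dim=1)
+    dist = DiagonalGaussianDistribution(params)
+    assert torch.allclose(dist.kl(), torch.zeros(2))             # KL(N(0, I) || N(0, I)) == 0
+    assert torch.allclose(dist.mode(), torch.zeros(2, 4, 8, 8))  # the mode of a Gaussian is its mean
+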
+class AutoencoderKL(nn.Module):
+ def __init__(self,
+ ddconfig,
+ lossconfig,
+ embed_dim,
+ ckpt_path=None,
+ ignore_keys=[],
+ image_key="image",
+ colorize_nlabels=None,
+ monitor=None,
+ ):
+ super().__init__()
+ self.image_key = image_key
+ self.encoder = Encoder(**ddconfig)
+ self.decoder = Decoder(**ddconfig)
+ self.down_ratio = 2 ** (len(ddconfig['ch_mult']) - 1)
+ self.loss = LPIPSWithDiscriminator(**lossconfig)
+ assert ddconfig["double_z"]
+ self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+ self.embed_dim = embed_dim
+ if colorize_nlabels is not None:
+ assert type(colorize_nlabels)==int
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+ if monitor is not None:
+ self.monitor = monitor
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+
+ def init_from_ckpt(self, path, ignore_keys=list(), use_ema=True):
+ sd = torch.load(path, map_location="cpu")
+ sd_keys = sd.keys()
+ if 'ema' in list(sd.keys()) and use_ema:
+ sd = sd['ema']
+ new_sd = {}
+ for k in sd.keys():
+ if k.startswith("ema_model."):
+ new_k = k[10:] # remove ema_model.
+ new_sd[new_k] = sd[k]
+ sd = new_sd
+ else:
+ if 'model' in sd_keys:
+ sd = sd["model"]
+ elif 'state_dict' in sd_keys:
+ sd = sd['state_dict']
+            else:
+                # no recognized wrapper key; keep the raw state dict
+                pass
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ msg = self.load_state_dict(sd, strict=False)
+ print(f"Restored from {path}")
+ print('==>Load AutoEncoder Info: ', msg)
+
+ def encode(self, x):
+ h = self.encoder(x)
+ moments = self.quant_conv(h)
+ posterior = DiagonalGaussianDistribution(moments)
+ return posterior
+
+ def decode(self, z):
+ z = self.post_quant_conv(z)
+ dec = self.decoder(z)
+ return dec
+
+ def forward(self, input, sample_posterior=True):
+ posterior = self.encode(input)
+ if sample_posterior:
+ z = posterior.sample()
+ else:
+ z = posterior.mode()
+ dec = self.decode(z)
+ return dec, posterior
+
+ def get_input(self, batch, k):
+ x = batch[k]
+ if len(x.shape) == 3:
+ x = x[..., None]
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+ return x
+
+ def training_step(self, inputs, optimizer_idx, global_step):
+ # inputs = self.get_input(batch, self.image_key)
+ reconstructions, posterior = self(inputs)
+
+ if optimizer_idx == 0:
+ # train encoder+decoder+logvar
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, global_step,
+ last_layer=self.get_last_layer(), split="train")
+ # self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+ # self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+ return aeloss, log_dict_ae
+
+ if optimizer_idx == 1:
+ # train the discriminator
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, global_step,
+ last_layer=self.get_last_layer(), split="train")
+
+ # self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+ # self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+ return discloss, log_dict_disc
+
+ def validation_step(self, inputs, global_step):
+ # inputs = self.get_input(batch, self.image_key)
+ reconstructions, posterior = self(inputs)
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, global_step,
+ last_layer=self.get_last_layer(), split="val")
+
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, global_step,
+ last_layer=self.get_last_layer(), split="val")
+
+ # self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+ # self.log_dict(log_dict_ae)
+ # self.log_dict(log_dict_disc)
+ return log_dict_ae, log_dict_disc
+
+ def validate_img(self, inputs):
+ reconstructions, posterior = self(inputs)
+ return reconstructions
+
+ # def configure_optimizers(self):
+ # lr = self.learning_rate
+ # opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+ # list(self.decoder.parameters())+
+ # list(self.quant_conv.parameters())+
+ # list(self.post_quant_conv.parameters()),
+ # lr=lr, betas=(0.5, 0.9))
+ # opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+ # lr=lr, betas=(0.5, 0.9))
+ # return [opt_ae, opt_disc], []
+
+ def get_last_layer(self):
+ return self.decoder.conv_out.weight
+ '''
+ @torch.no_grad()
+ def log_images(self, batch, only_inputs=False, **kwargs):
+ log = dict()
+ x = self.get_input(batch, self.image_key)
+ x = x.to(self.device)
+ if not only_inputs:
+ xrec, posterior = self(x)
+ if x.shape[1] > 3:
+ # colorize with random projection
+ assert xrec.shape[1] > 3
+ x = self.to_rgb(x)
+ xrec = self.to_rgb(xrec)
+ log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+ log["reconstructions"] = xrec
+ log["inputs"] = x
+ return log
+ def to_rgb(self, x):
+ assert self.image_key == "segmentation"
+ if not hasattr(self, "colorize"):
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+ x = nn.functional.conv2d(x, weight=self.colorize)
+ x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+ return x
+ '''
+
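+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# configure_optimizers() is commented out above, so training_step() is meant to be driven
+# by an external loop that alternates the autoencoder and discriminator updates. The
+# uncalled helper below is a minimal illustration of such a loop; the optimizer settings
+# mirror the commented-out code, and `data_loader` / `lr` / `steps` are placeholders.
+def _train_autoencoder_sketch(model, data_loader, steps=1000, lr=4.5e-6):
+    opt_ae = torch.optim.Adam(list(model.encoder.parameters()) +
+                              list(model.decoder.parameters()) +
+                              list(model.quant_conv.parameters()) +
+                              list(model.post_quant_conv.parameters()),
+                              lr=lr, betas=(0.5, 0.9))
+    opt_disc = torch.optim.Adam(model.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9))
+    for global_step, inputs in enumerate(data_loader):
+        if global_step >= steps:
+            break
+        # autoencoder / generator update (optimizer_idx == 0)
+        aeloss, _ = model.training_step(inputs, optimizer_idx=0, global_step=global_step)
+        opt_ae.zero_grad()
+        aeloss.backward()
+        opt_ae.step()
+        # discriminator update (optimizer_idx == 1)
+        discloss, _ = model.training_step(inputs, optimizer_idx=1, global_step=global_step)
+        opt_disc.zero_grad()
+        discloss.backward()
+        opt_disc.step()
+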
+if __name__ == '__main__':
+ ddconfig = {'double_z': True,
+ 'z_channels': 4,
+ 'resolution': (240, 960),
+ 'in_channels': 3,
+ 'out_ch': 3,
+ 'ch': 128,
+ 'ch_mult': [ 1,2,4 ], # num_down = len(ch_mult)-1
+ 'num_res_blocks': 2,
+ 'attn_resolutions': [ ],
+ 'dropout': 0.0}
+ lossconfig = {'disc_start': 50001,
+ 'kl_weight': 0.000001,
+ 'disc_weight': 0.5}
+ model = AutoencoderKL(ddconfig, lossconfig, embed_dim=4,
+ ckpt_path='/pretrain_weights/model-kl-f8.ckpt', )
+ '''
+ from torch.optim import AdamW
+ optimizer = AdamW(model.parameters(), lr=0.01)
+ lr_lambda = lambda iter: (1 - iter / 1000) ** 0.95
+ lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
+ for s in range(1000):
+ lr_scheduler.step()
+ cur_lr = optimizer.param_groups[0]['lr']
+ print(cur_lr)
+ '''
+ x = torch.rand(1, 3, 240, 960)
+ with torch.no_grad():
+ y = model(x)
+ pass
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/imagenet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/imagenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f0dda683f940e7aa12780a6adf1506f62bfdbb4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/imagenet.py
@@ -0,0 +1,395 @@
+import os, yaml, pickle, shutil, tarfile, glob
+import cv2
+import custom_albumentations as albumentations
+import PIL
+import numpy as np
+import torchvision.transforms.functional as TF
+# from omegaconf import OmegaConf
+from functools import partial
+from PIL import Image
+from tqdm import tqdm
+from torch.utils.data import Dataset, Subset
+
+import custom_controlnet_aux.diffusion_edge.taming.data.utils as tdu
+from custom_controlnet_aux.diffusion_edge.taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
+from custom_controlnet_aux.diffusion_edge.taming.data.imagenet import ImagePaths
+
+# from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
+
+
+def synset2idx(path_to_yaml="data/index_synset.yaml"):
+ with open(path_to_yaml) as f:
+        di2s = yaml.safe_load(f)
+ return dict((v,k) for k,v in di2s.items())
+
+
+class ImageNetBase(Dataset):
+ def __init__(self, config=None):
+        self.config = config if config is not None else dict()
+ # if not type(self.config)==dict:
+ # self.config = OmegaConf.to_container(self.config)
+ self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
+ self.process_images = True # if False we skip loading & processing images and self.data contains filepaths
+ self._prepare()
+ self._prepare_synset_to_human()
+ self._prepare_idx_to_synset()
+ self._prepare_human_to_integer_label()
+ self._load()
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, i):
+ return self.data[i]
+
+ def _prepare(self):
+ raise NotImplementedError()
+
+ def _filter_relpaths(self, relpaths):
+ ignore = set([
+ "n06596364_9591.JPEG",
+ ])
+ relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
+ if "sub_indices" in self.config:
+ indices = str_to_indices(self.config["sub_indices"])
+ synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings
+ self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
+ files = []
+ for rpath in relpaths:
+ syn = rpath.split("/")[0]
+ if syn in synsets:
+ files.append(rpath)
+ return files
+ else:
+ return relpaths
+
+ def _prepare_synset_to_human(self):
+ SIZE = 2655750
+ URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
+ self.human_dict = os.path.join(self.root, "synset_human.txt")
+ if (not os.path.exists(self.human_dict) or
+ not os.path.getsize(self.human_dict)==SIZE):
+ download(URL, self.human_dict)
+
+ def _prepare_idx_to_synset(self):
+ URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
+ self.idx2syn = os.path.join(self.root, "index_synset.yaml")
+ if (not os.path.exists(self.idx2syn)):
+ download(URL, self.idx2syn)
+
+ def _prepare_human_to_integer_label(self):
+ URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
+ self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
+ if (not os.path.exists(self.human2integer)):
+ download(URL, self.human2integer)
+ with open(self.human2integer, "r") as f:
+ lines = f.read().splitlines()
+ assert len(lines) == 1000
+ self.human2integer_dict = dict()
+ for line in lines:
+ value, key = line.split(":")
+ self.human2integer_dict[key] = int(value)
+
+ def _load(self):
+ with open(self.txt_filelist, "r") as f:
+ self.relpaths = f.read().splitlines()
+ l1 = len(self.relpaths)
+ self.relpaths = self._filter_relpaths(self.relpaths)
+ print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
+
+ self.synsets = [p.split("/")[0] for p in self.relpaths]
+ self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
+
+ unique_synsets = np.unique(self.synsets)
+ class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
+ if not self.keep_orig_class_label:
+ self.class_labels = [class_dict[s] for s in self.synsets]
+ else:
+ self.class_labels = [self.synset2idx[s] for s in self.synsets]
+
+ with open(self.human_dict, "r") as f:
+ human_dict = f.read().splitlines()
+ human_dict = dict(line.split(maxsplit=1) for line in human_dict)
+
+ self.human_labels = [human_dict[s] for s in self.synsets]
+
+ labels = {
+ "relpath": np.array(self.relpaths),
+ "synsets": np.array(self.synsets),
+ "class_label": np.array(self.class_labels),
+ "human_label": np.array(self.human_labels),
+ }
+
+ if self.process_images:
+ # self.size = retrieve(self.config, "size", default=256)
+            self.size = self.config.get("size", 256)
+ self.data = ImagePaths(self.abspaths,
+ labels=labels,
+ size=self.size,
+ random_crop=self.random_crop,
+ )
+ else:
+ self.data = self.abspaths
+
+
+class ImageNetTrain(ImageNetBase):
+ NAME = "ILSVRC2012_train"
+ URL = "http://www.image-net.org/challenges/LSVRC/2012/"
+ AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
+ FILES = [
+ "ILSVRC2012_img_train.tar",
+ ]
+ SIZES = [
+ 147897477120,
+ ]
+
+ def __init__(self, process_images=True, data_root=None, **kwargs):
+ self.process_images = process_images
+ self.data_root = data_root
+ super().__init__(**kwargs)
+
+ def _prepare(self):
+ if self.data_root:
+ self.root = os.path.join(self.data_root, self.NAME)
+ else:
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
+ self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
+
+ self.datadir = os.path.join(self.root, "data")
+ self.txt_filelist = os.path.join(self.root, "filelist.txt")
+ self.expected_length = 1281167
+ # self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop", default=True)
+        self.random_crop = self.config.get("random_crop", True)
+ if not tdu.is_prepared(self.root):
+ # prep
+ print("Preparing dataset {} in {}".format(self.NAME, self.root))
+
+ datadir = self.datadir
+ if not os.path.exists(datadir):
+ path = os.path.join(self.root, self.FILES[0])
+ if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+ import academictorrents as at
+ atpath = at.get(self.AT_HASH, datastore=self.root)
+ assert atpath == path
+
+ print("Extracting {} to {}".format(path, datadir))
+ os.makedirs(datadir, exist_ok=True)
+ with tarfile.open(path, "r:") as tar:
+ tar.extractall(path=datadir)
+
+ print("Extracting sub-tars.")
+ subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
+ for subpath in tqdm(subpaths):
+ subdir = subpath[:-len(".tar")]
+ os.makedirs(subdir, exist_ok=True)
+ with tarfile.open(subpath, "r:") as tar:
+ tar.extractall(path=subdir)
+
+ filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+ filelist = [os.path.relpath(p, start=datadir) for p in filelist]
+ filelist = sorted(filelist)
+ filelist = "\n".join(filelist)+"\n"
+ with open(self.txt_filelist, "w") as f:
+ f.write(filelist)
+
+ tdu.mark_prepared(self.root)
+
+
+class ImageNetValidation(ImageNetBase):
+ NAME = "ILSVRC2012_validation"
+ URL = "http://www.image-net.org/challenges/LSVRC/2012/"
+ AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
+ VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
+ FILES = [
+ "ILSVRC2012_img_val.tar",
+ "validation_synset.txt",
+ ]
+ SIZES = [
+ 6744924160,
+ 1950000,
+ ]
+
+ def __init__(self, process_images=True, data_root=None, **kwargs):
+ self.data_root = data_root
+ self.process_images = process_images
+ super().__init__(**kwargs)
+
+ def _prepare(self):
+ if self.data_root:
+ self.root = os.path.join(self.data_root, self.NAME)
+ else:
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
+ self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
+ self.datadir = os.path.join(self.root, "data")
+ self.txt_filelist = os.path.join(self.root, "filelist.txt")
+ self.expected_length = 50000
+ # self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop", default=False)
+        self.random_crop = self.config.get("random_crop", False)
+ if not tdu.is_prepared(self.root):
+ # prep
+ print("Preparing dataset {} in {}".format(self.NAME, self.root))
+
+ datadir = self.datadir
+ if not os.path.exists(datadir):
+ path = os.path.join(self.root, self.FILES[0])
+ if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+ import academictorrents as at
+ atpath = at.get(self.AT_HASH, datastore=self.root)
+ assert atpath == path
+
+ print("Extracting {} to {}".format(path, datadir))
+ os.makedirs(datadir, exist_ok=True)
+ with tarfile.open(path, "r:") as tar:
+ tar.extractall(path=datadir)
+
+ vspath = os.path.join(self.root, self.FILES[1])
+ if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
+ download(self.VS_URL, vspath)
+
+ with open(vspath, "r") as f:
+ synset_dict = f.read().splitlines()
+ synset_dict = dict(line.split() for line in synset_dict)
+
+ print("Reorganizing into synset folders")
+ synsets = np.unique(list(synset_dict.values()))
+ for s in synsets:
+ os.makedirs(os.path.join(datadir, s), exist_ok=True)
+ for k, v in synset_dict.items():
+ src = os.path.join(datadir, k)
+ dst = os.path.join(datadir, v)
+ shutil.move(src, dst)
+
+ filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+ filelist = [os.path.relpath(p, start=datadir) for p in filelist]
+ filelist = sorted(filelist)
+ filelist = "\n".join(filelist)+"\n"
+ with open(self.txt_filelist, "w") as f:
+ f.write(filelist)
+
+ tdu.mark_prepared(self.root)
+
+
+
+class ImageNetSR(Dataset):
+ def __init__(self, size=None,
+ degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
+ random_crop=True):
+ """
+ Imagenet Superresolution Dataloader
+        Performs the following ops in order:
+        1. crops a patch of size s from the image, either as a random or a center crop
+        2. resizes the crop to `size` with cv2 area interpolation
+        3. degrades the resized crop with degradation_fn
+
+ :param size: resizing to size after cropping
+ :param degradation: degradation_fn, e.g. cv_bicubic or bsrgan_light
+ :param downscale_f: Low Resolution Downsample factor
+ :param min_crop_f: determines crop size s,
+ where s = c * min_img_side_len with c sampled from interval (min_crop_f, max_crop_f)
+ :param max_crop_f: ""
+ :param data_root:
+ :param random_crop:
+ """
+ self.base = self.get_base()
+ assert size
+ assert (size / downscale_f).is_integer()
+ self.size = size
+ self.LR_size = int(size / downscale_f)
+ self.min_crop_f = min_crop_f
+ self.max_crop_f = max_crop_f
+ assert(max_crop_f <= 1.)
+ self.center_crop = not random_crop
+
+ self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
+
+        self.pil_interpolation = False  # gets reset later in case interp_op is from pillow
+
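+        # NOTE (added comment): the "bsrgan"/"bsrgan_light" branches below rely on
+        # degradation_fn_bsr(_light), whose import is commented out at the top of this
+        # file, so only the cv_* / pil_* degradation modes are usable as-is.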
+ if degradation == "bsrgan":
+ self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
+
+ elif degradation == "bsrgan_light":
+ self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
+
+ else:
+ interpolation_fn = {
+ "cv_nearest": cv2.INTER_NEAREST,
+ "cv_bilinear": cv2.INTER_LINEAR,
+ "cv_bicubic": cv2.INTER_CUBIC,
+ "cv_area": cv2.INTER_AREA,
+ "cv_lanczos": cv2.INTER_LANCZOS4,
+ "pil_nearest": PIL.Image.NEAREST,
+ "pil_bilinear": PIL.Image.BILINEAR,
+ "pil_bicubic": PIL.Image.BICUBIC,
+ "pil_box": PIL.Image.BOX,
+ "pil_hamming": PIL.Image.HAMMING,
+ "pil_lanczos": PIL.Image.LANCZOS,
+ }[degradation]
+
+ self.pil_interpolation = degradation.startswith("pil_")
+
+ if self.pil_interpolation:
+ self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
+
+ else:
+ self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
+ interpolation=interpolation_fn)
+
+ def __len__(self):
+ return len(self.base)
+
+ def __getitem__(self, i):
+ example = self.base[i]
+ image = Image.open(example["file_path_"])
+
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+
+ image = np.array(image).astype(np.uint8)
+
+ min_side_len = min(image.shape[:2])
+ crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
+ crop_side_len = int(crop_side_len)
+
+ if self.center_crop:
+ self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
+
+ else:
+ self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
+
+ image = self.cropper(image=image)["image"]
+ image = self.image_rescaler(image=image)["image"]
+
+ if self.pil_interpolation:
+ image_pil = PIL.Image.fromarray(image)
+ LR_image = self.degradation_process(image_pil)
+ LR_image = np.array(LR_image).astype(np.uint8)
+
+ else:
+ LR_image = self.degradation_process(image=image)["image"]
+
+ example["image"] = (image/127.5 - 1.0).astype(np.float32)
+ example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
+
+ return example
+
+
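+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# ImageNetSR samples a crop of side s = c * min_side_len with c ~ U(min_crop_f, max_crop_f),
+# rescales the crop so its smallest side equals `size`, and finally degrades it to an LR
+# version of side size / downscale_f. The uncalled helper below (name and default image
+# dimensions are ours) just spells out that arithmetic.
+def _imagenet_sr_size_sketch(size=256, downscale_f=4, min_crop_f=0.5, max_crop_f=1.0,
+                             img_h=480, img_w=640):
+    lr_size = int(size / downscale_f)                  # 64 for the defaults above
+    min_side_len = min(img_h, img_w)                   # 480
+    crop_side_len = int(min_side_len * np.random.uniform(min_crop_f, max_crop_f))
+    return {"crop_side_len": crop_side_len,            # somewhere in [240, 480]
+            "hr_size": size, "lr_size": lr_size}
+
+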
+class ImageNetSRTrain(ImageNetSR):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def get_base(self):
+ with open("data/imagenet_train_hr_indices.p", "rb") as f:
+ indices = pickle.load(f)
+ dset = ImageNetTrain(process_images=False,)
+ return Subset(dset, indices)
+
+
+class ImageNetSRValidation(ImageNetSR):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def get_base(self):
+ with open("data/imagenet_val_hr_indices.p", "rb") as f:
+ indices = pickle.load(f)
+ dset = ImageNetValidation(process_images=False,)
+ return Subset(dset, indices)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/loss.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..920e1dd004df0b976c55441c82f911abb5f4d085
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/loss.py
@@ -0,0 +1,113 @@
+import torch
+import torch.nn as nn
+import sys
+from custom_controlnet_aux.diffusion_edge.taming.modules.losses.vqperceptual import *
+
+
+class LPIPSWithDiscriminator(nn.Module):
+ def __init__(self, *, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
+ disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
+ perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
+ disc_loss="hinge"):
+
+ super().__init__()
+ assert disc_loss in ["hinge", "vanilla"]
+ self.kl_weight = kl_weight
+ self.pixel_weight = pixelloss_weight
+ self.perceptual_loss = LPIPS().eval()
+ self.perceptual_weight = perceptual_weight
+ # output log variance
+ self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
+
+ self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
+ n_layers=disc_num_layers,
+ use_actnorm=use_actnorm
+ ).apply(weights_init)
+ self.discriminator_iter_start = disc_start
+ self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
+ self.disc_factor = disc_factor
+ self.discriminator_weight = disc_weight
+ self.disc_conditional = disc_conditional
+
+ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+ if last_layer is not None:
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+ else:
+ nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
+ d_weight = d_weight * self.discriminator_weight
+ return d_weight
+
+ def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
+ global_step, last_layer=None, cond=None, split="train",
+ weights=None):
+ rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) + \
+ F.mse_loss(inputs, reconstructions, reduction="none")
+ if self.perceptual_weight > 0:
+ p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
+
+ nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
+ weighted_nll_loss = nll_loss
+ if weights is not None:
+ weighted_nll_loss = weights*nll_loss
+ weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+ nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+ kl_loss = posteriors.kl()
+ kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
+
+ # now the GAN part
+ if optimizer_idx == 0:
+ # generator update
+ if cond is None:
+ assert not self.disc_conditional
+ logits_fake = self.discriminator(reconstructions.contiguous())
+ else:
+ assert self.disc_conditional
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+ g_loss = -torch.mean(logits_fake)
+
+ if self.disc_factor > 0.0:
+ try:
+ d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+ except RuntimeError:
+ assert not self.training
+ d_weight = torch.tensor(0.0)
+ else:
+ d_weight = torch.tensor(0.0)
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
+
+ log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(),
+ "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(),
+ "{}/rec_loss".format(split): rec_loss.detach().mean(),
+ "{}/d_weight".format(split): d_weight.detach(),
+ "{}/disc_factor".format(split): torch.tensor(disc_factor),
+ "{}/g_loss".format(split): g_loss.detach().mean(),
+ }
+ return loss, log
+
+ if optimizer_idx == 1:
+ # second pass for discriminator update
+ if cond is None:
+ logits_real = self.discriminator(inputs.contiguous().detach())
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
+ else:
+ logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
+
+ log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
+ "{}/logits_real".format(split): logits_real.detach().mean(),
+ "{}/logits_fake".format(split): logits_fake.detach().mean()
+ }
+ return d_loss, log
+
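+
+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# calculate_adaptive_weight() balances the GAN term against the reconstruction term via
+# the ratio of their gradient norms w.r.t. the decoder's last layer:
+#   d_weight = ||grad(nll)|| / (||grad(g)|| + 1e-4), clamped to [0, 1e4], then * disc_weight.
+# The uncalled helper below (name and toy losses are ours) reproduces that computation.
+def _adaptive_weight_sketch():
+    last_layer = torch.randn(8, 4, 3, 3, requires_grad=True)  # stand-in for conv_out.weight
+    nll_loss = (last_layer ** 2).sum()                         # toy "reconstruction" term
+    g_loss = last_layer.sum()                                  # toy "generator" term
+    nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+    g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+    d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+    return torch.clamp(d_weight, 0.0, 1e4).detach()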
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/mask_cond_unet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/mask_cond_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1673ad829c5501484bf5f80fde8355ed2728b4e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/mask_cond_unet.py
@@ -0,0 +1,1009 @@
+import fvcore.common.config
+import torch
+import torch.nn as nn
+import math
+import torch.nn.functional as F
+from functools import partial
+from einops import rearrange, reduce
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.efficientnet import efficientnet_b7, EfficientNet_B7_Weights
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.resnet import resnet101, ResNet101_Weights
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.swin_transformer import swin_b, Swin_B_Weights
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.vgg import vgg16, VGG16_Weights
+
+from custom_controlnet_aux.util import custom_torch_download
+# from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.wcc import fft
+### Compared to unet4:
+# 1. add FFT-Conv on the mid feature.
+######## Attention Layer ##########
+
+class PositionEmbeddingSine(nn.Module):
+ """
+ This is a more standard version of the position embedding, very similar to the one
+ used by the Attention is all you need paper, generalized to work on images.
+ """
+
+ def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+ super().__init__()
+ self.num_pos_feats = num_pos_feats
+ self.temperature = temperature
+ self.normalize = normalize
+ if scale is not None and normalize is False:
+ raise ValueError("normalize should be True if scale is passed")
+ if scale is None:
+ scale = 2 * math.pi
+ self.scale = scale
+ # self.class_token_pos = nn.Parameter(torch.zeros(1, 1, num_pos_feats * 2))
+ # self.class_token_pos
+
+ def forward(self, x):
+ # x: b, h, w, d
+ num_feats = x.shape[3]
+ num_pos_feats = num_feats // 2
+ # mask = tensor_list.mask
+ mask = torch.zeros(x.shape[0], x.shape[1], x.shape[2], device=x.device).to(torch.bool)
+ batch = mask.shape[0]
+ assert mask is not None
+ not_mask = ~mask
+ y_embed = not_mask.cumsum(1, dtype=torch.float32)
+ x_embed = not_mask.cumsum(2, dtype=torch.float32)
+ if self.normalize:
+ eps = 1e-5
+ y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+ x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+ dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=x.device)
+ dim_t = self.temperature ** (2 * (dim_t // 2) / num_pos_feats)
+
+ pos_x = x_embed[:, :, :, None] / dim_t
+ pos_y = y_embed[:, :, :, None] / dim_t
+ pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ # pos = torch.cat((pos_y, pos_x), dim=3).flatten(1, 2)
+ pos = torch.cat((pos_y, pos_x), dim=3).contiguous()
+ '''
+ pos_x: b ,h, w, d//2
+ pos_y: b, h, w, d//2
+ pos: b, h, w, d
+ '''
+ return pos
+
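+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# PositionEmbeddingSine takes channel-last input (b, h, w, d) and returns a tensor of the
+# same shape: the first d//2 channels encode the row index and the last d//2 the column
+# index as interleaved sin/cos at geometrically spaced frequencies. Note that forward()
+# derives the split from the input's last dimension rather than from num_pos_feats.
+def _sine_pos_embedding_shape_check():
+    pe = PositionEmbeddingSine(normalize=True)
+    x = torch.zeros(2, 16, 24, 64)  # b, h, w, d
+    pos = pe(x)
+    assert pos.shape == (2, 16, 24, 64)
+    return pos
+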
+class PositionEmbeddingLearned(nn.Module):
+ """
+ Absolute pos embedding, learned.
+ """
+ def __init__(self, feature_size, num_pos_feats=256):
+ super().__init__()
+ self.row_embed = nn.Embedding(feature_size[0], num_pos_feats)
+ self.col_embed = nn.Embedding(feature_size[1], num_pos_feats)
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ nn.init.uniform_(self.row_embed.weight)
+ nn.init.uniform_(self.col_embed.weight)
+
+ def forward(self, x):
+ h, w = x.shape[-2:]
+ i = torch.arange(w, device=x.device)
+ j = torch.arange(h, device=x.device)
+ x_emb = self.col_embed(i)
+ y_emb = self.row_embed(j)
+ pos = torch.cat([
+ x_emb.unsqueeze(0).repeat(h, 1, 1),
+ y_emb.unsqueeze(1).repeat(1, w, 1),
+ ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+ return torch.cat([x, pos], dim=1)
+
+
+class ChannelAttention(nn.Module):
+ def __init__(self, in_planes, ratio=8):
+ super(ChannelAttention, self).__init__()
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
+ self.max_pool = nn.AdaptiveMaxPool2d(1)
+
+ self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False)
+ self.relu1 = nn.ReLU()
+ self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False)
+
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, x):
+ avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))
+ max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))
+ out = avg_out + max_out
+ return self.sigmoid(out) * x
+
+class SpatialAtt(nn.Module):
+ def __init__(self, in_dim):
+ super(SpatialAtt, self).__init__()
+ self.map = nn.Conv2d(in_dim, 1, 1)
+ self.q_conv = nn.Conv2d(1, 1, 1)
+ self.k_conv = nn.Conv2d(1, 1, 1)
+ self.activation = nn.Softsign()
+
+ def forward(self, x):
+ b, _, h, w = x.shape
+ att = self.map(x) # b, 1, h, w
+ q = self.q_conv(att) # b, 1, h, w
+ q = rearrange(q, 'b c h w -> b (h w) c')
+ k = self.k_conv(att)
+ k = rearrange(k, 'b c h w -> b c (h w)')
+ att = rearrange(att, 'b c h w -> b (h w) c')
+ att = F.softmax(q @ k, dim=-1) @ att # b, hw, 1
+ att = att.reshape(b, 1, h, w)
+ return self.activation(att) * x
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ # self.fc1 = nn.Linear(in_features, hidden_features)
+ self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1)
+ self.act = act_layer()
+ self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1)
+ # self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+
+class BasicAttetnionLayer(nn.Module):
+ def __init__(self, embed_dim=128, nhead=8, ffn_dim=512, window_size1=[4, 4],
+ window_size2=[1, 1], dropout=0.1):
+ super().__init__()
+ self.window_size1 = window_size1
+ self.window_size2 = window_size2
+ self.avgpool_q = nn.AvgPool2d(kernel_size=window_size1)
+ self.avgpool_k = nn.AvgPool2d(kernel_size=window_size2)
+ self.softmax = nn.Softmax(dim=-1)
+ self.nhead = nhead
+
+ self.q_lin = nn.Linear(embed_dim, embed_dim)
+ self.k_lin = nn.Linear(embed_dim, embed_dim)
+ self.v_lin = nn.Linear(embed_dim, embed_dim)
+
+ self.mlp = Mlp(in_features=embed_dim, hidden_features=ffn_dim, drop=dropout)
+ self.pos_enc = PositionEmbeddingSine(embed_dim)
+ self.concat_conv = nn.Conv2d(2 * embed_dim, embed_dim, 1)
+ self.gn = nn.GroupNorm(8, embed_dim)
+
+ self.out_conv = nn.Conv2d(embed_dim, embed_dim, 1)
+ self.init_weights()
+
+ def init_weights(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0.)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1.)
+ nn.init.constant_(m.bias, 0.)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.weight, 1.)
+ nn.init.constant_(m.bias, 0.)
+ elif isinstance(m, nn.Linear):
+ nn.init.xavier_normal_(m.weight)
+ nn.init.constant_(m.bias, 0.)
+
+ def forward(self, x1, x2): # x1 for q (conditional input), x2 for k,v
+ B, C1, H1, W1 = x1.shape
+ _, C2, H2, W2 = x2.shape
+ # x1 = x1.permute(0, 2, 3, 1).contiguous() # B, H1, W1, C1
+ shortcut = x2 + self.concat_conv(torch.cat(
+ [F.interpolate(x1, size=(H2, W2), mode='bilinear', align_corners=True),
+ x2], dim=1))
+ shortcut = self.gn(shortcut)
+ pad_l = pad_t = 0
+ pad_r = (self.window_size1[1] - W1 % self.window_size1[1]) % self.window_size1[1]
+ pad_b = (self.window_size1[0] - H1 % self.window_size1[0]) % self.window_size1[0]
+ x1 = F.pad(x1, (pad_l, pad_r, pad_t, pad_b, 0, 0))
+ _, _, H1p, W1p = x1.shape
+ # x2 = x2.permute(0, 2, 3, 1).contiguous() # B, H2, W2, C2
+ pad_l = pad_t = 0
+ pad_r = (self.window_size2[1] - W2 % self.window_size2[1]) % self.window_size2[1]
+ pad_b = (self.window_size2[0] - H2 % self.window_size2[0]) % self.window_size2[0]
+ x2 = F.pad(x2, (pad_l, pad_r, pad_t, pad_b, 0, 0))
+ _, _, H2p, W2p = x2.shape
+ # x1g = x1 #B, C1, H1p, W1p
+ # x2g = x2 #B, C2, H2p, W2p
+ x1_s = self.avgpool_q(x1)
+ qg = self.avgpool_q(x1).permute(0, 2, 3, 1).contiguous()
+ qg = qg + self.pos_enc(qg)
+        qg = qg.view(B, -1, C2)
+ kg = self.avgpool_k(x2).permute(0, 2, 3, 1).contiguous()
+ kg = kg + self.pos_enc(kg)
+ kg = kg.view(B, -1, C1)
+ num_window_q = qg.shape[1]
+ num_window_k = kg.shape[1]
+ qg = self.q_lin(qg).reshape(B, num_window_q, self.nhead, C1 // self.nhead).permute(0, 2, 1,
+ 3).contiguous()
+ kg2 = self.k_lin(kg).reshape(B, num_window_k, self.nhead, C1 // self.nhead).permute(0, 2, 1,
+ 3).contiguous()
+ vg = self.v_lin(kg).reshape(B, num_window_k, self.nhead, C1 // self.nhead).permute(0, 2, 1,
+ 3).contiguous()
+ kg = kg2
+ attn = (qg @ kg.transpose(-2, -1))
+ attn = self.softmax(attn)
+ qg = (attn @ vg).transpose(1, 2).reshape(B, num_window_q, C1)
+ qg = qg.transpose(1, 2).reshape(B, C1, H1p // self.window_size1[0], W1p // self.window_size1[1])
+ # qg = F.interpolate(qg, size=(H1p, W1p), mode='bilinear', align_corners=False)
+ x1_s = x1_s + qg
+ x1_s = x1_s + self.mlp(x1_s)
+ x1_s = F.interpolate(x1_s, size=(H2, W2), mode='bilinear', align_corners=True)
+ x1_s = shortcut + self.out_conv(x1_s)
+ # x1_s = self.out_norm(x1_s)
+ return x1_s
+
+class RelationNet(nn.Module):
+ def __init__(self, in_channel1=128, in_channel2=128, nhead=8, layers=3, embed_dim=128, ffn_dim=512,
+ window_size1= [4, 4], window_size2=[1, 1]):
+ # self.attention = BasicAttetnionLayer(embed_dim=embed_dim, nhead=nhead, ffn_dim=ffn_dim,
+ # window_size1=window_size1, window_size2=window_size2, dropout=0.1)
+ super().__init__()
+ self.layers = layers
+ self.input_conv1 = nn.Sequential(
+ nn.Conv2d(in_channel1, embed_dim, 1),
+ nn.BatchNorm2d(embed_dim, momentum=0.03, eps=0.001),
+ )
+ self.input_conv2 = nn.Sequential(
+ nn.Conv2d(in_channel2, embed_dim, 1),
+ nn.BatchNorm2d(embed_dim, momentum=0.03, eps=0.001),
+ )
+ # self.input_conv1 = ConvModule(in_channel1,
+ # embed_dim,
+ # 1,
+ # norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ # act_cfg=None)
+ # self.input_conv2 = ConvModule(in_channel2,
+ # embed_dim,
+ # 1,
+ # norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ # act_cfg=None)
+ # self.input_conv2 = nn.Linear(in_channel2, embed_dim)
+ self.attentions = nn.ModuleList()
+ for i in range(layers):
+ self.attentions.append(
+ BasicAttetnionLayer(embed_dim=embed_dim, nhead=nhead, ffn_dim=ffn_dim,
+ window_size1=window_size1, window_size2=window_size2, dropout=0.1)
+ )
+
+ def forward(self, cond, feat):
+ # cluster = cluster.unsqueeze(0).repeat(feature.shape[0], 1, 1, 1)
+ cond = self.input_conv1(cond)
+ feat = self.input_conv2(feat)
+ for att in self.attentions:
+ feat = att(cond, feat)
+ return feat
+
+
+
+################# U-Net model definition ####################
+
+def exists(x):
+ return x is not None
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if callable(d) else d
+
+def identity(t, *args, **kwargs):
+ return t
+
+def cycle(dl):
+ while True:
+ for data in dl:
+ yield data
+
+def has_int_squareroot(num):
+ return (math.sqrt(num) ** 2) == num
+
+def num_to_groups(num, divisor):
+ groups = num // divisor
+ remainder = num % divisor
+ arr = [divisor] * groups
+ if remainder > 0:
+ arr.append(remainder)
+ return arr
+
+def convert_image_to_fn(img_type, image):
+ if image.mode != img_type:
+ return image.convert(img_type)
+ return image
+
+# normalization functions
+
+def normalize_to_neg_one_to_one(img):
+ return img * 2 - 1
+
+def unnormalize_to_zero_to_one(t):
+ return (t + 1) * 0.5
+
+# small helper modules
+
+class Residual(nn.Module):
+ def __init__(self, fn):
+ super().__init__()
+ self.fn = fn
+
+ def forward(self, x, *args, **kwargs):
+ return self.fn(x, *args, **kwargs) + x
+
+def Upsample(dim, dim_out = None):
+ return nn.Sequential(
+ nn.Upsample(scale_factor = 2, mode = 'nearest'),
+ nn.Conv2d(dim, default(dim_out, dim), 3, padding = 1)
+ )
+
+def Downsample(dim, dim_out = None):
+ return nn.Conv2d(dim, default(dim_out, dim), 4, 2, 1)
+
+class WeightStandardizedConv2d(nn.Conv2d):
+ """
+ https://arxiv.org/abs/1903.10520
+ weight standardization purportedly works synergistically with group normalization
+ """
+ def forward(self, x):
+ eps = 1e-5 if x.dtype == torch.float32 else 1e-3
+
+ weight = self.weight
+ mean = reduce(weight, 'o ... -> o 1 1 1', 'mean')
+ var = reduce(weight, 'o ... -> o 1 1 1', partial(torch.var, unbiased = False))
+ normalized_weight = (weight - mean) * (var + eps).rsqrt()
+
+ return F.conv2d(x, normalized_weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
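+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# Weight standardization normalizes each output filter to zero mean and unit variance on
+# the fly before the convolution; the stored weight parameter itself is left untouched.
+# Uncalled shape check (helper name is ours):
+def _weight_standardized_conv_check():
+    conv = WeightStandardizedConv2d(3, 8, 3, padding=1)
+    y = conv(torch.randn(1, 3, 16, 16))
+    assert y.shape == (1, 8, 16, 16)
+    return y
+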
+class LayerNorm(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
+
+ def forward(self, x):
+ eps = 1e-5 if x.dtype == torch.float32 else 1e-3
+ var = torch.var(x, dim = 1, unbiased = False, keepdim = True)
+ mean = torch.mean(x, dim = 1, keepdim = True)
+ return (x - mean) * (var + eps).rsqrt() * self.g
+
+class PreNorm(nn.Module):
+ def __init__(self, dim, fn):
+ super().__init__()
+ self.fn = fn
+ self.norm = LayerNorm(dim)
+
+ def forward(self, x):
+ x = self.norm(x)
+ return self.fn(x)
+
+# sinusoidal positional embeds
+
+class SinusoidalPosEmb(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x):
+ device = x.device
+ half_dim = self.dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+ emb = x[:, None] * emb[None, :]
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+ return emb
+
+class GaussianFourierProjection(nn.Module):
+ """Gaussian Fourier embeddings for noise levels."""
+
+ def __init__(self, embedding_size=256, scale=1.0):
+ super().__init__()
+ self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+
+ def forward(self, x):
+ x_proj = x[:, None] * self.W[None, :] * 2 * math.pi
+ return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+
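+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# GaussianFourierProjection maps a batch of scalar noise levels to 2 * embedding_size
+# fixed random Fourier features (sin/cos of x * W * 2*pi with a frozen W); the Unet below
+# uses embedding_size = dim // 2 so the output width equals dim.
+def _fourier_projection_shape_check():
+    emb = GaussianFourierProjection(embedding_size=128, scale=16.0)
+    t = torch.rand(4)
+    assert emb(t).shape == (4, 256)
+    return emb(t)
+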
+class RandomOrLearnedSinusoidalPosEmb(nn.Module):
+    """ following @crowsonkb 's lead with random (learned optional) sinusoidal pos emb
+    https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8 """
+
+ def __init__(self, dim, is_random = False):
+ super().__init__()
+ assert (dim % 2) == 0
+ half_dim = dim // 2
+ self.weights = nn.Parameter(torch.randn(half_dim), requires_grad = not is_random)
+
+ def forward(self, x):
+ x = rearrange(x, 'b -> b 1')
+ freqs = x * rearrange(self.weights, 'd -> 1 d') * 2 * math.pi
+ fouriered = torch.cat((freqs.sin(), freqs.cos()), dim = -1)
+ fouriered = torch.cat((x, fouriered), dim = -1)
+ return fouriered
+
+# building block modules
+
+class Block(nn.Module):
+ def __init__(self, dim, dim_out, groups = 8):
+ super().__init__()
+ self.proj = WeightStandardizedConv2d(dim, dim_out, 3, padding = 1)
+ self.norm = nn.GroupNorm(groups, dim_out)
+ self.act = nn.SiLU()
+
+ def forward(self, x, scale_shift = None):
+ x = self.proj(x)
+ x = self.norm(x)
+
+ if exists(scale_shift):
+ scale, shift = scale_shift
+ x = x * (scale + 1) + shift
+
+ x = self.act(x)
+ return x
+
+class BlockFFT(nn.Module):
+ def __init__(self, dim, h, w, groups = 8):
+ super().__init__()
+ # self.proj = WeightStandardizedConv2d(dim, dim_out, 3, padding = 1)
+ self.complex_weight = nn.Parameter(torch.randn(dim, h, w//2+1, 2, dtype=torch.float32) * 0.02)
+ # self.complex_weight = nn.Parameter(torch.normal(mean=0, std=0.01, size=(dim, h, w // 2 + 1, 2), dtype=torch.float32))
+ # self.norm = nn.GroupNorm(groups, dim)
+ # self.act = nn.SiLU()
+
+ def forward(self, x, scale_shift = None):
+ B, C, H, W = x.shape
+ x = torch.fft.rfft2(x, dim=(2, 3), norm='ortho')
+ x = x * torch.view_as_complex(self.complex_weight)
+ x = torch.fft.irfft2(x, s=(H, W), dim=(2, 3), norm='ortho')
+ x = x.reshape(B, C, H, W)
+
+ return x
+
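+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# BlockFFT filters feature maps in the frequency domain: rfft2 -> multiply by a learned
+# complex weight of shape (dim, h, w//2 + 1) -> irfft2 back to (B, C, H, W), so the
+# spatial size is fixed at construction time (the groups argument is currently unused).
+def _block_fft_shape_check():
+    blk = BlockFFT(dim=32, h=16, w=16)
+    y = blk(torch.randn(2, 32, 16, 16))
+    assert y.shape == (2, 32, 16, 16)
+    return y
+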
+class ResnetBlock(nn.Module):
+ def __init__(self, dim, dim_out, *, time_emb_dim = None, groups = 8):
+ super().__init__()
+ self.mlp = nn.Sequential(
+ nn.SiLU(),
+ nn.Linear(time_emb_dim, dim_out * 2)
+ ) if exists(time_emb_dim) else None
+
+ self.block1 = Block(dim, dim_out, groups = groups)
+ self.block2 = Block(dim_out, dim_out, groups = groups)
+ self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+ def forward(self, x, time_emb = None):
+
+ scale_shift = None
+ if exists(self.mlp) and exists(time_emb):
+ time_emb = self.mlp(time_emb)
+ time_emb = rearrange(time_emb, 'b c -> b c 1 1')
+ scale_shift = time_emb.chunk(2, dim = 1)
+
+ h = self.block1(x, scale_shift = scale_shift)
+
+ h = self.block2(h)
+
+ return h + self.res_conv(x)
+
+class ResnetBlockFFT(nn.Module):
+ def __init__(self, dim, dim_out, h, w, *, time_emb_dim = None, groups = 8):
+ super().__init__()
+ self.mlp = nn.Sequential(
+ nn.SiLU(),
+ nn.Linear(time_emb_dim, dim_out * 2)
+ ) if exists(time_emb_dim) else None
+
+ self.block1 = Block(dim, dim_out, groups = groups)
+ self.block2 = BlockFFT(dim_out, h, w, groups = groups)
+ self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+ def forward(self, x, time_emb = None):
+
+ scale_shift = None
+ if exists(self.mlp) and exists(time_emb):
+ time_emb = self.mlp(time_emb)
+ time_emb = rearrange(time_emb, 'b c -> b c 1 1')
+ scale_shift = time_emb.chunk(2, dim = 1)
+
+ h = self.block1(x, scale_shift = scale_shift)
+
+ h = self.block2(h)
+
+ return h + self.res_conv(x)
+
+class ResnetDownsampleBlock(nn.Module):
+ def __init__(self, dim, dim_out, *, time_emb_dim = None, groups = 8):
+ super().__init__()
+ self.mlp = nn.Sequential(
+ nn.SiLU(),
+ nn.Linear(time_emb_dim, dim_out * 2)
+ ) if exists(time_emb_dim) else None
+
+ self.block1 = Block(dim, dim_out, groups = groups)
+ self.block2 = nn.Sequential(
+ WeightStandardizedConv2d(dim_out, dim_out, 3, stride=2, padding=1),
+ nn.GroupNorm(groups, dim_out),
+ nn.SiLU()
+ )
+ self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+ def forward(self, x, time_emb = None):
+
+ scale_shift = None
+ if exists(self.mlp) and exists(time_emb):
+ time_emb = self.mlp(time_emb)
+ time_emb = rearrange(time_emb, 'b c -> b c 1 1')
+ scale_shift = time_emb.chunk(2, dim = 1)
+
+ h = self.block1(x, scale_shift = scale_shift)
+
+ h = self.block2(h)
+
+ return h + self.res_conv(
+ F.interpolate(x, size=h.shape[-2:], mode="bilinear")
+ )
+
+class LinearAttention(nn.Module):
+ def __init__(self, dim, heads = 4, dim_head = 32):
+ super().__init__()
+ self.scale = dim_head ** -0.5
+ self.heads = heads
+ hidden_dim = dim_head * heads
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+
+ self.to_out = nn.Sequential(
+ nn.Conv2d(hidden_dim, dim, 1),
+ LayerNorm(dim)
+ )
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ qkv = self.to_qkv(x).chunk(3, dim = 1)
+ q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = self.heads), qkv)
+
+ q = q.softmax(dim = -2)
+ k = k.softmax(dim = -1)
+
+ q = q * self.scale
+ v = v / (h * w)
+
+ context = torch.einsum('b h d n, b h e n -> b h d e', k, v)
+
+ out = torch.einsum('b h d e, b h d n -> b h e n', context, q)
+ out = rearrange(out, 'b h c (x y) -> b (h c) x y', h = self.heads, x = h, y = w)
+ return self.to_out(out)
+
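+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# LinearAttention contracts keys with values first (a d x d context per head, O(n * d^2))
+# instead of forming the n x n similarity matrix used by the full Attention block below.
+# Uncalled shape check (helper name is ours):
+def _linear_attention_shape_check():
+    attn = LinearAttention(dim=64, heads=4, dim_head=32)
+    y = attn(torch.randn(1, 64, 20, 20))
+    assert y.shape == (1, 64, 20, 20)
+    return y
+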
+class Attention(nn.Module):
+ def __init__(self, dim, heads = 4, dim_head = 32):
+ super().__init__()
+ self.scale = dim_head ** -0.5
+ self.heads = heads
+ hidden_dim = dim_head * heads
+
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ qkv = self.to_qkv(x).chunk(3, dim=1)
+ q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h=self.heads), qkv)
+
+ q = q * self.scale
+
+ sim = torch.einsum('b h d i, b h d j -> b h i j', q, k)
+ attn = sim.softmax(dim=-1)
+ out = torch.einsum('b h i j, b h d j -> b h i d', attn, v)
+
+ out = rearrange(out, 'b h (x y) d -> b (h d) x y', x=h, y=w)
+ return self.to_out(out)
+
+
+class ConditionEncoder(nn.Module):
+ def __init__(self,
+ down_dim_mults=(2, 4, 8),
+ dim=64,
+ in_dim=1,
+ out_dim=64):
+ super(ConditionEncoder, self).__init__()
+ self.init_conv = nn.Sequential(
+ nn.Conv2d(in_dim, dim, kernel_size=3, stride=1, padding=1),
+ nn.GroupNorm(num_groups=min(dim // 4, 8), num_channels=dim),
+ )
+ self.num_resolutions = len(down_dim_mults)
+ self.downs = nn.ModuleList()
+ in_mults = (1,) + tuple(down_dim_mults[:-1])
+ in_dims = [mult*dim for mult in in_mults]
+ out_dims = [mult*dim for mult in down_dim_mults]
+ for i_level in range(self.num_resolutions):
+ block_in = in_dims[i_level]
+ block_out = out_dims[i_level]
+ self.downs.append(ResnetDownsampleBlock(dim=block_in,
+ dim_out=block_out))
+ if self.num_resolutions < 1:
+ self.out_conv = nn.Conv2d(dim, out_dim, 1)
+ else:
+ self.out_conv = nn.Conv2d(out_dims[-1], out_dim, 1)
+
+ def forward(self, x):
+ x = self.init_conv(x)
+ for down_layer in self.downs:
+ x = down_layer(x)
+ x = self.out_conv(x)
+ return x
+
+
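+# --- Hedged sketch (added for clarity; not part of the upstream code) ------------------
+# ConditionEncoder keeps the input resolution in init_conv and then halves it once per
+# entry of down_dim_mults via strided ResnetDownsampleBlocks, so three mults give a total
+# downsampling factor of 2**3 = 8. Uncalled shape check (helper name is ours):
+def _condition_encoder_shape_check():
+    enc = ConditionEncoder(down_dim_mults=(2, 4, 8), dim=64, in_dim=1, out_dim=128)
+    y = enc(torch.randn(1, 1, 80, 80))
+    assert y.shape == (1, 128, 10, 10)
+    return y
+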
+class Unet(nn.Module):
+ def __init__(
+ self,
+ dim,
+ init_dim=None,
+ out_dim=None,
+ dim_mults=(1, 2, 4, 8),
+ cond_in_dim=1,
+ cond_dim=64,
+ cond_dim_mults=(2, 4, 8),
+ channels=1,
+ out_mul=1,
+ self_condition=False,
+ resnet_block_groups=8,
+ learned_variance=False,
+ learned_sinusoidal_cond=False,
+ random_fourier_features=False,
+ learned_sinusoidal_dim=16,
+ window_sizes1=[[16, 16], [8, 8], [4, 4], [2, 2]],
+ window_sizes2=[[16, 16], [8, 8], [4, 4], [2, 2]],
+ fourier_scale=16,
+ ckpt_path=None,
+ ignore_keys=[],
+ cfg={},
+ **kwargs
+ ):
+ super().__init__()
+
+ # determine dimensions
+ self.cond_pe = cfg.get('cond_pe', False)
+ num_pos_feats = cfg.num_pos_feats if self.cond_pe else 0
+ self.channels = channels
+ self.self_condition = self_condition
+ input_channels = channels * (2 if self_condition else 1)
+
+ init_dim = default(init_dim, dim)
+ # self.init_conv_mask = nn.Sequential(
+ # nn.Conv2d(cond_in_dim, cond_dim, 3, padding=1),
+ # nn.GroupNorm(num_groups=min(init_dim // 4, 8), num_channels=init_dim),
+ # nn.SiLU(),
+ # nn.Conv2d(cond_dim, cond_dim, 3, padding=1),
+ # )
+ # self.init_conv_mask = ConditionEncoder(down_dim_mults=cond_dim_mults, dim=cond_dim,
+ # in_dim=cond_in_dim, out_dim=init_dim)
+
+ if cfg.cond_net == 'effnet':
+ f_condnet = 48
+ if cfg.get('without_pretrain', False):
+ self.init_conv_mask = efficientnet_b7()
+ else:
+ self.init_conv_mask = efficientnet_b7(weights=EfficientNet_B7_Weights)
+ elif cfg.cond_net == 'resnet':
+ f_condnet = 256
+ if cfg.get('without_pretrain', False):
+ self.init_conv_mask = resnet101()
+ else:
+ self.init_conv_mask = resnet101(weights=ResNet101_Weights)
+ elif cfg.cond_net == 'swin':
+ f_condnet = 128
+ if cfg.get('without_pretrain', False):
+ self.init_conv_mask = swin_b()
+ else:
+ swin_b_model = swin_b(pretrained=False)
+ swin_b_model.load_state_dict(torch.load(custom_torch_download(filename="swin_b-68c6b09e.pth")), strict=False)
+ self.init_conv_mask = swin_b_model
+ elif cfg.cond_net == 'vgg':
+ f_condnet = 128
+ if cfg.get('without_pretrain', False):
+ self.init_conv_mask = vgg16()
+ else:
+ self.init_conv_mask = vgg16(weights=VGG16_Weights)
+ else:
+ raise NotImplementedError
+ self.init_conv = nn.Sequential(
+ nn.Conv2d(input_channels + f_condnet, init_dim, 7, padding=3),
+ nn.GroupNorm(num_groups=min(init_dim // 4, 8), num_channels=init_dim),
+ )
+
+ if self.cond_pe:
+ self.cond_pos_embedding = nn.Sequential(
+ PositionEmbeddingLearned(
+ feature_size=cfg.cond_feature_size, num_pos_feats=cfg.num_pos_feats//2),
+ nn.Conv2d(num_pos_feats + init_dim, init_dim, 1)
+ )
+ # self.init_conv_mask = nn.Conv2d(1, init_dim, 7, padding=3)
+
+ dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
+ dims_rev = dims[::-1]
+ in_out = list(zip(dims[:-1], dims[1:]))
+ self.projects = nn.ModuleList()
+ print(cfg.cond_net)
+ if cfg.cond_net == 'effnet':
+ self.projects.append(nn.Conv2d(48, dims[0], 1))
+ self.projects.append(nn.Conv2d(80, dims[1], 1))
+ self.projects.append(nn.Conv2d(224, dims[2], 1))
+ self.projects.append(nn.Conv2d(640, dims[3], 1))
+ print(len(self.projects))
+ elif cfg.cond_net == 'vgg':
+ self.projects.append(nn.Conv2d(128, dims[0], 1))
+ self.projects.append(nn.Conv2d(256, dims[1], 1))
+ self.projects.append(nn.Conv2d(512, dims[2], 1))
+ self.projects.append(nn.Conv2d(512, dims[3], 1))
+ else:
+ self.projects.append(nn.Conv2d(f_condnet, dims[0], 1))
+ self.projects.append(nn.Conv2d(f_condnet*2, dims[1], 1))
+ self.projects.append(nn.Conv2d(f_condnet*4, dims[2], 1))
+ self.projects.append(nn.Conv2d(f_condnet*8, dims[3], 1))
+ #print(len(self.projects))
+
+ block_klass = partial(ResnetBlock, groups = resnet_block_groups)
+
+ # time embeddings
+
+ time_dim = dim * 4
+
+ self.random_or_learned_sinusoidal_cond = learned_sinusoidal_cond or random_fourier_features
+
+ if self.random_or_learned_sinusoidal_cond:
+ sinu_pos_emb = RandomOrLearnedSinusoidalPosEmb(learned_sinusoidal_dim, random_fourier_features)
+ fourier_dim = learned_sinusoidal_dim + 1
+ else:
+ sinu_pos_emb = GaussianFourierProjection(dim//2, scale=fourier_scale)
+ fourier_dim = dim
+
+ self.time_mlp = nn.Sequential(
+ sinu_pos_emb,
+ nn.Linear(fourier_dim, time_dim),
+ nn.GELU(),
+ nn.Linear(time_dim, time_dim)
+ )
+
+ # layers
+
+ self.downs = nn.ModuleList([])
+ self.downs_mask = nn.ModuleList([])
+ self.ups = nn.ModuleList([])
+ self.relation_layers_down = nn.ModuleList([])
+ self.relation_layers_up = nn.ModuleList([])
+ self.ups2 = nn.ModuleList([])
+ self.relation_layers_up2 = nn.ModuleList([])
+ num_resolutions = len(in_out)
+ input_size = cfg.get('input_size', [80, 80])
+ feature_size_list = [[int(input_size[0]/2**k), int(input_size[1]/2**k)] for k in range(len(dim_mults))]
+
+
+ for ind, (dim_in, dim_out) in enumerate(in_out):
+ is_last = ind >= (num_resolutions - 1)
+
+ self.downs.append(nn.ModuleList([
+ block_klass(dim_in, dim_in, time_emb_dim = time_dim),
+ block_klass(dim_in, dim_in, time_emb_dim = time_dim),
+ Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+ Downsample(dim_in, dim_out) if not is_last else nn.Conv2d(dim_in, dim_out, 3, padding = 1)
+ ]))
+ # self.downs_mask.append(nn.ModuleList([
+ # block_klass(dim_in, dim_in, time_emb_dim=time_dim),
+ # # block_klass(dim_in, dim_in, time_emb_dim=time_dim),
+ # Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+ # Downsample(dim_in, dim_out) if not is_last else nn.Conv2d(dim_in, dim_out, 3, padding=1)
+ # ]))
+ self.relation_layers_down.append(RelationNet(in_channel1=dims[ind], in_channel2=dims[ind], nhead=8,
+ layers=1, embed_dim=dims[ind], ffn_dim=dims[ind]*2,
+ window_size1=window_sizes1[ind], window_size2=window_sizes2[ind])
+ )
+
+ mid_dim = dims[-1]
+ self.mid_block1 = block_klass(mid_dim, mid_dim, time_emb_dim = time_dim)
+ self.mid_attn = Residual(PreNorm(mid_dim, Attention(mid_dim)))
+ self.mid_block2 = block_klass(mid_dim, mid_dim, time_emb_dim = time_dim)
+ self.decouple1 = nn.Sequential(
+ nn.GroupNorm(num_groups=min(mid_dim // 4, 8), num_channels=mid_dim),
+ nn.Conv2d(mid_dim, mid_dim, 3, padding=1),
+ BlockFFT(mid_dim, input_size[0]//8, input_size[1]//8),
+ )
+ self.decouple2 = nn.Sequential(
+ nn.GroupNorm(num_groups=min(mid_dim // 4, 8), num_channels=mid_dim),
+ nn.Conv2d(mid_dim, mid_dim, 3, padding=1),
+ BlockFFT(mid_dim, input_size[0]//8, input_size[1]//8),
+ )
+
+ for ind, (dim_in, dim_out) in enumerate(reversed(in_out)):
+ is_last = ind == (len(in_out) - 1)
+
+ self.ups.append(nn.ModuleList([
+ block_klass(dim_out + dim_in, dim_out, time_emb_dim=time_dim),
+ block_klass(dim_out + dim_in, dim_out, time_emb_dim=time_dim),
+ Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+ Upsample(dim_out, dim_in) if not is_last else nn.Conv2d(dim_out, dim_in, 3, padding = 1)
+ ]))
+ self.relation_layers_up.append(RelationNet(in_channel1=dims_rev[ind+1], in_channel2=dims_rev[ind],
+ nhead=8, layers=1, embed_dim=dims_rev[ind],
+ ffn_dim=dims_rev[ind] * 2,
+ window_size1=window_sizes1[::-1][ind],
+ window_size2=window_sizes2[::-1][ind])
+ )
+ self.ups2.append(nn.ModuleList([
+ block_klass(dim_out + dim_in, dim_out, time_emb_dim=time_dim),
+ block_klass(dim_out + dim_in, dim_out, time_emb_dim=time_dim),
+ Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+ Upsample(dim_out, dim_in) if not is_last else nn.Conv2d(dim_out, dim_in, 3, padding=1)
+ ]))
+ self.relation_layers_up2.append(RelationNet(in_channel1=dims_rev[ind + 1], in_channel2=dims_rev[ind],
+ nhead=8, layers=1, embed_dim=dims_rev[ind],
+ ffn_dim=dims_rev[ind] * 2,
+ window_size1=window_sizes1[::-1][ind],
+ window_size2=window_sizes2[::-1][ind])
+ )
+
+ default_out_dim = channels * (1 if not learned_variance else 2)
+ self.out_dim = default(out_dim, default_out_dim)
+
+ self.final_res_block = block_klass(dim * 2, dim, time_emb_dim = time_dim)
+ self.final_conv = nn.Conv2d(dim, self.out_dim * out_mul, 1)
+
+ self.final_res_block2 = block_klass(dim * 2, dim, time_emb_dim = time_dim)
+ self.final_conv2 = nn.Conv2d(dim, self.out_dim, 1)
+
+ # self.init_weights()
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+ fix_bb = cfg.get('fix_bb', True)
+ if fix_bb:
+ for n, p in self.init_conv_mask.named_parameters():
+ p.requires_grad = False
+
+ def init_from_ckpt(self, path, ignore_keys=list()):
+ sd = torch.load(path, map_location="cpu")["model"]
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ msg = self.load_state_dict(sd, strict=False)
+ print(f"Restored from {path}")
+ print('==>Load Unet Info: ', msg)
+
+ def init_weights(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0.)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1.)
+ nn.init.constant_(m.bias, 0.)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.weight, 1.)
+ nn.init.constant_(m.bias, 0.)
+ elif isinstance(m, nn.Linear):
+ nn.init.xavier_normal_(m.weight)
+ nn.init.constant_(m.bias, 0.)
+
+ def forward(self, x, time, mask, x_self_cond = None, **kwargs):
+ if self.self_condition:
+ x_self_cond = default(x_self_cond, lambda: torch.zeros_like(x))
+ x = torch.cat((x_self_cond, x), dim = 1)
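+        # Preconditioning coefficients derived from the continuous time / sigma value.
+        # Each of the two branch outputs computed below is combined with the input as
+        #   out_i = c_skip_i * x + c_out_i * branch_i(x),
+        # so the network only has to predict a residual whose scale is matched to sigma.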
+ sigma = time.reshape(-1, 1, 1, 1)
+ eps = 1e-4
+ c_skip1 = 1 - sigma
+ c_skip2 = torch.sqrt(sigma)
+ # c_out = sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2).sqrt()
+ c_out1 = sigma / torch.sqrt(sigma ** 2 + 1)
+ c_out2 = torch.sqrt(1 - sigma) / torch.sqrt(sigma ** 2 + 1)
+ c_in = 1
+
+ x_clone = x.clone()
+ x = c_in * x
+ # mask = torch.cat([], dim=1)
+ hm = self.init_conv_mask(mask)
+ # if self.cond_pe:
+ # m = self.cond_pos_embedding(m)
+ x = self.init_conv(torch.cat([x, F.interpolate(hm[0], size=x.shape[-2:], mode="bilinear")], dim=1))
+ r = x.clone()
+
+ t = self.time_mlp(torch.log(time)/4)
+
+ h = []
+ h2 = []
+ for i, layer in enumerate(self.projects):
+ # print(hm[i].shape)
+ hm[i] = layer(hm[i])
+ hm2 = []
+ for i in range(len(hm)):
+ hm2.append(hm[i].clone())
+ # hm = []
+ # hm2 = []
+ for i, ((block1, block2, attn, downsample), relation_layer) \
+ in enumerate(zip(self.downs, self.relation_layers_down)):
+ x = block1(x, t)
+ h.append(x)
+ h2.append(x.clone())
+ # m = m_block(m, t)
+ # hm.append(m)
+ # hm2.append(m.clone())
+
+ x = relation_layer(hm[i], x)
+
+ x = block2(x, t)
+ x = attn(x)
+ h.append(x)
+ h2.append(x.clone())
+
+ x = downsample(x)
+ # m = m_downsample(m)
+
+
+ # x = x + F.interpolate(hm[-1], size=x.shape[2:], mode="bilinear", align_corners=True)
+ x = self.mid_block1(x, t)
+ x = self.mid_attn(x)
+ x = self.mid_block2(x, t)
+ x1 = x + self.decouple1(x)
+ x2 = x + self.decouple2(x)
+
+ x = x1
+ for (block1, block2, attn, upsample), relation_layer in zip(self.ups, self.relation_layers_up):
+ x = torch.cat((x, h.pop()), dim = 1)
+ x = block1(x, t)
+ x = relation_layer(hm.pop(), x)
+ x = torch.cat((x, h.pop()), dim = 1)
+ x = block2(x, t)
+ x = attn(x)
+ x = upsample(x)
+
+ x1 = torch.cat((x, r), dim=1)
+ x1 = self.final_res_block(x1, t)
+ x1 = self.final_conv(x1)
+
+ x = x2
+ for (block1, block2, attn, upsample), relation_layer in zip(self.ups2, self.relation_layers_up2):
+ x = torch.cat((x, h2.pop()), dim = 1)
+ x = block1(x, t)
+ x = relation_layer(hm2.pop(), x)
+ x = torch.cat((x, h2.pop()), dim = 1)
+ x = block2(x, t)
+ x = attn(x)
+ x = upsample(x)
+
+ x2 = torch.cat((x, r), dim=1)
+ x2 = self.final_res_block2(x2, t)
+ x2 = self.final_conv2(x2)
+ # sigma = time.reshape(x1.shape[0], *((1,) * (len(x1.shape) - 1)))
+ # scale_C = torch.exp(sigma)
+ x1 = c_skip1 * x_clone + c_out1 * x1
+ x2 = c_skip2 * x_clone + c_out2 * x2
+ return x1, x2
+
+
+if __name__ == "__main__":
+ # resnet = resnet101(weights=ResNet101_Weights)
+ # effnet = efficientnet_b7(weights=EfficientNet_B7_Weights)
+ # effnet = efficientnet_b7(weights=None)
+ # x = torch.rand(1, 3, 320, 320)
+ # y = effnet(x)
+ model = Unet(dim=128, dim_mults=(1, 2, 4, 4),
+ cond_dim=128,
+ cond_dim_mults=(2, 4, ),
+ channels=1,
+ window_sizes1=[[8, 8], [4, 4], [2, 2], [1, 1]],
+ window_sizes2=[[8, 8], [4, 4], [2, 2], [1, 1]],
+ cfg=fvcore.common.config.CfgNode({'cond_pe': False, 'input_size': [80, 80],
+ 'cond_feature_size': (32, 128), 'cond_net': 'vgg',
+ 'num_pos_feats': 96})
+ )
+ x = torch.rand(1, 1, 80, 80)
+ mask = torch.rand(1, 3, 320, 320)
+ time = torch.tensor([0.5124])
+ with torch.no_grad():
+ y = model(x, time, mask)
+ pass
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/quantization.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..60210b609c7e52b48d079b25e7a29b26ef861d43
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/quantization.py
@@ -0,0 +1,103 @@
+import torch
+from torch import nn as nn
+from torch.nn import Parameter
+
+
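+# Learned-clipping uniform quantizers for weights and activations: each quantizer
+# scales its input by a trainable clipping parameter alpha, clamps it to the unit
+# range, rounds to 2**b uniform levels and rescales by alpha. The backward pass uses
+# a straight-through estimator for the input and an explicit gradient for alpha.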
+def weight_quantization(b):
+ def uniform_quant(x, b):
+ xdiv = x.mul((2 ** b - 1))
+ xhard = xdiv.round().div(2 ** b - 1)
+ return xhard
+
+ class _pq(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, input, alpha):
+ input.div_(alpha) # weights are first divided by alpha
+ input_c = input.clamp(min=-1, max=1) # then clipped to [-1,1]
+ sign = input_c.sign()
+ input_abs = input_c.abs()
+ input_q = uniform_quant(input_abs, b).mul(sign)
+ ctx.save_for_backward(input, input_q)
+ input_q = input_q.mul(alpha) # rescale to the original range
+ return input_q
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ grad_input = grad_output.clone() # grad for weights will not be clipped
+ input, input_q = ctx.saved_tensors
+ i = (input.abs() > 1.).float()
+ sign = input.sign()
+ grad_alpha = (grad_output * (sign * i + (input_q - input) * (1 - i))).sum()
+ return grad_input, grad_alpha
+
+    return _pq.apply
+
+
+class weight_quantize_fn(nn.Module):
+ def __init__(self, bit_w):
+ super(weight_quantize_fn, self).__init__()
+ assert bit_w > 0
+
+ self.bit_w = bit_w - 1
+ self.weight_q = weight_quantization(b=self.bit_w)
+ self.register_parameter('w_alpha', Parameter(torch.tensor(3.0), requires_grad=True))
+
+ def forward(self, weight):
+ mean = weight.data.mean()
+ std = weight.data.std()
+ weight = weight.add(-mean).div(std) # weights normalization
+ weight_q = self.weight_q(weight, self.w_alpha)
+ return weight_q
+
+ def change_bit(self, bit_w):
+ self.bit_w = bit_w - 1
+ self.weight_q = weight_quantization(b=self.bit_w)
+
+def act_quantization(b, signed=False):
+ def uniform_quant(x, b=3):
+ xdiv = x.mul(2 ** b - 1)
+ xhard = xdiv.round().div(2 ** b - 1)
+ return xhard
+
+ class _uq(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, input, alpha):
+ input = input.div(alpha)
+ input_c = input.clamp(min=-1, max=1) if signed else input.clamp(max=1)
+ input_q = uniform_quant(input_c, b)
+ ctx.save_for_backward(input, input_q)
+ input_q = input_q.mul(alpha)
+ return input_q
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ grad_input = grad_output.clone()
+ input, input_q = ctx.saved_tensors
+ i = (input.abs() > 1.).float()
+ sign = input.sign()
+ grad_alpha = (grad_output * (sign * i + (input_q - input) * (1 - i))).sum()
+ grad_input = grad_input * (1 - i)
+ return grad_input, grad_alpha
+
+    return _uq.apply
+
+class act_quantize_fn(nn.Module):
+ def __init__(self, bit_a, signed=False):
+ super(act_quantize_fn, self).__init__()
+ self.bit_a = bit_a
+ self.signed = signed
+ if signed:
+ self.bit_a -= 1
+ assert bit_a > 0
+
+ self.act_q = act_quantization(b=self.bit_a, signed=signed)
+ self.register_parameter('a_alpha', Parameter(torch.tensor(8.0), requires_grad=True))
+
+ def forward(self, x):
+ return self.act_q(x, self.a_alpha)
+
+ def change_bit(self, bit_a):
+ self.bit_a = bit_a
+ if self.signed:
+ self.bit_a -= 1
+ self.act_q = act_quantization(b=self.bit_a, signed=self.signed)
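+
+
+if __name__ == "__main__":
+    # Minimal usage sketch; bit-widths and tensor shapes are arbitrary example values.
+    wq = weight_quantize_fn(bit_w=4)
+    aq = act_quantize_fn(bit_a=4, signed=False)
+    w_q = wq(torch.randn(8, 8))
+    a_q = aq(torch.rand(1, 8, 16, 16))
+    print(w_q.shape, a_q.shape)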
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/resnet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4122de4ec241e4e6370619d4a702a8856dbe47e2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/resnet.py
@@ -0,0 +1,963 @@
+from functools import partial
+from typing import Type, Any, Callable, Union, List, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from torchvision.transforms._presets import ImageClassification
+from torchvision.utils import _log_api_usage_once
+from torchvision.models._api import WeightsEnum, Weights
+from torchvision.models._meta import _IMAGENET_CATEGORIES
+from torchvision.models._utils import handle_legacy_interface, _ovewrite_named_param
+
+
+__all__ = [
+ "ResNet",
+ "ResNet18_Weights",
+ "ResNet34_Weights",
+ "ResNet50_Weights",
+ "ResNet101_Weights",
+ "ResNet152_Weights",
+ "ResNeXt50_32X4D_Weights",
+ "ResNeXt101_32X8D_Weights",
+ "ResNeXt101_64X4D_Weights",
+ "Wide_ResNet50_2_Weights",
+ "Wide_ResNet101_2_Weights",
+ "resnet18",
+ "resnet34",
+ "resnet50",
+ "resnet101",
+ "resnet152",
+ "resnext50_32x4d",
+ "resnext101_32x8d",
+ "resnext101_64x4d",
+ "wide_resnet50_2",
+ "wide_resnet101_2",
+]
+
+
+def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
+ """3x3 convolution with padding"""
+ return nn.Conv2d(
+ in_planes,
+ out_planes,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ groups=groups,
+ bias=False,
+ dilation=dilation,
+ )
+
+
+def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
+ """1x1 convolution"""
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion: int = 1
+
+ def __init__(
+ self,
+ inplanes: int,
+ planes: int,
+ stride: int = 1,
+ downsample: Optional[nn.Module] = None,
+ groups: int = 1,
+ base_width: int = 64,
+ dilation: int = 1,
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
+ ) -> None:
+ super().__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ if groups != 1 or base_width != 64:
+ raise ValueError("BasicBlock only supports groups=1 and base_width=64")
+ if dilation > 1:
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+ # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+ self.conv1 = conv3x3(inplanes, planes, stride)
+ self.bn1 = norm_layer(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = norm_layer(planes)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x: Tensor) -> Tensor:
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
+ # while original implementation places the stride at the first 1x1 convolution(self.conv1)
+    # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
+ # This variant is also known as ResNet V1.5 and improves accuracy according to
+ # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+ expansion: int = 4
+
+ def __init__(
+ self,
+ inplanes: int,
+ planes: int,
+ stride: int = 1,
+ downsample: Optional[nn.Module] = None,
+ groups: int = 1,
+ base_width: int = 64,
+ dilation: int = 1,
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
+ ) -> None:
+ super().__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ width = int(planes * (base_width / 64.0)) * groups
+ # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+ self.conv1 = conv1x1(inplanes, width)
+ self.bn1 = norm_layer(width)
+ self.conv2 = conv3x3(width, width, stride, groups, dilation)
+ self.bn2 = norm_layer(width)
+ self.conv3 = conv1x1(width, planes * self.expansion)
+ self.bn3 = norm_layer(planes * self.expansion)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x: Tensor) -> Tensor:
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Module):
+ def __init__(
+ self,
+ block: Type[Union[BasicBlock, Bottleneck]],
+ layers: List[int],
+ num_classes: int = 1000,
+ zero_init_residual: bool = False,
+ groups: int = 1,
+ width_per_group: int = 64,
+ replace_stride_with_dilation: Optional[List[bool]] = None,
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
+ ) -> None:
+ super().__init__()
+ _log_api_usage_once(self)
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ self._norm_layer = norm_layer
+
+ self.inplanes = 64
+ self.dilation = 1
+ if replace_stride_with_dilation is None:
+ # each element in the tuple indicates if we should replace
+ # the 2x2 stride with a dilated convolution instead
+ replace_stride_with_dilation = [False, False, False]
+ if len(replace_stride_with_dilation) != 3:
+ raise ValueError(
+ "replace_stride_with_dilation should be None "
+ f"or a 3-element tuple, got {replace_stride_with_dilation}"
+ )
+ self.groups = groups
+ self.base_width = width_per_group
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+ self.bn1 = norm_layer(self.inplanes)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, layers[0])
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])
+ # self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+ # self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+
+ # Zero-initialize the last BN in each residual branch,
+ # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+ if zero_init_residual:
+ for m in self.modules():
+ if isinstance(m, Bottleneck) and m.bn3.weight is not None:
+ nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type]
+ elif isinstance(m, BasicBlock) and m.bn2.weight is not None:
+ nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type]
+
+ def _make_layer(
+ self,
+ block: Type[Union[BasicBlock, Bottleneck]],
+ planes: int,
+ blocks: int,
+ stride: int = 1,
+ dilate: bool = False,
+ ) -> nn.Sequential:
+ norm_layer = self._norm_layer
+ downsample = None
+ previous_dilation = self.dilation
+ if dilate:
+ self.dilation *= stride
+ stride = 1
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ conv1x1(self.inplanes, planes * block.expansion, stride),
+ norm_layer(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(
+ block(
+ self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
+ )
+ )
+ self.inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(
+ block(
+ self.inplanes,
+ planes,
+ groups=self.groups,
+ base_width=self.base_width,
+ dilation=self.dilation,
+ norm_layer=norm_layer,
+ )
+ )
+
+ return nn.Sequential(*layers)
+
+    def _forward_impl(self, x: Tensor) -> List[Tensor]:
+ # See note [TorchScript super()]
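+        # Modified from torchvision: collect the outputs of layer1-layer4 and return
+        # them as a list of multi-scale feature maps; the average-pool / fc head below
+        # is bypassed.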
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+ feats = []
+ x = self.layer1(x)
+ feats.append(x)
+ x = self.layer2(x)
+ feats.append(x)
+ x = self.layer3(x)
+ feats.append(x)
+ x = self.layer4(x)
+ feats.append(x)
+
+ # x = self.avgpool(x)
+ # x = torch.flatten(x, 1)
+ # x = self.fc(x)
+
+ return feats
+
+    def forward(self, x: Tensor) -> List[Tensor]:
+ return self._forward_impl(x)
+
+
+def _resnet(
+ block: Type[Union[BasicBlock, Bottleneck]],
+ layers: List[int],
+ weights: Optional[WeightsEnum],
+ progress: bool,
+ **kwargs: Any,
+) -> ResNet:
+ if weights is not None:
+ _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+ model = ResNet(block, layers, **kwargs)
+
+ if weights is not None:
+ model.load_state_dict(weights.get_state_dict(progress=progress), strict=False)
+
+ return model
+
+
+_COMMON_META = {
+ "min_size": (1, 1),
+ "categories": _IMAGENET_CATEGORIES,
+}
+
+
+class ResNet18_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnet18-f37072fd.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 11689512,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 69.758,
+ "acc@5": 89.078,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class ResNet34_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnet34-b627a593.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 21797672,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 73.314,
+ "acc@5": 91.420,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class ResNet50_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnet50-0676ba61.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 25557032,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 76.130,
+ "acc@5": 92.862,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/resnet50-11ad3fa6.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 25557032,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#issuecomment-1013906621",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 80.858,
+ "acc@5": 95.434,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+class ResNet101_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnet101-63fe2227.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 44549160,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 77.374,
+ "acc@5": 93.546,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/resnet101-cd907fc2.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 44549160,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 81.886,
+ "acc@5": 95.780,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+class ResNet152_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnet152-394f9c45.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 60192808,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 78.312,
+ "acc@5": 94.046,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/resnet152-f82ba261.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 60192808,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 82.284,
+ "acc@5": 96.002,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+class ResNeXt50_32X4D_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 25028904,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 77.618,
+ "acc@5": 93.698,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 25028904,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 81.198,
+ "acc@5": 95.340,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+class ResNeXt101_32X8D_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 88791336,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 79.312,
+ "acc@5": 94.526,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/resnext101_32x8d-110c445d.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 88791336,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 82.834,
+ "acc@5": 96.228,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+class ResNeXt101_64X4D_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/resnext101_64x4d-173b62eb.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 83455272,
+ "recipe": "https://github.com/pytorch/vision/pull/5935",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 83.246,
+ "acc@5": 96.454,
+ }
+ },
+ "_docs": """
+ These weights were trained from scratch by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class Wide_ResNet50_2_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 68883240,
+ "recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 78.468,
+ "acc@5": 94.086,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/wide_resnet50_2-9ba9bcbe.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 68883240,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 81.602,
+ "acc@5": 95.758,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+class Wide_ResNet101_2_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 126886696,
+ "recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 78.848,
+ "acc@5": 94.284,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
+ },
+ )
+ IMAGENET1K_V2 = Weights(
+ url="https://download.pytorch.org/models/wide_resnet101_2-d733dc28.pth",
+ transforms=partial(ImageClassification, crop_size=224, resize_size=232),
+ meta={
+ **_COMMON_META,
+ "num_params": 126886696,
+ "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 82.510,
+ "acc@5": 96.020,
+ }
+ },
+ "_docs": """
+ These weights improve upon the results of the original paper by using TorchVision's `new training recipe
+ `_.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V2
+
+
+@handle_legacy_interface(weights=("pretrained", ResNet18_Weights.IMAGENET1K_V1))
+def resnet18(*, weights: Optional[ResNet18_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+ """ResNet-18 from `Deep Residual Learning for Image Recognition `__.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNet18_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNet18_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.ResNet18_Weights
+ :members:
+ """
+ weights = ResNet18_Weights.verify(weights)
+
+ return _resnet(BasicBlock, [2, 2, 2, 2], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", ResNet34_Weights.IMAGENET1K_V1))
+def resnet34(*, weights: Optional[ResNet34_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+ """ResNet-34 from `Deep Residual Learning for Image Recognition `__.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNet34_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNet34_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.ResNet34_Weights
+ :members:
+ """
+ weights = ResNet34_Weights.verify(weights)
+
+ return _resnet(BasicBlock, [3, 4, 6, 3], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", ResNet50_Weights.IMAGENET1K_V1))
+def resnet50(*, weights: Optional[ResNet50_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+ """ResNet-50 from `Deep Residual Learning for Image Recognition `__.
+
+ .. note::
+ The bottleneck of TorchVision places the stride for downsampling to the second 3x3
+ convolution while the original paper places it to the first 1x1 convolution.
+ This variant improves the accuracy and is known as `ResNet V1.5
+ `_.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNet50_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNet50_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.ResNet50_Weights
+ :members:
+ """
+ weights = ResNet50_Weights.verify(weights)
+
+ return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", ResNet101_Weights.IMAGENET1K_V1))
+def resnet101(*, weights: Optional[ResNet101_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+ """ResNet-101 from `Deep Residual Learning for Image Recognition `__.
+
+ .. note::
+ The bottleneck of TorchVision places the stride for downsampling to the second 3x3
+ convolution while the original paper places it to the first 1x1 convolution.
+ This variant improves the accuracy and is known as `ResNet V1.5
+ `_.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNet101_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNet101_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.ResNet101_Weights
+ :members:
+ """
+ weights = ResNet101_Weights.verify(weights)
+
+ return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", ResNet152_Weights.IMAGENET1K_V1))
+def resnet152(*, weights: Optional[ResNet152_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet:
+ """ResNet-152 from `Deep Residual Learning for Image Recognition `__.
+
+ .. note::
+ The bottleneck of TorchVision places the stride for downsampling to the second 3x3
+ convolution while the original paper places it to the first 1x1 convolution.
+ This variant improves the accuracy and is known as `ResNet V1.5
+ `_.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNet152_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNet152_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.ResNet152_Weights
+ :members:
+ """
+ weights = ResNet152_Weights.verify(weights)
+
+ return _resnet(Bottleneck, [3, 8, 36, 3], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", ResNeXt50_32X4D_Weights.IMAGENET1K_V1))
+def resnext50_32x4d(
+ *, weights: Optional[ResNeXt50_32X4D_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+ """ResNeXt-50 32x4d model from
+    `Aggregated Residual Transformation for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNeXt50_32X4D_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNext50_32X4D_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.ResNeXt50_32X4D_Weights
+ :members:
+ """
+ weights = ResNeXt50_32X4D_Weights.verify(weights)
+
+ _ovewrite_named_param(kwargs, "groups", 32)
+ _ovewrite_named_param(kwargs, "width_per_group", 4)
+ return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", ResNeXt101_32X8D_Weights.IMAGENET1K_V1))
+def resnext101_32x8d(
+ *, weights: Optional[ResNeXt101_32X8D_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+ """ResNeXt-101 32x8d model from
+    `Aggregated Residual Transformation for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNeXt101_32X8D_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNeXt101_32X8D_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.ResNeXt101_32X8D_Weights
+ :members:
+ """
+ weights = ResNeXt101_32X8D_Weights.verify(weights)
+
+ _ovewrite_named_param(kwargs, "groups", 32)
+ _ovewrite_named_param(kwargs, "width_per_group", 8)
+ return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)
+
+
+def resnext101_64x4d(
+ *, weights: Optional[ResNeXt101_64X4D_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+ """ResNeXt-101 64x4d model from
+    `Aggregated Residual Transformation for Deep Neural Networks <https://arxiv.org/abs/1611.05431>`_.
+
+ Args:
+ weights (:class:`~torchvision.models.ResNeXt101_64X4D_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.ResNeXt101_64X4D_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.ResNeXt101_64X4D_Weights
+ :members:
+ """
+ weights = ResNeXt101_64X4D_Weights.verify(weights)
+
+ _ovewrite_named_param(kwargs, "groups", 64)
+ _ovewrite_named_param(kwargs, "width_per_group", 4)
+ return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", Wide_ResNet50_2_Weights.IMAGENET1K_V1))
+def wide_resnet50_2(
+ *, weights: Optional[Wide_ResNet50_2_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+ """Wide ResNet-50-2 model from
+    `Wide Residual Networks <https://arxiv.org/abs/1605.07146>`_.
+
+ The model is the same as ResNet except for the bottleneck number of channels
+ which is twice larger in every block. The number of channels in outer 1x1
+ convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+ channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+
+ Args:
+ weights (:class:`~torchvision.models.Wide_ResNet50_2_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.Wide_ResNet50_2_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.Wide_ResNet50_2_Weights
+ :members:
+ """
+ weights = Wide_ResNet50_2_Weights.verify(weights)
+
+ _ovewrite_named_param(kwargs, "width_per_group", 64 * 2)
+ return _resnet(Bottleneck, [3, 4, 6, 3], weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", Wide_ResNet101_2_Weights.IMAGENET1K_V1))
+def wide_resnet101_2(
+ *, weights: Optional[Wide_ResNet101_2_Weights] = None, progress: bool = True, **kwargs: Any
+) -> ResNet:
+ """Wide ResNet-101-2 model from
+    `Wide Residual Networks <https://arxiv.org/abs/1605.07146>`_.
+
+ The model is the same as ResNet except for the bottleneck number of channels
+ which is twice larger in every block. The number of channels in outer 1x1
+ convolutions is the same, e.g. last block in ResNet-101 has 2048-512-2048
+ channels, and in Wide ResNet-101-2 has 2048-1024-2048.
+
+ Args:
+ weights (:class:`~torchvision.models.Wide_ResNet101_2_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.Wide_ResNet101_2_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+ .. autoclass:: torchvision.models.Wide_ResNet101_2_Weights
+ :members:
+ """
+ weights = Wide_ResNet101_2_Weights.verify(weights)
+
+ _ovewrite_named_param(kwargs, "width_per_group", 64 * 2)
+ return _resnet(Bottleneck, [3, 4, 23, 3], weights, progress, **kwargs)
+
+
+# The dictionary below is internal implementation detail and will be removed in v0.15
+from torchvision.models._utils import _ModelURLs
+
+
+model_urls = _ModelURLs(
+ {
+ "resnet18": ResNet18_Weights.IMAGENET1K_V1.url,
+ "resnet34": ResNet34_Weights.IMAGENET1K_V1.url,
+ "resnet50": ResNet50_Weights.IMAGENET1K_V1.url,
+ "resnet101": ResNet101_Weights.IMAGENET1K_V1.url,
+ "resnet152": ResNet152_Weights.IMAGENET1K_V1.url,
+ "resnext50_32x4d": ResNeXt50_32X4D_Weights.IMAGENET1K_V1.url,
+ "resnext101_32x8d": ResNeXt101_32X8D_Weights.IMAGENET1K_V1.url,
+ "wide_resnet50_2": Wide_ResNet50_2_Weights.IMAGENET1K_V1.url,
+ "wide_resnet101_2": Wide_ResNet101_2_Weights.IMAGENET1K_V1.url,
+ }
+)
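+
+
+if __name__ == "__main__":
+    # Usage sketch: this vendored ResNet returns the four stage feature maps
+    # (layer1-layer4) instead of classification logits; the 320x320 input is an
+    # arbitrary example size.
+    model = resnet50(weights=None)
+    feats = model(torch.rand(1, 3, 320, 320))
+    print([tuple(f.shape) for f in feats])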
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/swin_transformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/swin_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f198a88d20279d7bdefd698e4d836ce6071863a8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/swin_transformer.py
@@ -0,0 +1,651 @@
+from functools import partial
+from typing import Optional, Callable, List, Any
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from torchvision.ops.misc import MLP, Permute
+from torchvision.ops.stochastic_depth import StochasticDepth
+from torchvision.transforms._presets import ImageClassification, InterpolationMode
+from torchvision.utils import _log_api_usage_once
+from torchvision.models._api import WeightsEnum, Weights
+from torchvision.models._meta import _IMAGENET_CATEGORIES
+from torchvision.models._utils import _ovewrite_named_param
+
+
+__all__ = [
+ "SwinTransformer",
+ "Swin_T_Weights",
+ "Swin_S_Weights",
+ "Swin_B_Weights",
+ "swin_t",
+ "swin_s",
+ "swin_b",
+]
+
+
+def _patch_merging_pad(x):
+ H, W, _ = x.shape[-3:]
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+ return x
+
+
+torch.fx.wrap("_patch_merging_pad")
+
+
+class PatchMerging(nn.Module):
+ """Patch Merging Layer.
+ Args:
+ dim (int): Number of input channels.
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+ """
+
+ def __init__(self, dim: int, norm_layer: Callable[..., nn.Module] = nn.LayerNorm):
+ super().__init__()
+ _log_api_usage_once(self)
+ self.dim = dim
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+ self.norm = norm_layer(4 * dim)
+
+ def forward(self, x: Tensor):
+ """
+ Args:
+ x (Tensor): input tensor with expected layout of [..., H, W, C]
+ Returns:
+ Tensor with layout of [..., H/2, W/2, 2*C]
+ """
+ x = _patch_merging_pad(x)
+
+ x0 = x[..., 0::2, 0::2, :] # ... H/2 W/2 C
+ x1 = x[..., 1::2, 0::2, :] # ... H/2 W/2 C
+ x2 = x[..., 0::2, 1::2, :] # ... H/2 W/2 C
+ x3 = x[..., 1::2, 1::2, :] # ... H/2 W/2 C
+ x = torch.cat([x0, x1, x2, x3], -1) # ... H/2 W/2 4*C
+
+ x = self.norm(x)
+ x = self.reduction(x) # ... H/2 W/2 2*C
+ return x
+
+
+def shifted_window_attention(
+ input: Tensor,
+ qkv_weight: Tensor,
+ proj_weight: Tensor,
+ relative_position_bias: Tensor,
+ window_size: List[int],
+ num_heads: int,
+ shift_size: List[int],
+ attention_dropout: float = 0.0,
+ dropout: float = 0.0,
+ qkv_bias: Optional[Tensor] = None,
+ proj_bias: Optional[Tensor] = None,
+):
+ """
+ Window based multi-head self attention (W-MSA) module with relative position bias.
+ It supports both of shifted and non-shifted window.
+ Args:
+        input (Tensor[N, H, W, C]): The input tensor of 4 dimensions.
+ qkv_weight (Tensor[in_dim, out_dim]): The weight tensor of query, key, value.
+ proj_weight (Tensor[out_dim, out_dim]): The weight tensor of projection.
+ relative_position_bias (Tensor): The learned relative position bias added to attention.
+ window_size (List[int]): Window size.
+ num_heads (int): Number of attention heads.
+ shift_size (List[int]): Shift size for shifted window attention.
+ attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
+ dropout (float): Dropout ratio of output. Default: 0.0.
+ qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
+ proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
+ Returns:
+ Tensor[N, H, W, C]: The output tensor after shifted window attention.
+ """
+ B, H, W, C = input.shape
+ # pad feature maps to multiples of window size
+ pad_r = (window_size[1] - W % window_size[1]) % window_size[1]
+ pad_b = (window_size[0] - H % window_size[0]) % window_size[0]
+ x = F.pad(input, (0, 0, 0, pad_r, 0, pad_b))
+ _, pad_H, pad_W, _ = x.shape
+
+ # If window size is larger than feature size, there is no need to shift window
+ if window_size[0] >= pad_H:
+ shift_size[0] = 0
+ if window_size[1] >= pad_W:
+ shift_size[1] = 0
+
+ # cyclic shift
+ if sum(shift_size) > 0:
+ x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1]), dims=(1, 2))
+
+ # partition windows
+ num_windows = (pad_H // window_size[0]) * (pad_W // window_size[1])
+ x = x.view(B, pad_H // window_size[0], window_size[0], pad_W // window_size[1], window_size[1], C)
+ x = x.permute(0, 1, 3, 2, 4, 5).reshape(B * num_windows, window_size[0] * window_size[1], C) # B*nW, Ws*Ws, C
+
+ # multi-head attention
+ qkv = F.linear(x, qkv_weight, qkv_bias)
+ qkv = qkv.reshape(x.size(0), x.size(1), 3, num_heads, C // num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+ q = q * (C // num_heads) ** -0.5
+ attn = q.matmul(k.transpose(-2, -1))
+ # add relative position bias
+ attn = attn + relative_position_bias
+
+ if sum(shift_size) > 0:
+ # generate attention mask
+ attn_mask = x.new_zeros((pad_H, pad_W))
+ h_slices = ((0, -window_size[0]), (-window_size[0], -shift_size[0]), (-shift_size[0], None))
+ w_slices = ((0, -window_size[1]), (-window_size[1], -shift_size[1]), (-shift_size[1], None))
+ count = 0
+ for h in h_slices:
+ for w in w_slices:
+ attn_mask[h[0] : h[1], w[0] : w[1]] = count
+ count += 1
+ attn_mask = attn_mask.view(pad_H // window_size[0], window_size[0], pad_W // window_size[1], window_size[1])
+ attn_mask = attn_mask.permute(0, 2, 1, 3).reshape(num_windows, window_size[0] * window_size[1])
+ attn_mask = attn_mask.unsqueeze(1) - attn_mask.unsqueeze(2)
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+ attn = attn.view(x.size(0) // num_windows, num_windows, num_heads, x.size(1), x.size(1))
+ attn = attn + attn_mask.unsqueeze(1).unsqueeze(0)
+ attn = attn.view(-1, num_heads, x.size(1), x.size(1))
+
+ attn = F.softmax(attn, dim=-1)
+ attn = F.dropout(attn, p=attention_dropout)
+
+ x = attn.matmul(v).transpose(1, 2).reshape(x.size(0), x.size(1), C)
+ x = F.linear(x, proj_weight, proj_bias)
+ x = F.dropout(x, p=dropout)
+
+ # reverse windows
+ x = x.view(B, pad_H // window_size[0], pad_W // window_size[1], window_size[0], window_size[1], C)
+ x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, pad_H, pad_W, C)
+
+ # reverse cyclic shift
+ if sum(shift_size) > 0:
+ x = torch.roll(x, shifts=(shift_size[0], shift_size[1]), dims=(1, 2))
+
+ # unpad features
+ x = x[:, :H, :W, :].contiguous()
+ return x
+
+
+torch.fx.wrap("shifted_window_attention")
+
+
+class ShiftedWindowAttention(nn.Module):
+ """
+ See :func:`shifted_window_attention`.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ window_size: List[int],
+ shift_size: List[int],
+ num_heads: int,
+ qkv_bias: bool = True,
+ proj_bias: bool = True,
+ attention_dropout: float = 0.0,
+ dropout: float = 0.0,
+ ):
+ super().__init__()
+ if len(window_size) != 2 or len(shift_size) != 2:
+ raise ValueError("window_size and shift_size must be of length 2")
+ self.window_size = window_size
+ self.shift_size = shift_size
+ self.num_heads = num_heads
+ self.attention_dropout = attention_dropout
+ self.dropout = dropout
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
+
+ # define a parameter table of relative position bias
+ self.relative_position_bias_table = nn.Parameter(
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
+ ) # 2*Wh-1 * 2*Ww-1, nH
+
+ # get pair-wise relative position index for each token inside the window
+ coords_h = torch.arange(self.window_size[0])
+ coords_w = torch.arange(self.window_size[1])
+ # coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij")) # 2, Wh, Ww
+ coords = torch.stack(torch.meshgrid(coords_h, coords_w)) # 2, Wh, Ww
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
+ relative_coords[:, :, 1] += self.window_size[1] - 1
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+ relative_position_index = relative_coords.sum(-1).view(-1) # Wh*Ww*Wh*Ww
+ self.register_buffer("relative_position_index", relative_position_index)
+
+ nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
+
+ def forward(self, x: Tensor):
+ """
+ Args:
+ x (Tensor): Tensor with layout of [B, H, W, C]
+ Returns:
+ Tensor with same layout as input, i.e. [B, H, W, C]
+ """
+
+ N = self.window_size[0] * self.window_size[1]
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index] # type: ignore[index]
+ relative_position_bias = relative_position_bias.view(N, N, -1)
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous().unsqueeze(0)
+
+ return shifted_window_attention(
+ x,
+ self.qkv.weight,
+ self.proj.weight,
+ relative_position_bias,
+ self.window_size,
+ self.num_heads,
+ shift_size=self.shift_size,
+ attention_dropout=self.attention_dropout,
+ dropout=self.dropout,
+ qkv_bias=self.qkv.bias,
+ proj_bias=self.proj.bias,
+ )
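+
+# Usage sketch for ShiftedWindowAttention (dimensions are arbitrary example values);
+# the module expects and returns channels-last tensors of shape [B, H, W, C]:
+#   attn = ShiftedWindowAttention(dim=96, window_size=[7, 7], shift_size=[3, 3], num_heads=3)
+#   out = attn(torch.rand(1, 56, 56, 96))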
+
+
+class SwinTransformerBlock(nn.Module):
+ """
+ Swin Transformer Block.
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads.
+ window_size (List[int]): Window size.
+ shift_size (List[int]): Shift size for shifted window attention.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
+ dropout (float): Dropout rate. Default: 0.0.
+ attention_dropout (float): Attention dropout rate. Default: 0.0.
+ stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+ attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttention
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int,
+ window_size: List[int],
+ shift_size: List[int],
+ mlp_ratio: float = 4.0,
+ dropout: float = 0.0,
+ attention_dropout: float = 0.0,
+ stochastic_depth_prob: float = 0.0,
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+ attn_layer: Callable[..., nn.Module] = ShiftedWindowAttention,
+ ):
+ super().__init__()
+ _log_api_usage_once(self)
+
+ self.norm1 = norm_layer(dim)
+ self.attn = attn_layer(
+ dim,
+ window_size,
+ shift_size,
+ num_heads,
+ attention_dropout=attention_dropout,
+ dropout=dropout,
+ )
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+ self.norm2 = norm_layer(dim)
+ self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)
+
+ for m in self.mlp.modules():
+ if isinstance(m, nn.Linear):
+ nn.init.xavier_uniform_(m.weight)
+ if m.bias is not None:
+ nn.init.normal_(m.bias, std=1e-6)
+
+ def forward(self, x: Tensor):
+ x = x + self.stochastic_depth(self.attn(self.norm1(x)))
+ x = x + self.stochastic_depth(self.mlp(self.norm2(x)))
+ return x
+
+
+class SwinTransformer(nn.Module):
+ """
+ Implements Swin Transformer from the `"Swin Transformer: Hierarchical Vision Transformer using
+ Shifted Windows" `_ paper.
+ Args:
+ patch_size (List[int]): Patch size.
+ embed_dim (int): Patch embedding dimension.
+ depths (List(int)): Depth of each Swin Transformer layer.
+ num_heads (List(int)): Number of attention heads in different layers.
+ window_size (List[int]): Window size.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
+ dropout (float): Dropout rate. Default: 0.0.
+ attention_dropout (float): Attention dropout rate. Default: 0.0.
+ stochastic_depth_prob (float): Stochastic depth rate. Default: 0.0.
+ num_classes (int): Number of classes for classification head. Default: 1000.
+ block (nn.Module, optional): SwinTransformer Block. Default: None.
+ norm_layer (nn.Module, optional): Normalization layer. Default: None.
+ """
+
+ def __init__(
+ self,
+ patch_size: List[int],
+ embed_dim: int,
+ depths: List[int],
+ num_heads: List[int],
+ window_size: List[int],
+ mlp_ratio: float = 4.0,
+ dropout: float = 0.0,
+ attention_dropout: float = 0.0,
+ stochastic_depth_prob: float = 0.0,
+ num_classes: int = 1000,
+ norm_layer: Optional[Callable[..., nn.Module]] = None,
+ block: Optional[Callable[..., nn.Module]] = None,
+ ):
+ super().__init__()
+ _log_api_usage_once(self)
+ self.num_classes = num_classes
+
+ if block is None:
+ block = SwinTransformerBlock
+
+ if norm_layer is None:
+ norm_layer = partial(nn.LayerNorm, eps=1e-5)
+
+ layers: List[nn.Module] = []
+ # split image into non-overlapping patches
+ # layers.append(
+ # nn.Sequential(
+ # nn.Conv2d(
+ # 3, embed_dim, kernel_size=(patch_size[0], patch_size[1]), stride=(patch_size[0], patch_size[1])
+ # ),
+ # Permute([0, 2, 3, 1]),
+ # norm_layer(embed_dim),
+ # )
+ # )
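+        # Patch-embedding stem, kept outside self.features so that per-stage outputs
+        # can be collected in forward(); note that the attribute name has to match the
+        # keys of any checkpoint that is loaded by parameter name elsewhere.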
+ self.first_coonv = nn.Sequential(
+ nn.Conv2d(
+ 3, embed_dim, kernel_size=(patch_size[0], patch_size[1]), stride=(patch_size[0], patch_size[1])
+ ),
+ Permute([0, 2, 3, 1]),
+ norm_layer(embed_dim),
+ )
+
+ total_stage_blocks = sum(depths)
+ stage_block_id = 0
+ # build SwinTransformer blocks
+ for i_stage in range(len(depths)):
+ stage: List[nn.Module] = []
+ dim = embed_dim * 2 ** i_stage
+ for i_layer in range(depths[i_stage]):
+ # adjust stochastic depth probability based on the depth of the stage block
+ sd_prob = stochastic_depth_prob * float(stage_block_id) / (total_stage_blocks - 1)
+ stage.append(
+ block(
+ dim,
+ num_heads[i_stage],
+ window_size=window_size,
+ shift_size=[0 if i_layer % 2 == 0 else w // 2 for w in window_size],
+ mlp_ratio=mlp_ratio,
+ dropout=dropout,
+ attention_dropout=attention_dropout,
+ stochastic_depth_prob=sd_prob,
+ norm_layer=norm_layer,
+ )
+ )
+ stage_block_id += 1
+ layers.append(nn.Sequential(*stage))
+ # add patch merging layer
+ if i_stage < (len(depths) - 1):
+ layers.append(PatchMerging(dim, norm_layer))
+ # self.features = nn.Sequential(*layers)
+ self.features = nn.ModuleList(layers)
+
+ num_features = embed_dim * 2 ** (len(depths) - 1)
+ self.norm = norm_layer(num_features)
+ self.avgpool = nn.AdaptiveAvgPool2d(1)
+ self.head = nn.Linear(num_features, num_classes)
+
+ for m in self.modules():
+ if isinstance(m, nn.Linear):
+ nn.init.trunc_normal_(m.weight, std=0.02)
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+
+ def forward(self, x):
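+        # Modified from torchvision: instead of classification logits, return the
+        # feature maps produced after each of the four stages (indices 0, 2, 4, 6 of
+        # self.features, skipping the PatchMerging layers), converted back to NCHW.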
+ feats = []
+ x = self.first_coonv(x)
+ for i, layer in enumerate(self.features):
+ x = layer(x)
+ if i in [0, 2, 4, 6]:
+ feats.append(x.permute(0, 3, 1, 2).contiguous())
+ # x = self.features(x)
+ # x = self.norm(x)
+ # x = x.permute(0, 3, 1, 2)
+ # x = self.avgpool(x)
+ # x = torch.flatten(x, 1)
+ # x = self.head(x)
+ return feats
+
+
+def _swin_transformer(
+ patch_size: List[int],
+ embed_dim: int,
+ depths: List[int],
+ num_heads: List[int],
+ window_size: List[int],
+ stochastic_depth_prob: float,
+ weights: Optional[WeightsEnum],
+ progress: bool,
+ **kwargs: Any,
+) -> SwinTransformer:
+ if weights is not None:
+ _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+
+ model = SwinTransformer(
+ patch_size=patch_size,
+ embed_dim=embed_dim,
+ depths=depths,
+ num_heads=num_heads,
+ window_size=window_size,
+ stochastic_depth_prob=stochastic_depth_prob,
+ **kwargs,
+ )
+
+ if weights is not None:
+ ckpt1 = weights.get_state_dict(progress=progress)
+ ckpt2 = model.state_dict()
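+        # The vendored module layout differs from torchvision's (e.g. the patch-embedding
+        # stem lives outside self.features), so the pretrained weights are remapped
+        # positionally; this assumes both state dicts enumerate parameters in the same order.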
+ kl1 = list(ckpt1.keys())
+ for i, k in enumerate(list(ckpt2.keys())):
+ ckpt2[k] = ckpt1[kl1[i]]
+ msg = model.load_state_dict(ckpt2, strict=False)
+ print(f'Load swin_transformer: {msg}')
+
+ return model
+
+
+_COMMON_META = {
+ "categories": _IMAGENET_CATEGORIES,
+}
+
+
+class Swin_T_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/swin_t-704ceda3.pth",
+ transforms=partial(
+ ImageClassification, crop_size=224, resize_size=232, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META,
+ "num_params": 28288354,
+ "min_size": (224, 224),
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 81.474,
+ "acc@5": 95.776,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class Swin_S_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/swin_s-5e29d889.pth",
+ transforms=partial(
+ ImageClassification, crop_size=224, resize_size=246, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META,
+ "num_params": 49606258,
+ "min_size": (224, 224),
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 83.196,
+ "acc@5": 96.360,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class Swin_B_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/swin_b-68c6b09e.pth",
+ transforms=partial(
+ ImageClassification, crop_size=224, resize_size=238, interpolation=InterpolationMode.BICUBIC
+ ),
+ meta={
+ **_COMMON_META,
+ "num_params": 87768224,
+ "min_size": (224, 224),
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#swintransformer",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 83.582,
+ "acc@5": 96.640,
+ }
+ },
+ "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+def swin_t(*, weights: Optional[Swin_T_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+ """
+ Constructs a swin_tiny architecture from
+    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/abs/2103.14030>`_.
+
+ Args:
+ weights (:class:`~torchvision.models.Swin_T_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.Swin_T_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+ base class. Please refer to the `source code
+ `_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.Swin_T_Weights
+ :members:
+ """
+ weights = Swin_T_Weights.verify(weights)
+
+ return _swin_transformer(
+ patch_size=[4, 4],
+ embed_dim=96,
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 24],
+ window_size=[7, 7],
+ stochastic_depth_prob=0.2,
+ weights=weights,
+ progress=progress,
+ **kwargs,
+ )
+
+
+def swin_s(*, weights: Optional[Swin_S_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+ """
+ Constructs a swin_small architecture from
+ `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/abs/2103.14030>`_.
+
+ Args:
+ weights (:class:`~torchvision.models.Swin_S_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.Swin_S_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.Swin_S_Weights
+ :members:
+ """
+ weights = Swin_S_Weights.verify(weights)
+
+ return _swin_transformer(
+ patch_size=[4, 4],
+ embed_dim=96,
+ depths=[2, 2, 18, 2],
+ num_heads=[3, 6, 12, 24],
+ window_size=[7, 7],
+ stochastic_depth_prob=0.3,
+ weights=weights,
+ progress=progress,
+ **kwargs,
+ )
+
+
+from torchvision.models._utils import handle_legacy_interface
+@handle_legacy_interface(weights=("pretrained", Swin_B_Weights.IMAGENET1K_V1))
+def swin_b(*, weights: Optional[Swin_B_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer:
+ """
+ Constructs a swin_base architecture from
+ `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/abs/2103.14030>`_.
+
+ Args:
+ weights (:class:`~torchvision.models.Swin_B_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.Swin_B_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.Swin_B_Weights
+ :members:
+ """
+ weights = Swin_B_Weights.verify(weights)
+
+ return _swin_transformer(
+ patch_size=[4, 4],
+ embed_dim=128,
+ depths=[2, 2, 18, 2],
+ num_heads=[4, 8, 16, 32],
+ window_size=[7, 7],
+ stochastic_depth_prob=0.5,
+ weights=weights,
+ progress=progress,
+ **kwargs,
+ )
+
+if __name__ == '__main__':
+ model = swin_b(weights=Swin_B_Weights.DEFAULT)  # pass an enum member, not the enum class
+ x = torch.rand(1, 3, 320, 320)
+ y = model(x)
+ pause = 0
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/uncond_unet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/uncond_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3517e95bfa45ad9b2a0e78956714bf6a80fd31c5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/uncond_unet.py
@@ -0,0 +1,376 @@
+import math
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from einops import rearrange, reduce
+from functools import partial
+
+
+def exists(x):
+ return x is not None
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if callable(d) else d
+
+def identity(t, *args, **kwargs):
+ return t
+
+def cycle(dl):
+ while True:
+ for data in dl:
+ yield data
+
+def has_int_squareroot(num):
+ return (math.sqrt(num) ** 2) == num
+
+def num_to_groups(num, divisor):
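+ # e.g. num_to_groups(10, 4) -> [4, 4, 2]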
+ groups = num // divisor
+ remainder = num % divisor
+ arr = [divisor] * groups
+ if remainder > 0:
+ arr.append(remainder)
+ return arr
+
+def convert_image_to_fn(img_type, image):
+ if image.mode != img_type:
+ return image.convert(img_type)
+ return image
+
+# normalization functions
+
+def normalize_to_neg_one_to_one(img):
+ return img * 2 - 1
+
+def unnormalize_to_zero_to_one(t):
+ return (t + 1) * 0.5
+
+# small helper modules
+
+class Residual(nn.Module):
+ def __init__(self, fn):
+ super().__init__()
+ self.fn = fn
+
+ def forward(self, x, *args, **kwargs):
+ return self.fn(x, *args, **kwargs) + x
+
+def Upsample(dim, dim_out = None):
+ return nn.Sequential(
+ nn.Upsample(scale_factor = 2, mode = 'nearest'),
+ nn.Conv2d(dim, default(dim_out, dim), 3, padding = 1)
+ )
+
+def Downsample(dim, dim_out = None):
+ return nn.Conv2d(dim, default(dim_out, dim), 4, 2, 1)
+
+class WeightStandardizedConv2d(nn.Conv2d):
+ """
+ https://arxiv.org/abs/1903.10520
+ weight standardization purportedly works synergistically with group normalization
+ """
+ def forward(self, x):
+ eps = 1e-5 if x.dtype == torch.float32 else 1e-3
+
+ weight = self.weight
+ mean = reduce(weight, 'o ... -> o 1 1 1', 'mean')
+ var = reduce(weight, 'o ... -> o 1 1 1', partial(torch.var, unbiased = False))
+ normalized_weight = (weight - mean) * (var + eps).rsqrt()
+
+ return F.conv2d(x, normalized_weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
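+
+ # In effect each output filter is standardized before the convolution is applied:
+ # w_hat[o] = (w[o] - mean(w[o])) / sqrt(var(w[o]) + eps)
+ # which makes this a drop-in replacement for nn.Conv2d, here paired with GroupNorm
+ # in Block below as the linked paper suggests.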
+
+class LayerNorm(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
+
+ def forward(self, x):
+ eps = 1e-5 if x.dtype == torch.float32 else 1e-3
+ var = torch.var(x, dim = 1, unbiased = False, keepdim = True)
+ mean = torch.mean(x, dim = 1, keepdim = True)
+ return (x - mean) * (var + eps).rsqrt() * self.g
+
+class PreNorm(nn.Module):
+ def __init__(self, dim, fn):
+ super().__init__()
+ self.fn = fn
+ self.norm = LayerNorm(dim)
+
+ def forward(self, x):
+ x = self.norm(x)
+ return self.fn(x)
+
+# sinusoidal positional embeds
+
+class SinusoidalPosEmb(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x):
+ device = x.device
+ half_dim = self.dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+ emb = x[:, None] * emb[None, :]
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+ return emb
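+
+ # e.g. a (b,) tensor of timesteps is mapped to a (b, dim) embedding whose first
+ # half holds the sine features and second half the cosine features.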
+
+class RandomOrLearnedSinusoidalPosEmb(nn.Module):
+ """ following @crowsonkb 's lead with random (learned optional) sinusoidal pos emb """
+ """ https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8 """
+
+ def __init__(self, dim, is_random = False):
+ super().__init__()
+ assert (dim % 2) == 0
+ half_dim = dim // 2
+ self.weights = nn.Parameter(torch.randn(half_dim), requires_grad = not is_random)
+
+ def forward(self, x):
+ x = rearrange(x, 'b -> b 1')
+ freqs = x * rearrange(self.weights, 'd -> 1 d') * 2 * math.pi
+ fouriered = torch.cat((freqs.sin(), freqs.cos()), dim = -1)
+ fouriered = torch.cat((x, fouriered), dim = -1)
+ return fouriered
+
+# building block modules
+
+class Block(nn.Module):
+ def __init__(self, dim, dim_out, groups = 8):
+ super().__init__()
+ self.proj = WeightStandardizedConv2d(dim, dim_out, 3, padding = 1)
+ self.norm = nn.GroupNorm(groups, dim_out)
+ self.act = nn.SiLU()
+
+ def forward(self, x, scale_shift = None):
+ x = self.proj(x)
+ x = self.norm(x)
+
+ if exists(scale_shift):
+ scale, shift = scale_shift
+ x = x * (scale + 1) + shift
+
+ x = self.act(x)
+ return x
+
+class ResnetBlock(nn.Module):
+ def __init__(self, dim, dim_out, *, time_emb_dim = None, groups = 8):
+ super().__init__()
+ self.mlp = nn.Sequential(
+ nn.SiLU(),
+ nn.Linear(time_emb_dim, dim_out * 2)
+ ) if exists(time_emb_dim) else None
+
+ self.block1 = Block(dim, dim_out, groups = groups)
+ self.block2 = Block(dim_out, dim_out, groups = groups)
+ self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+ def forward(self, x, time_emb = None):
+
+ scale_shift = None
+ if exists(self.mlp) and exists(time_emb):
+ time_emb = self.mlp(time_emb)
+ time_emb = rearrange(time_emb, 'b c -> b c 1 1')
+ scale_shift = time_emb.chunk(2, dim = 1)
+
+ h = self.block1(x, scale_shift = scale_shift)
+
+ h = self.block2(h)
+
+ return h + self.res_conv(x)
+
+class LinearAttention(nn.Module):
+ def __init__(self, dim, heads = 4, dim_head = 32):
+ super().__init__()
+ self.scale = dim_head ** -0.5
+ self.heads = heads
+ hidden_dim = dim_head * heads
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+
+ self.to_out = nn.Sequential(
+ nn.Conv2d(hidden_dim, dim, 1),
+ LayerNorm(dim)
+ )
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ qkv = self.to_qkv(x).chunk(3, dim = 1)
+ q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = self.heads), qkv)
+
+ q = q.softmax(dim = -2)
+ k = k.softmax(dim = -1)
+
+ q = q * self.scale
+ v = v / (h * w)
+
+ context = torch.einsum('b h d n, b h e n -> b h d e', k, v)
+
+ out = torch.einsum('b h d e, b h d n -> b h e n', context, q)
+ out = rearrange(out, 'b h c (x y) -> b (h c) x y', h = self.heads, x = h, y = w)
+ return self.to_out(out)
+
+class Attention(nn.Module):
+ def __init__(self, dim, heads = 4, dim_head = 32):
+ super().__init__()
+ self.scale = dim_head ** -0.5
+ self.heads = heads
+ hidden_dim = dim_head * heads
+
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ qkv = self.to_qkv(x).chunk(3, dim = 1)
+ q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = self.heads), qkv)
+
+ q = q * self.scale
+
+ sim = einsum('b h d i, b h d j -> b h i j', q, k)
+ attn = sim.softmax(dim = -1)
+ out = einsum('b h i j, b h d j -> b h i d', attn, v)
+
+ out = rearrange(out, 'b h (x y) d -> b (h d) x y', x = h, y = w)
+ return self.to_out(out)
+
+# model
+
+class Unet(nn.Module):
+ def __init__(
+ self,
+ dim,
+ init_dim = None,
+ out_dim = None,
+ dim_mults=(1, 2, 4, 8),
+ channels = 3,
+ self_condition = False,
+ resnet_block_groups = 8,
+ heads=8,
+ learned_variance = False,
+ learned_sinusoidal_cond = False,
+ random_fourier_features = False,
+ learned_sinusoidal_dim = 16,
+ out_mul=1,
+ ):
+ super().__init__()
+
+ # determine dimensions
+
+ self.channels = channels
+ self.self_condition = self_condition
+ input_channels = channels * (2 if self_condition else 1)
+
+ init_dim = default(init_dim, dim)
+ self.init_conv = nn.Conv2d(input_channels, init_dim, 7, padding = 3)
+
+ dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
+ in_out = list(zip(dims[:-1], dims[1:]))
+
+ block_klass = partial(ResnetBlock, groups = resnet_block_groups)
+
+ # time embeddings
+
+ time_dim = dim * 4
+
+ self.random_or_learned_sinusoidal_cond = learned_sinusoidal_cond or random_fourier_features
+
+ if self.random_or_learned_sinusoidal_cond:
+ sinu_pos_emb = RandomOrLearnedSinusoidalPosEmb(learned_sinusoidal_dim, random_fourier_features)
+ fourier_dim = learned_sinusoidal_dim + 1
+ else:
+ sinu_pos_emb = SinusoidalPosEmb(dim)
+ fourier_dim = dim
+
+ self.time_mlp = nn.Sequential(
+ sinu_pos_emb,
+ nn.Linear(fourier_dim, time_dim),
+ nn.GELU(),
+ nn.Linear(time_dim, time_dim)
+ )
+
+ # layers
+
+ self.downs = nn.ModuleList([])
+ self.ups = nn.ModuleList([])
+ num_resolutions = len(in_out)
+
+ for ind, (dim_in, dim_out) in enumerate(in_out):
+ is_last = ind >= (num_resolutions - 1)
+
+ self.downs.append(nn.ModuleList([
+ block_klass(dim_in, dim_in, time_emb_dim = time_dim),
+ block_klass(dim_in, dim_in, time_emb_dim = time_dim),
+ Residual(PreNorm(dim_in, LinearAttention(dim_in, heads=heads))),
+ Downsample(dim_in, dim_out) if not is_last else nn.Conv2d(dim_in, dim_out, 3, padding = 1)
+ ]))
+
+ mid_dim = dims[-1]
+ self.mid_block1 = block_klass(mid_dim, mid_dim, time_emb_dim = time_dim)
+ self.mid_attn = Residual(PreNorm(mid_dim, Attention(mid_dim, heads=heads)))
+ self.mid_block2 = block_klass(mid_dim, mid_dim, time_emb_dim = time_dim)
+
+ for ind, (dim_in, dim_out) in enumerate(reversed(in_out)):
+ is_last = ind == (len(in_out) - 1)
+
+ self.ups.append(nn.ModuleList([
+ block_klass(dim_out + dim_in, dim_out, time_emb_dim = time_dim),
+ block_klass(dim_out + dim_in, dim_out, time_emb_dim = time_dim),
+ Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+ Upsample(dim_out, dim_in) if not is_last else nn.Conv2d(dim_out, dim_in, 3, padding = 1)
+ ]))
+
+ default_out_dim = channels * out_mul
+ self.out_dim = default(out_dim, default_out_dim)
+
+ self.final_res_block = block_klass(dim * 2, dim, time_emb_dim = time_dim)
+ self.final_conv = nn.Conv2d(dim, self.out_dim, 1)
+
+ def forward(self, x, time, cond=None, x_self_cond=None): ## cond is always None for unconditional model
+ if self.self_condition:
+ x_self_cond = default(x_self_cond, lambda: torch.zeros_like(x))
+ x = torch.cat((x_self_cond, x), dim = 1)
+
+ x = self.init_conv(x)
+ r = x.clone()
+
+ t = self.time_mlp(time)
+
+ h = []
+
+ for block1, block2, attn, downsample in self.downs:
+ x = block1(x, t)
+ h.append(x)
+
+ x = block2(x, t)
+ x = attn(x)
+ h.append(x)
+
+ x = downsample(x)
+
+ x = self.mid_block1(x, t)
+ x = self.mid_attn(x)
+ x = self.mid_block2(x, t)
+
+ for block1, block2, attn, upsample in self.ups:
+ x = torch.cat((x, h.pop()), dim = 1)
+ x = block1(x, t)
+
+ x = torch.cat((x, h.pop()), dim = 1)
+ x = block2(x, t)
+ x = attn(x)
+
+ x = upsample(x)
+
+ x = torch.cat((x, r), dim = 1)
+
+ x = self.final_res_block(x, t)
+ return self.final_conv(x)
+
+if __name__ == '__main__':
+ model = Unet(96, out_mul=2, dim_mults=[1,2,4,8], heads=8)
+ x = torch.rand(2, 3, 8, 8)
+ time = torch.tensor([2, 5])
+ with torch.no_grad():
+ y = model(x, time)
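+ # y has shape (2, 6, 8, 8): out_dim defaults to channels * out_mul = 3 * 2 = 6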
+ pass
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1c28dd79ede65926a1fc1c1dd91e2d6e6e998bd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/utils.py
@@ -0,0 +1,68 @@
+import os
+from pathlib import Path
+import time
+import logging
+import math
+
+def create_logger(root_dir, des=''):
+ root_output_dir = Path(root_dir)
+ # set up logger
+ if not root_output_dir.exists():
+ print('=> creating {}'.format(root_output_dir))
+ root_output_dir.mkdir(exist_ok=True, parents=True)
+ time_str = time.strftime('%Y-%m-%d-%H-%M')
+ log_file = '{}_{}.log'.format(time_str, des)
+ final_log_file = root_output_dir / log_file
+ head = '%(asctime)-15s %(message)s'
+ logging.basicConfig(filename=str(final_log_file), format=head)
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+ console = logging.StreamHandler()
+ logging.getLogger('').addHandler(console)
+ return logger
+
+def exists(x):
+ return x is not None
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if callable(d) else d
+
+def identity(t, *args, **kwargs):
+ return t
+
+def cycle(dl):
+ while True:
+ for data in dl:
+ yield data
+
+def has_int_squareroot(num):
+ return (math.sqrt(num) ** 2) == num
+
+def num_to_groups(num, divisor):
+ groups = num // divisor
+ remainder = num % divisor
+ arr = [divisor] * groups
+ if remainder > 0:
+ arr.append(remainder)
+ return arr
+
+def convert_image_to_fn(img_type, image):
+ if image.mode != img_type:
+ return image.convert(img_type)
+ return image
+
+# normalization functions
+
+def normalize_to_neg_one_to_one(img):
+ return img * 2 - 1
+
+def unnormalize_to_zero_to_one(t):
+ return (t + 1) * 0.5
+
+def dict2str(dict):
+ s = ''
+ for k, v in dict.items():
+ s += "{}: {:.5f}, ".format(k, v)
+ return s
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/vgg.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..44bb256a66ab26278225774aeebbf25cd65ed068
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/vgg.py
@@ -0,0 +1,517 @@
+from functools import partial
+from typing import Union, List, Dict, Any, Optional, cast
+
+import torch
+import torch.nn as nn
+
+from torchvision.transforms._presets import ImageClassification
+from torchvision.utils import _log_api_usage_once
+from torchvision.models._api import WeightsEnum, Weights
+from torchvision.models._meta import _IMAGENET_CATEGORIES
+from torchvision.models._utils import handle_legacy_interface, _ovewrite_named_param
+
+
+__all__ = [
+ "VGG",
+ "VGG11_Weights",
+ "VGG11_BN_Weights",
+ "VGG13_Weights",
+ "VGG13_BN_Weights",
+ "VGG16_Weights",
+ "VGG16_BN_Weights",
+ "VGG19_Weights",
+ "VGG19_BN_Weights",
+ "vgg11",
+ "vgg11_bn",
+ "vgg13",
+ "vgg13_bn",
+ "vgg16",
+ "vgg16_bn",
+ "vgg19",
+ "vgg19_bn",
+]
+
+
+class VGG(nn.Module):
+ def __init__(
+ self, features: nn.Module, num_classes: int = 1000, init_weights: bool = True, dropout: float = 0.5
+ ) -> None:
+ super().__init__()
+ _log_api_usage_once(self)
+ self.features = features
+ self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
+ self.classifier = nn.Sequential(
+ nn.Linear(512 * 7 * 7, 4096),
+ nn.ReLU(True),
+ nn.Dropout(p=dropout),
+ nn.Linear(4096, 4096),
+ nn.ReLU(True),
+ nn.Dropout(p=dropout),
+ nn.Linear(4096, num_classes),
+ )
+ if init_weights:
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ nn.init.normal_(m.weight, 0, 0.01)
+ nn.init.constant_(m.bias, 0)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ feats = []
+ # x = self.features(x)
+ # x = self.avgpool(x)
+ # x = torch.flatten(x, 1)
+ # x = self.classifier(x)
+ for i, layer in enumerate(self.features):
+ x = layer(x)
+ if i in [9, 16, 23, 30]:
+ feats.append(x)
+ return feats
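+ # Note: unlike torchvision's VGG, this forward skips avgpool/classifier and instead
+ # returns the intermediate feature maps collected after layers 9, 16, 23 and 30
+ # (the outputs of pooling stages 2-5 for the vgg16 configuration), presumably for
+ # use as a perceptual feature extractor.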
+
+
+def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.ModuleList:
+ layers: List[nn.Module] = []
+ in_channels = 3
+ for v in cfg:
+ if v == "M":
+ layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
+ else:
+ v = cast(int, v)
+ conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
+ if batch_norm:
+ layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
+ else:
+ layers += [conv2d, nn.ReLU(inplace=True)]
+ in_channels = v
+ return nn.ModuleList(layers)
+
+
+cfgs: Dict[str, List[Union[str, int]]] = {
+ "A": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
+ "B": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
+ "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"],
+ "E": [64, 64, "M", 128, 128, "M", 256, 256, 256, 256, "M", 512, 512, 512, 512, "M", 512, 512, 512, 512, "M"],
+}
+
+
+def _vgg(cfg: str, batch_norm: bool, weights: Optional[WeightsEnum], progress: bool, **kwargs: Any) -> VGG:
+ if weights is not None:
+ kwargs["init_weights"] = False
+ if weights.meta["categories"] is not None:
+ _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+ model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
+
+ if weights is not None:
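+ # remap the torchvision VGG checkpoint onto this model positionally (key by key, in
+ # order), which relies on make_layers() producing parameters in the torchvision order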
+ ckpt1 = weights.get_state_dict(progress=progress)
+ ckpt2 = model.state_dict()
+ kl1 = list(ckpt1.keys())
+ for i, k in enumerate(list(ckpt2.keys())):
+ ckpt2[k] = ckpt1[kl1[i]]
+ msg = model.load_state_dict(ckpt2, strict=False)
+ print(f'Load VGG: {msg}')
+ else:
+ print('No pretrained weight loaded!')
+ return model
+
+
+_COMMON_META = {
+ "min_size": (32, 32),
+ "categories": _IMAGENET_CATEGORIES,
+ "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#alexnet-and-vgg",
+ "_docs": """These weights were trained from scratch by using a simplified training recipe.""",
+}
+
+
+class VGG11_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg11-8a719046.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 132863336,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 69.020,
+ "acc@5": 88.628,
+ }
+ },
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class VGG11_BN_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg11_bn-6002323d.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 132868840,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 70.370,
+ "acc@5": 89.810,
+ }
+ },
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class VGG13_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg13-19584684.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 133047848,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 69.928,
+ "acc@5": 89.246,
+ }
+ },
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class VGG13_BN_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg13_bn-abd245e5.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 133053736,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 71.586,
+ "acc@5": 90.374,
+ }
+ },
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class VGG16_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg16-397923af.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 138357544,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 71.592,
+ "acc@5": 90.382,
+ }
+ },
+ },
+ )
+ IMAGENET1K_FEATURES = Weights(
+ # Weights ported from https://github.com/amdegroot/ssd.pytorch/
+ url="https://download.pytorch.org/models/vgg16_features-amdegroot-88682ab5.pth",
+ transforms=partial(
+ ImageClassification,
+ crop_size=224,
+ mean=(0.48235, 0.45882, 0.40784),
+ std=(1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0),
+ ),
+ meta={
+ **_COMMON_META,
+ "num_params": 138357544,
+ "categories": None,
+ "recipe": "https://github.com/amdegroot/ssd.pytorch#training-ssd",
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": float("nan"),
+ "acc@5": float("nan"),
+ }
+ },
+ "_docs": """
+ These weights can't be used for classification because they are missing values in the `classifier`
+ module. Only the `features` module has valid values and can be used for feature extraction. The weights
+ were trained using the original input standardization method as described in the paper.
+ """,
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class VGG16_BN_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg16_bn-6c64b313.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 138365992,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 73.360,
+ "acc@5": 91.516,
+ }
+ },
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class VGG19_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg19-dcbb9e9d.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 143667240,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 72.376,
+ "acc@5": 90.876,
+ }
+ },
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+class VGG19_BN_Weights(WeightsEnum):
+ IMAGENET1K_V1 = Weights(
+ url="https://download.pytorch.org/models/vgg19_bn-c79401a0.pth",
+ transforms=partial(ImageClassification, crop_size=224),
+ meta={
+ **_COMMON_META,
+ "num_params": 143678248,
+ "_metrics": {
+ "ImageNet-1K": {
+ "acc@1": 74.218,
+ "acc@5": 91.842,
+ }
+ },
+ },
+ )
+ DEFAULT = IMAGENET1K_V1
+
+
+@handle_legacy_interface(weights=("pretrained", VGG11_Weights.IMAGENET1K_V1))
+def vgg11(*, weights: Optional[VGG11_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-11 from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG11_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG11_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG11_Weights
+ :members:
+ """
+ weights = VGG11_Weights.verify(weights)
+
+ return _vgg("A", False, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", VGG11_BN_Weights.IMAGENET1K_V1))
+def vgg11_bn(*, weights: Optional[VGG11_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-11-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG11_BN_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG11_BN_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG11_BN_Weights
+ :members:
+ """
+ weights = VGG11_BN_Weights.verify(weights)
+
+ return _vgg("A", True, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", VGG13_Weights.IMAGENET1K_V1))
+def vgg13(*, weights: Optional[VGG13_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-13 from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG13_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG13_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG13_Weights
+ :members:
+ """
+ weights = VGG13_Weights.verify(weights)
+
+ return _vgg("B", False, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", VGG13_BN_Weights.IMAGENET1K_V1))
+def vgg13_bn(*, weights: Optional[VGG13_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-13-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG13_BN_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG13_BN_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG13_BN_Weights
+ :members:
+ """
+ weights = VGG13_BN_Weights.verify(weights)
+
+ return _vgg("B", True, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", VGG16_Weights.IMAGENET1K_V1))
+def vgg16(*, weights: Optional[VGG16_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-16 from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG16_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG16_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG16_Weights
+ :members:
+ """
+ weights = VGG16_Weights.verify(weights)
+
+ return _vgg("D", False, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", VGG16_BN_Weights.IMAGENET1K_V1))
+def vgg16_bn(*, weights: Optional[VGG16_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-16-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG16_BN_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG16_BN_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG16_BN_Weights
+ :members:
+ """
+ weights = VGG16_BN_Weights.verify(weights)
+
+ return _vgg("D", True, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", VGG19_Weights.IMAGENET1K_V1))
+def vgg19(*, weights: Optional[VGG19_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-19 from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG19_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG19_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG19_Weights
+ :members:
+ """
+ weights = VGG19_Weights.verify(weights)
+
+ return _vgg("E", False, weights, progress, **kwargs)
+
+
+@handle_legacy_interface(weights=("pretrained", VGG19_BN_Weights.IMAGENET1K_V1))
+def vgg19_bn(*, weights: Optional[VGG19_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG:
+ """VGG-19_BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition <https://arxiv.org/abs/1409.1556>`__.
+
+ Args:
+ weights (:class:`~torchvision.models.VGG19_BN_Weights`, optional): The
+ pretrained weights to use. See
+ :class:`~torchvision.models.VGG19_BN_Weights` below for
+ more details, and possible values. By default, no pre-trained
+ weights are used.
+ progress (bool, optional): If True, displays a progress bar of the
+ download to stderr. Default is True.
+ **kwargs: parameters passed to the ``torchvision.models.vgg.VGG``
+ base class. Please refer to the `source code
+ <https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py>`_
+ for more details about this class.
+
+ .. autoclass:: torchvision.models.VGG19_BN_Weights
+ :members:
+ """
+ weights = VGG19_BN_Weights.verify(weights)
+
+ return _vgg("E", True, weights, progress, **kwargs)
+
+
+# The dictionary below is internal implementation detail and will be removed in v0.15
+from torchvision.models._utils import _ModelURLs
+
+
+model_urls = _ModelURLs(
+ {
+ "vgg11": VGG11_Weights.IMAGENET1K_V1.url,
+ "vgg13": VGG13_Weights.IMAGENET1K_V1.url,
+ "vgg16": VGG16_Weights.IMAGENET1K_V1.url,
+ "vgg19": VGG19_Weights.IMAGENET1K_V1.url,
+ "vgg11_bn": VGG11_BN_Weights.IMAGENET1K_V1.url,
+ "vgg13_bn": VGG13_BN_Weights.IMAGENET1K_V1.url,
+ "vgg16_bn": VGG16_BN_Weights.IMAGENET1K_V1.url,
+ "vgg19_bn": VGG19_BN_Weights.IMAGENET1K_V1.url,
+ }
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/wavelet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/wavelet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f08a3f1392937d0f67a51de4b071dd49cf5310ea
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/wavelet.py
@@ -0,0 +1,83 @@
+import pywt
+import pywt.data
+import torch
+from torch import nn
+from torch.autograd import Function
+import torch.nn.functional as F
+
+
+def create_wavelet_filter(wave, in_size, out_size, type=torch.float):
+ w = pywt.Wavelet(wave)
+ dec_hi = torch.tensor(w.dec_hi[::-1], dtype=type)
+ dec_lo = torch.tensor(w.dec_lo[::-1], dtype=type)
+ dec_filters = torch.stack([dec_lo.unsqueeze(0) * dec_lo.unsqueeze(1),
+ dec_lo.unsqueeze(0) * dec_hi.unsqueeze(1),
+ dec_hi.unsqueeze(0) * dec_lo.unsqueeze(1),
+ dec_hi.unsqueeze(0) * dec_hi.unsqueeze(1)], dim=0)
+
+ dec_filters = dec_filters[:, None].repeat(in_size, 1, 1, 1)
+
+ rec_hi = torch.tensor(w.rec_hi[::-1], dtype=type).flip(dims=[0])
+ rec_lo = torch.tensor(w.rec_lo[::-1], dtype=type).flip(dims=[0])
+ rec_filters = torch.stack([rec_lo.unsqueeze(0) * rec_lo.unsqueeze(1),
+ rec_lo.unsqueeze(0) * rec_hi.unsqueeze(1),
+ rec_hi.unsqueeze(0) * rec_lo.unsqueeze(1),
+ rec_hi.unsqueeze(0) * rec_hi.unsqueeze(1)], dim=0)
+
+ rec_filters = rec_filters[:, None].repeat(out_size, 1, 1, 1)
+
+ return dec_filters, rec_filters
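+
+ # Both filter banks have shape (4 * size, 1, k, k): the LL, LH, HL and HH sub-band
+ # kernels tiled once per channel, intended for the grouped stride-2 convolution in
+ # wt() and the matching conv_transpose2d in iwt() below.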
+
+
+def wt(x, filters, in_size, level):
+ _, _, h, w = x.shape
+ pad = (filters.shape[2] // 2 - 1, filters.shape[3] // 2 - 1)
+ res = F.conv2d(x, filters, stride=2, groups=in_size, padding=pad)
+ if level > 1:
+ res[:, ::4] = wt(res[:, ::4], filters, in_size, level - 1)
+ res = res.reshape(-1, 2, h // 2, w // 2).transpose(1, 2).reshape(-1, in_size, h, w)
+ return res
+
+
+def iwt(x, inv_filters, in_size, level):
+ _, _, h, w = x.shape
+ pad = (inv_filters.shape[2] // 2 - 1, inv_filters.shape[3] // 2 - 1)
+ res = x.reshape(-1, h // 2, 2, w // 2).transpose(1, 2).reshape(-1, 4 * in_size, h // 2, w // 2)
+ if level > 1:
+ res[:, ::4] = iwt(res[:, ::4], inv_filters, in_size, level - 1)
+ res = F.conv_transpose2d(res, inv_filters, stride=2, groups=in_size, padding=pad)
+ return res
+
+
+def get_inverse_transform(weights, in_size, level):
+ class InverseWaveletTransform(Function):
+
+ @staticmethod
+ def forward(ctx, input):
+ with torch.no_grad():
+ x = iwt(input, weights, in_size, level)
+ return x
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ grad = wt(grad_output, weights, in_size, level)
+ return grad, None
+
+ return InverseWaveletTransform().apply
+
+
+def get_transform(weights, in_size, level):
+ class WaveletTransform(Function):
+
+ @staticmethod
+ def forward(ctx, input):
+ with torch.no_grad():
+ x = wt(input, weights, in_size, level)
+ return x
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ grad = iwt(grad_output, weights, in_size, level)
+ return grad, None
+
+ return WaveletTransform().apply
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/wcc.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/wcc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a88019348f62db34e651c03d89c1ba817daef6a6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/denoising_diffusion_pytorch/wcc.py
@@ -0,0 +1,101 @@
+from typing import Union, Tuple
+
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.quantization import weight_quantize_fn, act_quantize_fn
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch import wavelet
+
+
+class WCC(nn.Conv1d):
+ def __init__(self, in_channels: int,
+ out_channels: int,
+ stride: Union[int, Tuple] = 1,
+ padding: Union[int, Tuple] = 0,
+ dilation: Union[int, Tuple] = 1,
+ groups: int = 1,
+ bias: bool = False,
+ levels: int = 3,
+ compress_rate: float = 0.25,
+ bit_w: int = 8,
+ bit_a: int = 8,
+ wt_type: str = "db1"):
+ super(WCC, self).__init__(in_channels, out_channels, 1, stride, padding, dilation, groups, bias)
+ self.layer_type = 'WCC'
+ self.bit_w = bit_w
+ self.bit_a = bit_a
+
+ self.weight_quant = weight_quantize_fn(self.bit_w)
+ self.act_quant = act_quantize_fn(self.bit_a, signed=True)
+
+ self.levels = levels
+ self.wt_type = wt_type
+ self.compress_rate = compress_rate
+
+ dec_filters, rec_filters = wavelet.create_wavelet_filter(wave=self.wt_type,
+ in_size=in_channels,
+ out_size=out_channels)
+ self.wt_filters = nn.Parameter(dec_filters, requires_grad=False)
+ self.iwt_filters = nn.Parameter(rec_filters, requires_grad=False)
+ self.wt = wavelet.get_transform(self.wt_filters, in_channels, levels)
+ self.iwt = wavelet.get_inverse_transform(self.iwt_filters, out_channels, levels)
+
+ self.get_pad = lambda n: ((2 ** levels) - n) % (2 ** levels)
+
+ def forward(self, x):
+ in_shape = x.shape
+ pads = (0, self.get_pad(in_shape[2]), 0, self.get_pad(in_shape[3]))
+ x = F.pad(x, pads) # pad to match 2^(levels)
+
+ weight_q = self.weight_quant(self.weight) # quantize weights
+ x = self.wt(x) # H
+ topk, ids = self.compress(x) # T
+ topk_q = self.act_quant(topk) # quantize activations
+ topk_q = F.conv1d(topk_q, weight_q, self.bias, self.stride, self.padding, self.dilation, self.groups) # K_1x1
+ x = self.decompress(topk_q, ids, x.shape) # T^T
+ x = self.iwt(x) # H^T
+
+ x = x[:, :, :in_shape[2], :in_shape[3]] # remove pads
+ return x
+
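+ # compress() keeps the k = compress_rate * H * W spatial positions with the largest
+ # channel-wise squared L2 norm; decompress() scatters the 1x1-conv output back to
+ # those positions, leaving all discarded positions at zero.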
+ def compress(self, x):
+ b, c, h, w = x.shape
+ acc = x.norm(dim=1).pow(2)
+ acc = acc.view(b, h * w)
+ k = int(h * w * self.compress_rate)
+ ids = acc.topk(k, dim=1, sorted=False)[1]
+ ids.unsqueeze_(dim=1)
+ topk = x.reshape((b, c, h * w)).gather(dim=2, index=ids.repeat(1, c, 1))
+ return topk, ids
+
+ def decompress(self, topk, ids, shape):
+ b, _, h, w = shape
+ ids = ids.repeat(1, self.out_channels, 1)
+ x = torch.zeros(size=(b, self.out_channels, h * w), requires_grad=True, device=topk.device)
+ x = x.scatter(dim=2, index=ids, src=topk)
+ x = x.reshape((b, self.out_channels, h, w))
+ return x
+
+ def change_wt_params(self, compress_rate, levels, wt_type="db1"):
+ self.compress_rate = compress_rate
+ self.levels = levels
+ dec_filters, rec_filters = wavelet.create_wavelet_filter(wave=self.wt_type,
+ in_size=self.in_channels,
+ out_size=self.out_channels)
+ self.wt_filters = nn.Parameter(dec_filters, requires_grad=False)
+ self.iwt_filters = nn.Parameter(rec_filters, requires_grad=False)
+ self.wt = wavelet.get_transform(self.wt_filters, self.in_channels, levels)
+ self.iwt = wavelet.get_inverse_transform(self.iwt_filters, self.out_channels, levels)
+
+ def change_bit(self, bit_w, bit_a):
+ self.bit_w = bit_w
+ self.bit_a = bit_a
+ self.weight_quant.change_bit(bit_w)
+ self.act_quant.change_bit(bit_a)
+
+if __name__ == '__main__':
+ wcc = WCC(80, 80)
+ x = torch.rand(1, 80, 80, 80)
+ y = wcc(x)
+ pause = 0
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9e72ab043daabf0ddf971e5350989f3f582fba0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/model.py
@@ -0,0 +1,197 @@
+import numpy as np
+import yaml
+import argparse
+import math
+import torch
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.utils import *
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.encoder_decoder import AutoencoderKL
+# from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.transmodel import TransModel
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.uncond_unet import Unet
+from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.data import *
+from fvcore.common.config import CfgNode
+from pathlib import Path
+
+def load_conf(config_file, conf=None):
+ # avoid a shared mutable default argument; start from a fresh dict unless one is passed in
+ conf = {} if conf is None else conf
+ with open(config_file) as f:
+ exp_conf = yaml.load(f, Loader=yaml.FullLoader)
+ for k, v in exp_conf.items():
+ conf[k] = v
+ return conf
+
+def prepare_args(ckpt_path, sampling_timesteps=1):
+ return argparse.Namespace(
+ cfg=load_conf(Path(__file__).parent / "default.yaml"),
+ pre_weight=ckpt_path,
+ sampling_timesteps=sampling_timesteps
+ )
+
+class DiffusionEdge:
+ def __init__(self, args) -> None:
+ self.cfg = CfgNode(args.cfg)
+ torch.manual_seed(42)
+ np.random.seed(42)
+ model_cfg = self.cfg.model
+ first_stage_cfg = model_cfg.first_stage
+ first_stage_model = AutoencoderKL(
+ ddconfig=first_stage_cfg.ddconfig,
+ lossconfig=first_stage_cfg.lossconfig,
+ embed_dim=first_stage_cfg.embed_dim,
+ ckpt_path=first_stage_cfg.ckpt_path,
+ )
+ if model_cfg.model_name == 'cond_unet':
+ from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.mask_cond_unet import Unet
+ unet_cfg = model_cfg.unet
+ unet = Unet(dim=unet_cfg.dim,
+ channels=unet_cfg.channels,
+ dim_mults=unet_cfg.dim_mults,
+ learned_variance=unet_cfg.get('learned_variance', False),
+ out_mul=unet_cfg.out_mul,
+ cond_in_dim=unet_cfg.cond_in_dim,
+ cond_dim=unet_cfg.cond_dim,
+ cond_dim_mults=unet_cfg.cond_dim_mults,
+ window_sizes1=unet_cfg.window_sizes1,
+ window_sizes2=unet_cfg.window_sizes2,
+ fourier_scale=unet_cfg.fourier_scale,
+ cfg=unet_cfg,
+ )
+ else:
+ raise NotImplementedError
+ if model_cfg.model_type == 'const_sde':
+ from custom_controlnet_aux.diffusion_edge.denoising_diffusion_pytorch.ddm_const_sde import LatentDiffusion
+ else:
+ raise NotImplementedError(f'{model_cfg.model_type} is not supported!')
+
+ self.model = LatentDiffusion(
+ model=unet,
+ auto_encoder=first_stage_model,
+ train_sample=model_cfg.train_sample,
+ image_size=model_cfg.image_size,
+ timesteps=model_cfg.timesteps,
+ sampling_timesteps=args.sampling_timesteps,
+ loss_type=model_cfg.loss_type,
+ objective=model_cfg.objective,
+ scale_factor=model_cfg.scale_factor,
+ scale_by_std=model_cfg.scale_by_std,
+ scale_by_softsign=model_cfg.scale_by_softsign,
+ default_scale=model_cfg.get('default_scale', False),
+ input_keys=model_cfg.input_keys,
+ ckpt_path=model_cfg.ckpt_path,
+ ignore_keys=model_cfg.ignore_keys,
+ only_model=model_cfg.only_model,
+ start_dist=model_cfg.start_dist,
+ perceptual_weight=model_cfg.perceptual_weight,
+ use_l1=model_cfg.get('use_l1', True),
+ cfg=model_cfg,
+ )
+ self.cfg.sampler.ckpt_path = args.pre_weight
+
+ data = torch.load(self.cfg.sampler.ckpt_path, map_location="cpu")
+ if self.cfg.sampler.use_ema:
+ sd = data['ema']
+ new_sd = {}
+ for k in sd.keys():
+ if k.startswith("ema_model."):
+ new_k = k[10:] # remove ema_model.
+ new_sd[new_k] = sd[k]
+ sd = new_sd
+ self.model.load_state_dict(sd)
+ else:
+ self.model.load_state_dict(data['model'])
+ if 'scale_factor' in data['model']:
+ self.model.scale_factor = data['model']['scale_factor']
+
+ self.model.eval()
+ self.device = "cpu"
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, image, batch_size=8):
+ image = normalize_to_neg_one_to_one(image).to(self.device)
+ mask = None
+ if self.cfg.sampler.sample_type == 'whole':
+ return self.whole_sample(image, raw_size=image.shape[2:], mask=mask)
+ elif self.cfg.sampler.sample_type == 'slide':
+ return self.slide_sample(image, crop_size=self.cfg.sampler.get('crop_size', [320, 320]),
+ stride=self.cfg.sampler.stride, mask=mask, bs=batch_size)
+
+ def whole_sample(self, inputs, raw_size, mask=None):
+ inputs = F.interpolate(inputs, size=(416, 416), mode='bilinear', align_corners=True)
+ seg_logits = self.model.sample(batch_size=inputs.shape[0], cond=inputs, mask=mask)
+ seg_logits = F.interpolate(seg_logits, size=raw_size, mode='bilinear', align_corners=True)
+ return seg_logits
+
+ def slide_sample(self, inputs, crop_size, stride, mask=None, bs=8):
+ """Inference by sliding-window with overlap.
+
+ If h_crop > h_img or w_crop > w_img, the small patch will be used to
+ decode without padding.
+
+ Args:
+ inputs (tensor): the tensor should have a shape NxCxHxW,
+ which contains all images in the batch.
+ crop_size (tuple): (h_crop, w_crop) size of each sliding window.
+ stride (tuple): (h_stride, w_stride) step between neighbouring windows.
+ mask (tensor, optional): optional conditioning mask forwarded to the sampler.
+ bs (int): number of crops fed to the diffusion sampler per forward pass.
+
+ Returns:
+ Tensor: The segmentation results, seg_logits from model of each
+ input image.
+ """
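+ # e.g. a 512x512 input with crop_size=(320, 320) and stride=(240, 240) produces a
+ # 2x2 grid of overlapping crops; overlapping logits are summed into `preds` and then
+ # averaged through `count_mat`.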
+
+ h_stride, w_stride = stride
+ h_crop, w_crop = crop_size
+ batch_size, _, h_img, w_img = inputs.size()
+ out_channels = 1
+ h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+ w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+ preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img))
+ # aux_out1 = inputs.new_zeros((batch_size, out_channels, h_img, w_img))
+ # aux_out2 = inputs.new_zeros((batch_size, out_channels, h_img, w_img))
+ count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img))
+ crop_imgs = []
+ x1s = []
+ x2s = []
+ y1s = []
+ y2s = []
+ for h_idx in range(h_grids):
+ for w_idx in range(w_grids):
+ y1 = h_idx * h_stride
+ x1 = w_idx * w_stride
+ y2 = min(y1 + h_crop, h_img)
+ x2 = min(x1 + w_crop, w_img)
+ y1 = max(y2 - h_crop, 0)
+ x1 = max(x2 - w_crop, 0)
+ crop_img = inputs[:, :, y1:y2, x1:x2]
+ crop_imgs.append(crop_img)
+ x1s.append(x1)
+ x2s.append(x2)
+ y1s.append(y1)
+ y2s.append(y2)
+ crop_imgs = torch.cat(crop_imgs, dim=0)
+ crop_seg_logits_list = []
+ num_windows = crop_imgs.shape[0]
+ length = math.ceil(num_windows / bs)
+ for i in range(length):
+ if i == length - 1:
+ crop_imgs_temp = crop_imgs[bs * i:num_windows, ...]
+ else:
+ crop_imgs_temp = crop_imgs[bs * i:bs * (i + 1), ...]
+
+ crop_seg_logits = self.model.sample(batch_size=crop_imgs_temp.shape[0], cond=crop_imgs_temp, mask=mask)
+ crop_seg_logits_list.append(crop_seg_logits)
+ crop_seg_logits = torch.cat(crop_seg_logits_list, dim=0)
+ for crop_seg_logit, x1, x2, y1, y2 in zip(crop_seg_logits, x1s, x2s, y1s, y2s):
+ preds += F.pad(crop_seg_logit,
+ (int(x1), int(preds.shape[3] - x2), int(y1),
+ int(preds.shape[2] - y2)))
+ count_mat[:, :, y1:y2, x1:x2] += 1
+
+ assert (count_mat == 0).sum() == 0
+ seg_logits = preds / count_mat
+ return seg_logits
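+
+# Illustrative usage sketch (not part of the upstream file). "diffusion_edge.pt" is a
+# placeholder checkpoint path, not a file shipped with this package; the detector
+# expects a float image tensor in [0, 1] of shape (N, 3, H, W) and returns edge
+# logits with the same spatial size.
+#
+# args = prepare_args("diffusion_edge.pt", sampling_timesteps=1)
+# detector = DiffusionEdge(args).to("cuda" if torch.cuda.is_available() else "cpu")
+# image = torch.rand(1, 3, 480, 640)
+# with torch.no_grad():
+#     edges = detector(image, batch_size=8)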
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/requirement.txt b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/requirement.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4896418c16e8daee90887e79b7926ca7d0cd3f63
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/requirement.txt
@@ -0,0 +1,9 @@
+#torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
+einops
+scikit-learn
+scipy
+tensorboard
+fvcore
+albumentations
+omegaconf
+numpy==1.23.5
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/ade20k.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/ade20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..397c1ce0b4cc6a927e369e272191f06deb550639
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/ade20k.py
@@ -0,0 +1,124 @@
+import os
+import numpy as np
+import cv2
+import custom_albumentations as albumentations
+from PIL import Image
+from torch.utils.data import Dataset
+
+from custom_controlnet_aux.diffusion_edge.taming.data.sflckr import SegmentationBase # for examples included in repo
+
+
+class Examples(SegmentationBase):
+ def __init__(self, size=256, random_crop=False, interpolation="bicubic"):
+ super().__init__(data_csv="data/ade20k_examples.txt",
+ data_root="data/ade20k_images",
+ segmentation_root="data/ade20k_segmentations",
+ size=size, random_crop=random_crop,
+ interpolation=interpolation,
+ n_labels=151, shift_segmentation=False)
+
+
+# With semantic map and scene label
+class ADE20kBase(Dataset):
+ def __init__(self, config=None, size=None, random_crop=False, interpolation="bicubic", crop_size=None):
+ self.split = self.get_split()
+ self.n_labels = 151 # unknown + 150
+ self.data_csv = {"train": "data/ade20k_train.txt",
+ "validation": "data/ade20k_test.txt"}[self.split]
+ self.data_root = "data/ade20k_root"
+ with open(os.path.join(self.data_root, "sceneCategories.txt"), "r") as f:
+ self.scene_categories = f.read().splitlines()
+ self.scene_categories = dict(line.split() for line in self.scene_categories)
+ with open(self.data_csv, "r") as f:
+ self.image_paths = f.read().splitlines()
+ self._length = len(self.image_paths)
+ self.labels = {
+ "relative_file_path_": [l for l in self.image_paths],
+ "file_path_": [os.path.join(self.data_root, "images", l)
+ for l in self.image_paths],
+ "relative_segmentation_path_": [l.replace(".jpg", ".png")
+ for l in self.image_paths],
+ "segmentation_path_": [os.path.join(self.data_root, "annotations",
+ l.replace(".jpg", ".png"))
+ for l in self.image_paths],
+ "scene_category": [self.scene_categories[l.split("/")[1].replace(".jpg", "")]
+ for l in self.image_paths],
+ }
+
+ size = None if size is not None and size<=0 else size
+ self.size = size
+ if crop_size is None:
+ self.crop_size = size if size is not None else None
+ else:
+ self.crop_size = crop_size
+ if self.size is not None:
+ self.interpolation = interpolation
+ self.interpolation = {
+ "nearest": cv2.INTER_NEAREST,
+ "bilinear": cv2.INTER_LINEAR,
+ "bicubic": cv2.INTER_CUBIC,
+ "area": cv2.INTER_AREA,
+ "lanczos": cv2.INTER_LANCZOS4}[self.interpolation]
+ self.image_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
+ interpolation=self.interpolation)
+ self.segmentation_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
+ interpolation=cv2.INTER_NEAREST)
+
+ if crop_size is not None:
+ self.center_crop = not random_crop
+ if self.center_crop:
+ self.cropper = albumentations.CenterCrop(height=self.crop_size, width=self.crop_size)
+ else:
+ self.cropper = albumentations.RandomCrop(height=self.crop_size, width=self.crop_size)
+ self.preprocessor = self.cropper
+
+ def __len__(self):
+ return self._length
+
+ def __getitem__(self, i):
+ example = dict((k, self.labels[k][i]) for k in self.labels)
+ image = Image.open(example["file_path_"])
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+ image = np.array(image).astype(np.uint8)
+ if self.size is not None:
+ image = self.image_rescaler(image=image)["image"]
+ segmentation = Image.open(example["segmentation_path_"])
+ segmentation = np.array(segmentation).astype(np.uint8)
+ if self.size is not None:
+ segmentation = self.segmentation_rescaler(image=segmentation)["image"]
+ if self.size is not None:
+ processed = self.preprocessor(image=image, mask=segmentation)
+ else:
+ processed = {"image": image, "mask": segmentation}
+ example["image"] = (processed["image"]/127.5 - 1.0).astype(np.float32)
+ segmentation = processed["mask"]
+ onehot = np.eye(self.n_labels)[segmentation]
+ example["segmentation"] = onehot
+ return example
+
+
+class ADE20kTrain(ADE20kBase):
+ # default to random_crop=True
+ def __init__(self, config=None, size=None, random_crop=True, interpolation="bicubic", crop_size=None):
+ super().__init__(config=config, size=size, random_crop=random_crop,
+ interpolation=interpolation, crop_size=crop_size)
+
+ def get_split(self):
+ return "train"
+
+
+class ADE20kValidation(ADE20kBase):
+ def get_split(self):
+ return "validation"
+
+
+if __name__ == "__main__":
+ dset = ADE20kValidation()
+ ex = dset[0]
+ for k in ["image", "scene_category", "segmentation"]:
+ print(type(ex[k]))
+ try:
+ print(ex[k].shape)
+ except:
+ print(ex[k])
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_coco.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5060ce11676e77457b63a47605434c43b290b717
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_coco.py
@@ -0,0 +1,139 @@
+import json
+from itertools import chain
+from pathlib import Path
+from typing import Iterable, Dict, List, Callable, Any
+from collections import defaultdict
+
+from tqdm import tqdm
+
+from custom_controlnet_aux.diffusion_edge.taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import Annotation, ImageDescription, Category
+
+COCO_PATH_STRUCTURE = {
+ 'train': {
+ 'top_level': '',
+ 'instances_annotations': 'annotations/instances_train2017.json',
+ 'stuff_annotations': 'annotations/stuff_train2017.json',
+ 'files': 'train2017'
+ },
+ 'validation': {
+ 'top_level': '',
+ 'instances_annotations': 'annotations/instances_val2017.json',
+ 'stuff_annotations': 'annotations/stuff_val2017.json',
+ 'files': 'val2017'
+ }
+}
+
+
+def load_image_descriptions(description_json: List[Dict]) -> Dict[str, ImageDescription]:
+ return {
+ str(img['id']): ImageDescription(
+ id=img['id'],
+ license=img.get('license'),
+ file_name=img['file_name'],
+ coco_url=img['coco_url'],
+ original_size=(img['width'], img['height']),
+ date_captured=img.get('date_captured'),
+ flickr_url=img.get('flickr_url')
+ )
+ for img in description_json
+ }
+
+
+def load_categories(category_json: Iterable) -> Dict[str, Category]:
+ return {str(cat['id']): Category(id=str(cat['id']), super_category=cat['supercategory'], name=cat['name'])
+ for cat in category_json if cat['name'] != 'other'}
+
+
+def load_annotations(annotations_json: List[Dict], image_descriptions: Dict[str, ImageDescription],
+ category_no_for_id: Callable[[str], int], split: str) -> Dict[str, List[Annotation]]:
+ annotations = defaultdict(list)
+ total = sum(len(a) for a in annotations_json)
+ for ann in tqdm(chain(*annotations_json), f'Loading {split} annotations', total=total):
+ image_id = str(ann['image_id'])
+ if image_id not in image_descriptions:
+ raise ValueError(f'image_id [{image_id}] has no image description.')
+ category_id = ann['category_id']
+ try:
+ category_no = category_no_for_id(str(category_id))
+ except KeyError:
+ continue
+
+ width, height = image_descriptions[image_id].original_size
+ bbox = (ann['bbox'][0] / width, ann['bbox'][1] / height, ann['bbox'][2] / width, ann['bbox'][3] / height)
+
+ annotations[image_id].append(
+ Annotation(
+ id=ann['id'],
+ area=bbox[2]*bbox[3], # use bbox area
+ is_group_of=ann['iscrowd'],
+ image_id=ann['image_id'],
+ bbox=bbox,
+ category_id=str(category_id),
+ category_no=category_no
+ )
+ )
+ return dict(annotations)
+
+
+class AnnotatedObjectsCoco(AnnotatedObjectsDataset):
+ def __init__(self, use_things: bool = True, use_stuff: bool = True, **kwargs):
+ """
+ @param data_path: is the path to the following folder structure:
+ coco/
+ ├── annotations
+ │ ├── instances_train2017.json
+ │ ├── instances_val2017.json
+ │ ├── stuff_train2017.json
+ │ └── stuff_val2017.json
+ ├── train2017
+ │ ├── 000000000009.jpg
+ │ ├── 000000000025.jpg
+ │ └── ...
+ ├── val2017
+ │ ├── 000000000139.jpg
+ │ ├── 000000000285.jpg
+ │ └── ...
+ @param split: one of 'train' or 'validation'
+ @param target_image_size: desired image size (give square images)
+ """
+ super().__init__(**kwargs)
+ self.use_things = use_things
+ self.use_stuff = use_stuff
+
+ with open(self.paths['instances_annotations']) as f:
+ inst_data_json = json.load(f)
+ with open(self.paths['stuff_annotations']) as f:
+ stuff_data_json = json.load(f)
+
+ category_jsons = []
+ annotation_jsons = []
+ if self.use_things:
+ category_jsons.append(inst_data_json['categories'])
+ annotation_jsons.append(inst_data_json['annotations'])
+ if self.use_stuff:
+ category_jsons.append(stuff_data_json['categories'])
+ annotation_jsons.append(stuff_data_json['annotations'])
+
+ self.categories = load_categories(chain(*category_jsons))
+ self.filter_categories()
+ self.setup_category_id_and_number()
+
+ self.image_descriptions = load_image_descriptions(inst_data_json['images'])
+ annotations = load_annotations(annotation_jsons, self.image_descriptions, self.get_category_number, self.split)
+ self.annotations = self.filter_object_number(annotations, self.min_object_area,
+ self.min_objects_per_image, self.max_objects_per_image)
+ self.image_ids = list(self.annotations.keys())
+ self.clean_up_annotations_and_image_descriptions()
+
+ def get_path_structure(self) -> Dict[str, str]:
+ if self.split not in COCO_PATH_STRUCTURE:
+ raise ValueError(f'Split [{self.split}] does not exist for COCO data.')
+ return COCO_PATH_STRUCTURE[self.split]
+
+ def get_image_path(self, image_id: str) -> Path:
+ return self.paths['files'].joinpath(self.image_descriptions[str(image_id)].file_name)
+
+ def get_image_description(self, image_id: str) -> Dict[str, Any]:
+ # noinspection PyProtectedMember
+ return self.image_descriptions[image_id]._asdict()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_dataset.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..606ffd689d8e151824e5365565df920c06e81744
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_dataset.py
@@ -0,0 +1,218 @@
+from pathlib import Path
+from typing import Optional, List, Callable, Dict, Any, Union
+import warnings
+
+import PIL.Image as pil_image
+from torch import Tensor
+from torch.utils.data import Dataset
+from torchvision import transforms
+
+from custom_controlnet_aux.diffusion_edge.taming.data.conditional_builder.objects_bbox import ObjectsBoundingBoxConditionalBuilder
+from custom_controlnet_aux.diffusion_edge.taming.data.conditional_builder.objects_center_points import ObjectsCenterPointsConditionalBuilder
+from custom_controlnet_aux.diffusion_edge.taming.data.conditional_builder.utils import load_object_from_string
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import BoundingBox, CropMethodType, Image, Annotation, SplitType
+from custom_controlnet_aux.diffusion_edge.taming.data.image_transforms import CenterCropReturnCoordinates, RandomCrop1dReturnCoordinates, \
+ Random2dCropReturnCoordinates, RandomHorizontalFlipReturn, convert_pil_to_tensor
+
+
+class AnnotatedObjectsDataset(Dataset):
+ def __init__(self, data_path: Union[str, Path], split: SplitType, keys: List[str], target_image_size: int,
+ min_object_area: float, min_objects_per_image: int, max_objects_per_image: int,
+ crop_method: CropMethodType, random_flip: bool, no_tokens: int, use_group_parameter: bool,
+ encode_crop: bool, category_allow_list_target: str = "", category_mapping_target: str = "",
+ no_object_classes: Optional[int] = None):
+ self.data_path = data_path
+ self.split = split
+ self.keys = keys
+ self.target_image_size = target_image_size
+ self.min_object_area = min_object_area
+ self.min_objects_per_image = min_objects_per_image
+ self.max_objects_per_image = max_objects_per_image
+ self.crop_method = crop_method
+ self.random_flip = random_flip
+ self.no_tokens = no_tokens
+ self.use_group_parameter = use_group_parameter
+ self.encode_crop = encode_crop
+
+ self.annotations = None
+ self.image_descriptions = None
+ self.categories = None
+ self.category_ids = None
+ self.category_number = None
+ self.image_ids = None
+ self.transform_functions: List[Callable] = self.setup_transform(target_image_size, crop_method, random_flip)
+ self.paths = self.build_paths(self.data_path)
+ self._conditional_builders = None
+ self.category_allow_list = None
+ if category_allow_list_target:
+ allow_list = load_object_from_string(category_allow_list_target)
+ self.category_allow_list = {name for name, _ in allow_list}
+ self.category_mapping = {}
+ if category_mapping_target:
+ self.category_mapping = load_object_from_string(category_mapping_target)
+ self.no_object_classes = no_object_classes
+
+ def build_paths(self, top_level: Union[str, Path]) -> Dict[str, Path]:
+ top_level = Path(top_level)
+ sub_paths = {name: top_level.joinpath(sub_path) for name, sub_path in self.get_path_structure().items()}
+ for path in sub_paths.values():
+ if not path.exists():
+ raise FileNotFoundError(f'{type(self).__name__} data structure error: [{path}] does not exist.')
+ return sub_paths
+
+ @staticmethod
+ def load_image_from_disk(path: Path) -> Image:
+ return pil_image.open(path).convert('RGB')
+
+ @staticmethod
+ def setup_transform(target_image_size: int, crop_method: CropMethodType, random_flip: bool):
+ transform_functions = []
+ if crop_method == 'none':
+ transform_functions.append(transforms.Resize((target_image_size, target_image_size)))
+ elif crop_method == 'center':
+ transform_functions.extend([
+ transforms.Resize(target_image_size),
+ CenterCropReturnCoordinates(target_image_size)
+ ])
+ elif crop_method == 'random-1d':
+ transform_functions.extend([
+ transforms.Resize(target_image_size),
+ RandomCrop1dReturnCoordinates(target_image_size)
+ ])
+ elif crop_method == 'random-2d':
+ transform_functions.extend([
+ Random2dCropReturnCoordinates(target_image_size),
+ transforms.Resize(target_image_size)
+ ])
+ elif crop_method is None:
+ return None
+ else:
+ raise ValueError(f'Received invalid crop method [{crop_method}].')
+ if random_flip:
+ transform_functions.append(RandomHorizontalFlipReturn())
+ transform_functions.append(transforms.Lambda(lambda x: x / 127.5 - 1.))
+ return transform_functions
+
+ def image_transform(self, x: Tensor) -> (Optional[BoundingBox], Optional[bool], Tensor):
+ crop_bbox = None
+ flipped = None
+ for t in self.transform_functions:
+ if isinstance(t, (RandomCrop1dReturnCoordinates, CenterCropReturnCoordinates, Random2dCropReturnCoordinates)):
+ crop_bbox, x = t(x)
+ elif isinstance(t, RandomHorizontalFlipReturn):
+ flipped, x = t(x)
+ else:
+ x = t(x)
+ return crop_bbox, flipped, x
+
+ @property
+ def no_classes(self) -> int:
+ return self.no_object_classes if self.no_object_classes else len(self.categories)
+
+ @property
+ def conditional_builders(self) -> ObjectsCenterPointsConditionalBuilder:
+ # cannot set this up in init because no_classes is only known after loading data in init of superclass
+ if self._conditional_builders is None:
+ self._conditional_builders = {
+ 'objects_center_points': ObjectsCenterPointsConditionalBuilder(
+ self.no_classes,
+ self.max_objects_per_image,
+ self.no_tokens,
+ self.encode_crop,
+ self.use_group_parameter,
+ getattr(self, 'use_additional_parameters', False)
+ ),
+ 'objects_bbox': ObjectsBoundingBoxConditionalBuilder(
+ self.no_classes,
+ self.max_objects_per_image,
+ self.no_tokens,
+ self.encode_crop,
+ self.use_group_parameter,
+ getattr(self, 'use_additional_parameters', False)
+ )
+ }
+ return self._conditional_builders
+
+ def filter_categories(self) -> None:
+ if self.category_allow_list:
+ self.categories = {id_: cat for id_, cat in self.categories.items() if cat.name in self.category_allow_list}
+ if self.category_mapping:
+ self.categories = {id_: cat for id_, cat in self.categories.items() if cat.id not in self.category_mapping}
+
+ def setup_category_id_and_number(self) -> None:
+ self.category_ids = list(self.categories.keys())
+ self.category_ids.sort()
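+ # if present, move this category id to the end of the sorted list so it is always
+ # assigned the highest category number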
+ if '/m/01s55n' in self.category_ids:
+ self.category_ids.remove('/m/01s55n')
+ self.category_ids.append('/m/01s55n')
+ self.category_number = {category_id: i for i, category_id in enumerate(self.category_ids)}
+ if self.category_allow_list is not None and not self.category_mapping \
+ and len(self.category_ids) != len(self.category_allow_list):
+ warnings.warn('Unexpected number of categories: Mismatch with category_allow_list. '
+ 'Make sure all names in category_allow_list exist.')
+
+ def clean_up_annotations_and_image_descriptions(self) -> None:
+ image_id_set = set(self.image_ids)
+ self.annotations = {k: v for k, v in self.annotations.items() if k in image_id_set}
+ self.image_descriptions = {k: v for k, v in self.image_descriptions.items() if k in image_id_set}
+
+ @staticmethod
+ def filter_object_number(all_annotations: Dict[str, List[Annotation]], min_object_area: float,
+ min_objects_per_image: int, max_objects_per_image: int) -> Dict[str, List[Annotation]]:
+ filtered = {}
+ for image_id, annotations in all_annotations.items():
+ annotations_with_min_area = [a for a in annotations if a.area > min_object_area]
+ if min_objects_per_image <= len(annotations_with_min_area) <= max_objects_per_image:
+ filtered[image_id] = annotations_with_min_area
+ return filtered
+
+ def __len__(self):
+ return len(self.image_ids)
+
+ def __getitem__(self, n: int) -> Dict[str, Any]:
+ image_id = self.get_image_id(n)
+ sample = self.get_image_description(image_id)
+ sample['annotations'] = self.get_annotation(image_id)
+
+ if 'image' in self.keys:
+ sample['image_path'] = str(self.get_image_path(image_id))
+ sample['image'] = self.load_image_from_disk(sample['image_path'])
+ sample['image'] = convert_pil_to_tensor(sample['image'])
+ sample['crop_bbox'], sample['flipped'], sample['image'] = self.image_transform(sample['image'])
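+ # channels-first (C, H, W) to channels-last (H, W, C)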
+ sample['image'] = sample['image'].permute(1, 2, 0)
+
+ for conditional, builder in self.conditional_builders.items():
+ if conditional in self.keys:
+ sample[conditional] = builder.build(sample['annotations'], sample['crop_bbox'], sample['flipped'])
+
+ if self.keys:
+ # only return specified keys
+ sample = {key: sample[key] for key in self.keys}
+ return sample
+
+ def get_image_id(self, no: int) -> str:
+ return self.image_ids[no]
+
+ def get_annotation(self, image_id: str) -> List[Annotation]:
+ return self.annotations[image_id]
+
+ def get_textual_label_for_category_id(self, category_id: str) -> str:
+ return self.categories[category_id].name
+
+ def get_textual_label_for_category_no(self, category_no: int) -> str:
+ return self.categories[self.get_category_id(category_no)].name
+
+ def get_category_number(self, category_id: str) -> int:
+ return self.category_number[category_id]
+
+ def get_category_id(self, category_no: int) -> str:
+ return self.category_ids[category_no]
+
+ def get_image_description(self, image_id: str) -> Dict[str, Any]:
+ raise NotImplementedError()
+
+ def get_path_structure(self):
+ raise NotImplementedError
+
+ def get_image_path(self, image_id: str) -> Path:
+ raise NotImplementedError
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_open_images.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_open_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..896c35abc3de4ebf0a8502d599f694a1a5a4292d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/annotated_objects_open_images.py
@@ -0,0 +1,137 @@
+from collections import defaultdict
+from csv import DictReader, reader as TupleReader
+from pathlib import Path
+from typing import Dict, List, Any
+import warnings
+
+from custom_controlnet_aux.diffusion_edge.taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import Annotation, Category
+from tqdm import tqdm
+
+OPEN_IMAGES_STRUCTURE = {
+ 'train': {
+ 'top_level': '',
+ 'class_descriptions': 'class-descriptions-boxable.csv',
+ 'annotations': 'oidv6-train-annotations-bbox.csv',
+ 'file_list': 'train-images-boxable.csv',
+ 'files': 'train'
+ },
+ 'validation': {
+ 'top_level': '',
+ 'class_descriptions': 'class-descriptions-boxable.csv',
+ 'annotations': 'validation-annotations-bbox.csv',
+ 'file_list': 'validation-images.csv',
+ 'files': 'validation'
+ },
+ 'test': {
+ 'top_level': '',
+ 'class_descriptions': 'class-descriptions-boxable.csv',
+ 'annotations': 'test-annotations-bbox.csv',
+ 'file_list': 'test-images.csv',
+ 'files': 'test'
+ }
+}
+
+
+def load_annotations(descriptor_path: Path, min_object_area: float, category_mapping: Dict[str, str],
+ category_no_for_id: Dict[str, int]) -> Dict[str, List[Annotation]]:
+ annotations: Dict[str, List[Annotation]] = defaultdict(list)
+ with open(descriptor_path) as file:
+ reader = DictReader(file)
+ for i, row in tqdm(enumerate(reader), total=14620000, desc='Loading OpenImages annotations'):
+ width = float(row['XMax']) - float(row['XMin'])
+ height = float(row['YMax']) - float(row['YMin'])
+ area = width * height
+ category_id = row['LabelName']
+ if category_id in category_mapping:
+ category_id = category_mapping[category_id]
+ if area >= min_object_area and category_id in category_no_for_id:
+ annotations[row['ImageID']].append(
+ Annotation(
+ id=i,
+ image_id=row['ImageID'],
+ source=row['Source'],
+ category_id=category_id,
+ category_no=category_no_for_id[category_id],
+ confidence=float(row['Confidence']),
+ bbox=(float(row['XMin']), float(row['YMin']), width, height),
+ area=area,
+ is_occluded=bool(int(row['IsOccluded'])),
+ is_truncated=bool(int(row['IsTruncated'])),
+ is_group_of=bool(int(row['IsGroupOf'])),
+ is_depiction=bool(int(row['IsDepiction'])),
+ is_inside=bool(int(row['IsInside']))
+ )
+ )
+ if 'train' in str(descriptor_path) and i < 14000000:
+ warnings.warn(f'Running with subset of Open Images. Train dataset has length [{len(annotations)}].')
+ return dict(annotations)
+
+
+def load_image_ids(csv_path: Path) -> List[str]:
+ with open(csv_path) as file:
+ reader = DictReader(file)
+ return [row['image_name'] for row in reader]
+
+
+def load_categories(csv_path: Path) -> Dict[str, Category]:
+ with open(csv_path) as file:
+ reader = TupleReader(file)
+ return {row[0]: Category(id=row[0], name=row[1], super_category=None) for row in reader}
+
+
+class AnnotatedObjectsOpenImages(AnnotatedObjectsDataset):
+ def __init__(self, use_additional_parameters: bool, **kwargs):
+ """
+ @param data_path: is the path to the following folder structure:
+ open_images/
+ ├── class-descriptions-boxable.csv
+ ├── oidv6-train-annotations-bbox.csv
+ ├── test
+ │ ├── 000026e7ee790996.jpg
+ │ ├── 000062a39995e348.jpg
+ │ └── ...
+ ├── test-annotations-bbox.csv
+ ├── test-images.csv
+ ├── train
+ │ ├── 000002b66c9c498e.jpg
+ │ ├── 000002b97e5471a0.jpg
+ │ └── ...
+ ├── train-images-boxable.csv
+ ├── validation
+ │ ├── 0001eeaf4aed83f9.jpg
+ │ ├── 0004886b7d043cfd.jpg
+ │ └── ...
+ ├── validation-annotations-bbox.csv
+ └── validation-images.csv
+ @param split: one of 'train', 'validation' or 'test'
+ @param target_image_size: desired image size (returns square images)
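+
+ Example call (illustrative placeholder values, not a prescribed configuration):
+ dataset = AnnotatedObjectsOpenImages(
+ use_additional_parameters=True, data_path='data/open_images', split='validation',
+ keys=['image', 'objects_center_points'], target_image_size=256,
+ min_object_area=0.0001, min_objects_per_image=2, max_objects_per_image=30,
+ crop_method='center', random_flip=False, no_tokens=1024,
+ use_group_parameter=True, encode_crop=True)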
+ """
+
+ super().__init__(**kwargs)
+ self.use_additional_parameters = use_additional_parameters
+
+ self.categories = load_categories(self.paths['class_descriptions'])
+ self.filter_categories()
+ self.setup_category_id_and_number()
+
+ self.image_descriptions = {}
+ annotations = load_annotations(self.paths['annotations'], self.min_object_area, self.category_mapping,
+ self.category_number)
+ self.annotations = self.filter_object_number(annotations, self.min_object_area, self.min_objects_per_image,
+ self.max_objects_per_image)
+ self.image_ids = list(self.annotations.keys())
+ self.clean_up_annotations_and_image_descriptions()
+
+ def get_path_structure(self) -> Dict[str, str]:
+ if self.split not in OPEN_IMAGES_STRUCTURE:
+ raise ValueError(f'Split [{self.split}] does not exist for Open Images data.')
+ return OPEN_IMAGES_STRUCTURE[self.split]
+
+ def get_image_path(self, image_id: str) -> Path:
+ return self.paths['files'].joinpath(f'{image_id:0>16}.jpg')
+
+ def get_image_description(self, image_id: str) -> Dict[str, Any]:
+ image_path = self.get_image_path(image_id)
+ return {'file_path': str(image_path), 'file_name': image_path.name}
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/base.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b8bd0b3371f90bf4367b8781ae58f75526ecadc
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/base.py
@@ -0,0 +1,70 @@
+import bisect
+import numpy as np
+import custom_albumentations as albumentations
+from PIL import Image
+from torch.utils.data import Dataset, ConcatDataset
+
+
+class ConcatDatasetWithIndex(ConcatDataset):
+ """Modified from original pytorch code to return dataset idx"""
+ def __getitem__(self, idx):
+ if idx < 0:
+ if -idx > len(self):
+ raise ValueError("absolute value of index should not exceed dataset length")
+ idx = len(self) + idx
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+ if dataset_idx == 0:
+ sample_idx = idx
+ else:
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+ return self.datasets[dataset_idx][sample_idx], dataset_idx
+
+
+class ImagePaths(Dataset):
+ def __init__(self, paths, size=None, random_crop=False, labels=None):
+ self.size = size
+ self.random_crop = random_crop
+
+ self.labels = dict() if labels is None else labels
+ self.labels["file_path_"] = paths
+ self._length = len(paths)
+
+ if self.size is not None and self.size > 0:
+ self.rescaler = albumentations.SmallestMaxSize(max_size = self.size)
+ if not self.random_crop:
+ self.cropper = albumentations.CenterCrop(height=self.size,width=self.size)
+ else:
+ self.cropper = albumentations.RandomCrop(height=self.size,width=self.size)
+ self.preprocessor = albumentations.Compose([self.rescaler, self.cropper])
+ else:
+ self.preprocessor = lambda **kwargs: kwargs
+
+ def __len__(self):
+ return self._length
+
+ def preprocess_image(self, image_path):
+ image = Image.open(image_path)
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+ image = np.array(image).astype(np.uint8)
+ image = self.preprocessor(image=image)["image"]
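+ # scale uint8 pixel values from [0, 255] to float32 in [-1, 1]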
+ image = (image/127.5 - 1.0).astype(np.float32)
+ return image
+
+ def __getitem__(self, i):
+ example = dict()
+ example["image"] = self.preprocess_image(self.labels["file_path_"][i])
+ for k in self.labels:
+ example[k] = self.labels[k][i]
+ return example
+
+
+class NumpyPaths(ImagePaths):
+ def preprocess_image(self, image_path):
+ image = np.load(image_path).squeeze(0) # 3 x 1024 x 1024
+ image = np.transpose(image, (1,2,0))
+ image = Image.fromarray(image, mode="RGB")
+ image = np.array(image).astype(np.uint8)
+ image = self.preprocessor(image=image)["image"]
+ image = (image/127.5 - 1.0).astype(np.float32)
+ return image
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/coco.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dca3c22ed3b0e619df5bf5adad448857f4aee380
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/coco.py
@@ -0,0 +1,176 @@
+import os
+import json
+import custom_albumentations as albumentations
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from torch.utils.data import Dataset
+
+from custom_controlnet_aux.diffusion_edge.taming.data.sflckr import SegmentationBase # for examples included in repo
+
+
+class Examples(SegmentationBase):
+ def __init__(self, size=256, random_crop=False, interpolation="bicubic"):
+ super().__init__(data_csv="data/coco_examples.txt",
+ data_root="data/coco_images",
+ segmentation_root="data/coco_segmentations",
+ size=size, random_crop=random_crop,
+ interpolation=interpolation,
+ n_labels=183, shift_segmentation=True)
+
+
+class CocoBase(Dataset):
+ """needed for (image, caption, segmentation) pairs"""
+ def __init__(self, size=None, dataroot="", datajson="", onehot_segmentation=False, use_stuffthing=False,
+ crop_size=None, force_no_crop=False, given_files=None):
+ self.split = self.get_split()
+ self.size = size
+ if crop_size is None:
+ self.crop_size = size
+ else:
+ self.crop_size = crop_size
+
+ self.onehot = onehot_segmentation # return segmentation as rgb or one hot
+ self.stuffthing = use_stuffthing # include thing in segmentation
+ if self.onehot and not self.stuffthing:
+ raise NotImplemented("One hot mode is only supported for the "
+ "stuffthings version because labels are stored "
+ "a bit different.")
+
+ data_json = datajson
+ with open(data_json) as json_file:
+ self.json_data = json.load(json_file)
+ self.img_id_to_captions = dict()
+ self.img_id_to_filepath = dict()
+ self.img_id_to_segmentation_filepath = dict()
+
+ assert data_json.split("/")[-1] in ["captions_train2017.json",
+ "captions_val2017.json"]
+ if self.stuffthing:
+ self.segmentation_prefix = (
+ "data/cocostuffthings/val2017" if
+ data_json.endswith("captions_val2017.json") else
+ "data/cocostuffthings/train2017")
+ else:
+ self.segmentation_prefix = (
+ "data/coco/annotations/stuff_val2017_pixelmaps" if
+ data_json.endswith("captions_val2017.json") else
+ "data/coco/annotations/stuff_train2017_pixelmaps")
+
+ imagedirs = self.json_data["images"]
+ self.labels = {"image_ids": list()}
+ for imgdir in tqdm(imagedirs, desc="ImgToPath"):
+ self.img_id_to_filepath[imgdir["id"]] = os.path.join(dataroot, imgdir["file_name"])
+ self.img_id_to_captions[imgdir["id"]] = list()
+ pngfilename = imgdir["file_name"].replace("jpg", "png")
+ self.img_id_to_segmentation_filepath[imgdir["id"]] = os.path.join(
+ self.segmentation_prefix, pngfilename)
+ if given_files is not None:
+ if pngfilename in given_files:
+ self.labels["image_ids"].append(imgdir["id"])
+ else:
+ self.labels["image_ids"].append(imgdir["id"])
+
+ capdirs = self.json_data["annotations"]
+ for capdir in tqdm(capdirs, desc="ImgToCaptions"):
+ # there are on average 5 captions per image
+ self.img_id_to_captions[capdir["image_id"]].append(np.array([capdir["caption"]]))
+
+ self.rescaler = albumentations.SmallestMaxSize(max_size=self.size)
+ if self.split=="validation":
+ self.cropper = albumentations.CenterCrop(height=self.crop_size, width=self.crop_size)
+ else:
+ self.cropper = albumentations.RandomCrop(height=self.crop_size, width=self.crop_size)
+ self.preprocessor = albumentations.Compose(
+ [self.rescaler, self.cropper],
+ additional_targets={"segmentation": "image"})
+ if force_no_crop:
+ self.rescaler = albumentations.Resize(height=self.size, width=self.size)
+ self.preprocessor = albumentations.Compose(
+ [self.rescaler],
+ additional_targets={"segmentation": "image"})
+
+ def __len__(self):
+ return len(self.labels["image_ids"])
+
+ def preprocess_image(self, image_path, segmentation_path):
+ image = Image.open(image_path)
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+ image = np.array(image).astype(np.uint8)
+
+ segmentation = Image.open(segmentation_path)
+ if not self.onehot and not segmentation.mode == "RGB":
+ segmentation = segmentation.convert("RGB")
+ segmentation = np.array(segmentation).astype(np.uint8)
+ if self.onehot:
+ assert self.stuffthing
+ # stored in caffe format: unlabeled==255. stuff and thing from
+ # 0-181. to be compatible with the labels in
+ # https://github.com/nightrome/cocostuff/blob/master/labels.txt
+ # we shift stuffthing one to the right and put unlabeled in zero
+ # as long as segmentation is uint8 shifting to right handles the
+ # latter too
+ assert segmentation.dtype == np.uint8
+ segmentation = segmentation + 1
+
+ processed = self.preprocessor(image=image, segmentation=segmentation)
+ image, segmentation = processed["image"], processed["segmentation"]
+ image = (image / 127.5 - 1.0).astype(np.float32)
+
+ if self.onehot:
+ assert segmentation.dtype == np.uint8
+ # make it one hot
+ n_labels = 183
+ flatseg = np.ravel(segmentation)
+ onehot = np.zeros((flatseg.size, n_labels), dtype=bool)
+ onehot[np.arange(flatseg.size), flatseg] = True
+ onehot = onehot.reshape(segmentation.shape + (n_labels,)).astype(int)
+ segmentation = onehot
+ else:
+ segmentation = (segmentation / 127.5 - 1.0).astype(np.float32)
+ return image, segmentation
+
+ def __getitem__(self, i):
+ img_path = self.img_id_to_filepath[self.labels["image_ids"][i]]
+ seg_path = self.img_id_to_segmentation_filepath[self.labels["image_ids"][i]]
+ image, segmentation = self.preprocess_image(img_path, seg_path)
+ captions = self.img_id_to_captions[self.labels["image_ids"][i]]
+ # randomly draw one of all available captions per image
+ caption = captions[np.random.randint(0, len(captions))]
+ example = {"image": image,
+ "caption": [str(caption[0])],
+ "segmentation": segmentation,
+ "img_path": img_path,
+ "seg_path": seg_path,
+ "filename_": img_path.split(os.sep)[-1]
+ }
+ return example
+
+
+class CocoImagesAndCaptionsTrain(CocoBase):
+ """returns a pair of (image, caption)"""
+ def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False):
+ super().__init__(size=size,
+ dataroot="data/coco/train2017",
+ datajson="data/coco/annotations/captions_train2017.json",
+ onehot_segmentation=onehot_segmentation,
+ use_stuffthing=use_stuffthing, crop_size=crop_size, force_no_crop=force_no_crop)
+
+ def get_split(self):
+ return "train"
+
+
+class CocoImagesAndCaptionsValidation(CocoBase):
+ """returns a pair of (image, caption)"""
+ def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False,
+ given_files=None):
+ super().__init__(size=size,
+ dataroot="data/coco/val2017",
+ datajson="data/coco/annotations/captions_val2017.json",
+ onehot_segmentation=onehot_segmentation,
+ use_stuffthing=use_stuffthing, crop_size=crop_size, force_no_crop=force_no_crop,
+ given_files=given_files)
+
+ def get_split(self):
+ return "validation"
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/objects_bbox.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/objects_bbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..08695dfd313f748842ac78b33e16b975ac1d7109
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/objects_bbox.py
@@ -0,0 +1,60 @@
+from itertools import cycle
+from typing import List, Tuple, Callable, Optional
+
+from PIL import Image as pil_image, ImageDraw as pil_img_draw, ImageFont
+from more_itertools.recipes import grouper
+from custom_controlnet_aux.diffusion_edge.taming.data.image_transforms import convert_pil_to_tensor
+from torch import LongTensor, Tensor
+
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import BoundingBox, Annotation
+from custom_controlnet_aux.diffusion_edge.taming.data.conditional_builder.objects_center_points import ObjectsCenterPointsConditionalBuilder
+from custom_controlnet_aux.diffusion_edge.taming.data.conditional_builder.utils import COLOR_PALETTE, WHITE, GRAY_75, BLACK, additional_parameters_string, \
+ pad_list, get_plot_font_size, absolute_bbox
+
+
+class ObjectsBoundingBoxConditionalBuilder(ObjectsCenterPointsConditionalBuilder):
+ @property
+ def object_descriptor_length(self) -> int:
+ return 3
+
+ def _make_object_descriptors(self, annotations: List[Annotation]) -> List[Tuple[int, ...]]:
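+ # each object becomes (class/flag token, top-left corner token, bottom-right corner token)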
+ object_triples = [
+ (self.object_representation(ann), *self.token_pair_from_bbox(ann.bbox))
+ for ann in annotations
+ ]
+ empty_triple = (self.none, self.none, self.none)
+ object_triples = pad_list(object_triples, empty_triple, self.no_max_objects)
+ return object_triples
+
+ def inverse_build(self, conditional: LongTensor) -> Tuple[List[Tuple[int, BoundingBox]], Optional[BoundingBox]]:
+ conditional_list = conditional.tolist()
+ crop_coordinates = None
+ if self.encode_crop:
+ crop_coordinates = self.bbox_from_token_pair(conditional_list[-2], conditional_list[-1])
+ conditional_list = conditional_list[:-2]
+ object_triples = grouper(conditional_list, 3)
+ assert conditional.shape[0] == self.embedding_dim
+ return [
+ (object_triple[0], self.bbox_from_token_pair(object_triple[1], object_triple[2]))
+ for object_triple in object_triples if object_triple[0] != self.none
+ ], crop_coordinates
+
+ def plot(self, conditional: LongTensor, label_for_category_no: Callable[[int], str], figure_size: Tuple[int, int],
+ line_width: int = 3, font_size: Optional[int] = None) -> Tensor:
+ plot = pil_image.new('RGB', figure_size, WHITE)
+ draw = pil_img_draw.Draw(plot)
+ font = ImageFont.truetype(
+ "/usr/share/fonts/truetype/lato/Lato-Regular.ttf",
+ size=get_plot_font_size(font_size, figure_size)
+ )
+ width, height = plot.size
+ description, crop_coordinates = self.inverse_build(conditional)
+ for (representation, bbox), color in zip(description, cycle(COLOR_PALETTE)):
+ annotation = self.representation_to_annotation(representation)
+ class_label = label_for_category_no(annotation.category_no) + ' ' + additional_parameters_string(annotation)
+ bbox = absolute_bbox(bbox, width, height)
+ draw.rectangle(bbox, outline=color, width=line_width)
+ draw.text((bbox[0] + line_width, bbox[1] + line_width), class_label, anchor='la', fill=BLACK, font=font)
+ if crop_coordinates is not None:
+ draw.rectangle(absolute_bbox(crop_coordinates, width, height), outline=GRAY_75, width=line_width)
+ return convert_pil_to_tensor(plot) / 127.5 - 1.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/objects_center_points.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/objects_center_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8a1b28e5a8eedc06ee9b75e6cb13adcc5684490
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/objects_center_points.py
@@ -0,0 +1,168 @@
+import math
+import random
+import warnings
+from itertools import cycle
+from typing import List, Optional, Tuple, Callable
+
+from PIL import Image as pil_image, ImageDraw as pil_img_draw, ImageFont
+from more_itertools.recipes import grouper
+from custom_controlnet_aux.diffusion_edge.taming.data.conditional_builder.utils import COLOR_PALETTE, WHITE, GRAY_75, BLACK, FULL_CROP, filter_annotations, \
+ additional_parameters_string, horizontally_flip_bbox, pad_list, get_circle_size, get_plot_font_size, \
+ absolute_bbox, rescale_annotations
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import BoundingBox, Annotation
+from custom_controlnet_aux.diffusion_edge.taming.data.image_transforms import convert_pil_to_tensor
+from torch import LongTensor, Tensor
+
+
+class ObjectsCenterPointsConditionalBuilder:
+ def __init__(self, no_object_classes: int, no_max_objects: int, no_tokens: int, encode_crop: bool,
+ use_group_parameter: bool, use_additional_parameters: bool):
+ self.no_object_classes = no_object_classes
+ self.no_max_objects = no_max_objects
+ self.no_tokens = no_tokens
+ self.encode_crop = encode_crop
+ self.no_sections = int(math.sqrt(self.no_tokens))
+ self.use_group_parameter = use_group_parameter
+ self.use_additional_parameters = use_additional_parameters
+
+ @property
+ def none(self) -> int:
+ return self.no_tokens - 1
+
+ @property
+ def object_descriptor_length(self) -> int:
+ return 2
+
+ @property
+ def embedding_dim(self) -> int:
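+ # one fixed-length descriptor per object slot, plus two tokens for the crop box when it is encoded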
+ extra_length = 2 if self.encode_crop else 0
+ return self.no_max_objects * self.object_descriptor_length + extra_length
+
+ def tokenize_coordinates(self, x: float, y: float) -> int:
+ """
+ Express 2d coordinates with one number.
+ Example: assume self.no_tokens = 16, then no_sections = 4:
+ 0 0 0 0
+ 0 0 # 0
+ 0 0 0 0
+ 0 0 0 x
+ Then the # position corresponds to token 6, the x position to token 15.
+ @param x: float in [0, 1]
+ @param y: float in [0, 1]
+ @return: discrete tokenized coordinate
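+ (Worked example for the grid above: the '#' cell sits in column 2, row 1, i.e. x=2/3, y=1/3,
+ so the token is 1 * 4 + 2 = 6.)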
+ """
+ x_discrete = int(round(x * (self.no_sections - 1)))
+ y_discrete = int(round(y * (self.no_sections - 1)))
+ return y_discrete * self.no_sections + x_discrete
+
+ def coordinates_from_token(self, token: int) -> (float, float):
+ x = token % self.no_sections
+ y = token // self.no_sections
+ return x / (self.no_sections - 1), y / (self.no_sections - 1)
+
+ def bbox_from_token_pair(self, token1: int, token2: int) -> BoundingBox:
+ x0, y0 = self.coordinates_from_token(token1)
+ x1, y1 = self.coordinates_from_token(token2)
+ return x0, y0, x1 - x0, y1 - y0
+
+ def token_pair_from_bbox(self, bbox: BoundingBox) -> Tuple[int, int]:
+ return self.tokenize_coordinates(bbox[0], bbox[1]), \
+ self.tokenize_coordinates(bbox[0] + bbox[2], bbox[1] + bbox[3])
+
+ def inverse_build(self, conditional: LongTensor) \
+ -> Tuple[List[Tuple[int, Tuple[float, float]]], Optional[BoundingBox]]:
+ conditional_list = conditional.tolist()
+ crop_coordinates = None
+ if self.encode_crop:
+ crop_coordinates = self.bbox_from_token_pair(conditional_list[-2], conditional_list[-1])
+ conditional_list = conditional_list[:-2]
+ table_of_content = grouper(conditional_list, self.object_descriptor_length)
+ assert conditional.shape[0] == self.embedding_dim
+ return [
+ (object_tuple[0], self.coordinates_from_token(object_tuple[1]))
+ for object_tuple in table_of_content if object_tuple[0] != self.none
+ ], crop_coordinates
+
+ def plot(self, conditional: LongTensor, label_for_category_no: Callable[[int], str], figure_size: Tuple[int, int],
+ line_width: int = 3, font_size: Optional[int] = None) -> Tensor:
+ plot = pil_image.new('RGB', figure_size, WHITE)
+ draw = pil_img_draw.Draw(plot)
+ circle_size = get_circle_size(figure_size)
+ font = ImageFont.truetype('/usr/share/fonts/truetype/lato/Lato-Regular.ttf',
+ size=get_plot_font_size(font_size, figure_size))
+ width, height = plot.size
+ description, crop_coordinates = self.inverse_build(conditional)
+ for (representation, (x, y)), color in zip(description, cycle(COLOR_PALETTE)):
+ x_abs, y_abs = x * width, y * height
+ ann = self.representation_to_annotation(representation)
+ label = label_for_category_no(ann.category_no) + ' ' + additional_parameters_string(ann)
+ ellipse_bbox = [x_abs - circle_size, y_abs - circle_size, x_abs + circle_size, y_abs + circle_size]
+ draw.ellipse(ellipse_bbox, fill=color, width=0)
+ draw.text((x_abs, y_abs), label, anchor='md', fill=BLACK, font=font)
+ if crop_coordinates is not None:
+ draw.rectangle(absolute_bbox(crop_coordinates, width, height), outline=GRAY_75, width=line_width)
+ return convert_pil_to_tensor(plot) / 127.5 - 1.
+
+ def object_representation(self, annotation: Annotation) -> int:
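+ # pack the optional boolean flags into a bit field (bit 0: is_group_of, bit 1: is_occluded,
+ # bit 2: is_depiction, bit 3: is_inside) and offset the class number by no_object_classes
+ # once per flag combination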
+ modifier = 0
+ if self.use_group_parameter:
+ modifier |= 1 * (annotation.is_group_of is True)
+ if self.use_additional_parameters:
+ modifier |= 2 * (annotation.is_occluded is True)
+ modifier |= 4 * (annotation.is_depiction is True)
+ modifier |= 8 * (annotation.is_inside is True)
+ return annotation.category_no + self.no_object_classes * modifier
+
+ def representation_to_annotation(self, representation: int) -> Annotation:
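+ # inverse of object_representation: split the token back into class number and flag bits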
+ category_no = representation % self.no_object_classes
+ modifier = representation // self.no_object_classes
+ # noinspection PyTypeChecker
+ return Annotation(
+ area=None, image_id=None, bbox=None, category_id=None, id=None, source=None, confidence=None,
+ category_no=category_no,
+ is_group_of=bool((modifier & 1) * self.use_group_parameter),
+ is_occluded=bool((modifier & 2) * self.use_additional_parameters),
+ is_depiction=bool((modifier & 4) * self.use_additional_parameters),
+ is_inside=bool((modifier & 8) * self.use_additional_parameters)
+ )
+
+ def _crop_encoder(self, crop_coordinates: BoundingBox) -> List[int]:
+ return list(self.token_pair_from_bbox(crop_coordinates))
+
+ def _make_object_descriptors(self, annotations: List[Annotation]) -> List[Tuple[int, ...]]:
+ object_tuples = [
+ (self.object_representation(a),
+ self.tokenize_coordinates(a.bbox[0] + a.bbox[2] / 2, a.bbox[1] + a.bbox[3] / 2))
+ for a in annotations
+ ]
+ empty_tuple = (self.none, self.none)
+ object_tuples = pad_list(object_tuples, empty_tuple, self.no_max_objects)
+ return object_tuples
+
+ def build(self, annotations: List, crop_coordinates: Optional[BoundingBox] = None, horizontal_flip: bool = False) \
+ -> LongTensor:
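+ # produce a flat token sequence: no_max_objects descriptors (padded with the 'none' token),
+ # followed by two crop-box tokens when encode_crop is enabled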
+ if len(annotations) == 0:
+ warnings.warn('Did not receive any annotations.')
+ if len(annotations) > self.no_max_objects:
+ warnings.warn('Received more annotations than allowed.')
+ annotations = annotations[:self.no_max_objects]
+
+ if not crop_coordinates:
+ crop_coordinates = FULL_CROP
+
+ random.shuffle(annotations)
+ annotations = filter_annotations(annotations, crop_coordinates)
+ if self.encode_crop:
+ annotations = rescale_annotations(annotations, FULL_CROP, horizontal_flip)
+ if horizontal_flip:
+ crop_coordinates = horizontally_flip_bbox(crop_coordinates)
+ extra = self._crop_encoder(crop_coordinates)
+ else:
+ annotations = rescale_annotations(annotations, crop_coordinates, horizontal_flip)
+ extra = []
+
+ object_tuples = self._make_object_descriptors(annotations)
+ flattened = [token for tuple_ in object_tuples for token in tuple_] + extra
+ assert len(flattened) == self.embedding_dim
+ assert all(0 <= value < self.no_tokens for value in flattened)
+ return LongTensor(flattened)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f51b7ec03638b0ca516e38a39d2b4b0e6f0cedd4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/conditional_builder/utils.py
@@ -0,0 +1,105 @@
+import importlib
+from typing import List, Any, Tuple, Optional
+
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import BoundingBox, Annotation
+
+# source: seaborn, color palette tab10
+COLOR_PALETTE = [(30, 118, 179), (255, 126, 13), (43, 159, 43), (213, 38, 39), (147, 102, 188),
+ (139, 85, 74), (226, 118, 193), (126, 126, 126), (187, 188, 33), (22, 189, 206)]
+BLACK = (0, 0, 0)
+GRAY_75 = (63, 63, 63)
+GRAY_50 = (127, 127, 127)
+GRAY_25 = (191, 191, 191)
+WHITE = (255, 255, 255)
+FULL_CROP = (0., 0., 1., 1.)
+
+
+def intersection_area(rectangle1: BoundingBox, rectangle2: BoundingBox) -> float:
+ """
+ Give intersection area of two rectangles.
+ @param rectangle1: (x0, y0, w, h) of first rectangle
+ @param rectangle2: (x0, y0, w, h) of second rectangle
+ """
+ rectangle1 = rectangle1[0], rectangle1[1], rectangle1[0] + rectangle1[2], rectangle1[1] + rectangle1[3]
+ rectangle2 = rectangle2[0], rectangle2[1], rectangle2[0] + rectangle2[2], rectangle2[1] + rectangle2[3]
+ x_overlap = max(0., min(rectangle1[2], rectangle2[2]) - max(rectangle1[0], rectangle2[0]))
+ y_overlap = max(0., min(rectangle1[3], rectangle2[3]) - max(rectangle1[1], rectangle2[1]))
+ return x_overlap * y_overlap
+
+
+def horizontally_flip_bbox(bbox: BoundingBox) -> BoundingBox:
+ return 1 - (bbox[0] + bbox[2]), bbox[1], bbox[2], bbox[3]
+
+
+def absolute_bbox(relative_bbox: BoundingBox, width: int, height: int) -> Tuple[int, int, int, int]:
+ bbox = relative_bbox
+ bbox = bbox[0] * width, bbox[1] * height, (bbox[0] + bbox[2]) * width, (bbox[1] + bbox[3]) * height
+ return int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
+
+
+def pad_list(list_: List, pad_element: Any, pad_to_length: int) -> List:
+ return list_ + [pad_element for _ in range(pad_to_length - len(list_))]
+
+
+def rescale_annotations(annotations: List[Annotation], crop_coordinates: BoundingBox, flip: bool) -> \
+ List[Annotation]:
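+ # re-express each bbox in the coordinate frame of the crop, clamping to [0, 1]
+ # and mirroring horizontally when flip is set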
+ def clamp(x: float):
+ return max(min(x, 1.), 0.)
+
+ def rescale_bbox(bbox: BoundingBox) -> BoundingBox:
+ x0 = clamp((bbox[0] - crop_coordinates[0]) / crop_coordinates[2])
+ y0 = clamp((bbox[1] - crop_coordinates[1]) / crop_coordinates[3])
+ w = min(bbox[2] / crop_coordinates[2], 1 - x0)
+ h = min(bbox[3] / crop_coordinates[3], 1 - y0)
+ if flip:
+ x0 = 1 - (x0 + w)
+ return x0, y0, w, h
+
+ return [a._replace(bbox=rescale_bbox(a.bbox)) for a in annotations]
+
+
+def filter_annotations(annotations: List[Annotation], crop_coordinates: BoundingBox) -> List:
+ return [a for a in annotations if intersection_area(a.bbox, crop_coordinates) > 0.0]
+
+
+def additional_parameters_string(annotation: Annotation, short: bool = True) -> str:
+ sl = slice(1) if short else slice(None)
+ string = ''
+ if not (annotation.is_group_of or annotation.is_occluded or annotation.is_depiction or annotation.is_inside):
+ return string
+ if annotation.is_group_of:
+ string += 'group'[sl] + ','
+ if annotation.is_occluded:
+ string += 'occluded'[sl] + ','
+ if annotation.is_depiction:
+ string += 'depiction'[sl] + ','
+ if annotation.is_inside:
+ string += 'inside'[sl]
+ return '(' + string.strip(",") + ')'
+
+
+def get_plot_font_size(font_size: Optional[int], figure_size: Tuple[int, int]) -> int:
+ if font_size is None:
+ font_size = 10
+ if max(figure_size) >= 256:
+ font_size = 12
+ if max(figure_size) >= 512:
+ font_size = 15
+ return font_size
+
+
+def get_circle_size(figure_size: Tuple[int, int]) -> int:
+ circle_size = 2
+ if max(figure_size) >= 256:
+ circle_size = 3
+ if max(figure_size) >= 512:
+ circle_size = 4
+ return circle_size
+
+
+def load_object_from_string(object_string: str) -> Any:
+ """
+ Source: https://stackoverflow.com/a/10773699
+ """
+ module_name, class_name = object_string.rsplit(".", 1)
+ return getattr(importlib.import_module(module_name), class_name)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/custom.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/custom.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b001a6879ffe5cadaf239aae129d64111f2dc4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/custom.py
@@ -0,0 +1,38 @@
+import os
+import numpy as np
+import custom_albumentations as albumentations
+from torch.utils.data import Dataset
+
+from custom_controlnet_aux.diffusion_edge.taming.data.base import ImagePaths, NumpyPaths, ConcatDatasetWithIndex
+
+
+class CustomBase(Dataset):
+ def __init__(self, *args, **kwargs):
+ super().__init__()
+ self.data = None
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, i):
+ example = self.data[i]
+ return example
+
+
+
+class CustomTrain(CustomBase):
+ def __init__(self, size, training_images_list_file):
+ super().__init__()
+ with open(training_images_list_file, "r") as f:
+ paths = f.read().splitlines()
+ self.data = ImagePaths(paths=paths, size=size, random_crop=False)
+
+
+class CustomTest(CustomBase):
+ def __init__(self, size, test_images_list_file):
+ super().__init__()
+ with open(test_images_list_file, "r") as f:
+ paths = f.read().splitlines()
+ self.data = ImagePaths(paths=paths, size=size, random_crop=False)
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/faceshq.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/faceshq.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed02ecdf603d8164d6cc59c3b89b2b0591465df8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/faceshq.py
@@ -0,0 +1,134 @@
+import os
+import numpy as np
+import custom_albumentations as albumentations
+from torch.utils.data import Dataset
+
+from custom_controlnet_aux.diffusion_edge.taming.data.base import ImagePaths, NumpyPaths, ConcatDatasetWithIndex
+
+
+class FacesBase(Dataset):
+ def __init__(self, *args, **kwargs):
+ super().__init__()
+ self.data = None
+ self.keys = None
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, i):
+ example = self.data[i]
+ ex = {}
+ if self.keys is not None:
+ for k in self.keys:
+ ex[k] = example[k]
+ else:
+ ex = example
+ return ex
+
+
+class CelebAHQTrain(FacesBase):
+ def __init__(self, size, keys=None):
+ super().__init__()
+ root = "data/celebahq"
+ with open("data/celebahqtrain.txt", "r") as f:
+ relpaths = f.read().splitlines()
+ paths = [os.path.join(root, relpath) for relpath in relpaths]
+ self.data = NumpyPaths(paths=paths, size=size, random_crop=False)
+ self.keys = keys
+
+
+class CelebAHQValidation(FacesBase):
+ def __init__(self, size, keys=None):
+ super().__init__()
+ root = "data/celebahq"
+ with open("data/celebahqvalidation.txt", "r") as f:
+ relpaths = f.read().splitlines()
+ paths = [os.path.join(root, relpath) for relpath in relpaths]
+ self.data = NumpyPaths(paths=paths, size=size, random_crop=False)
+ self.keys = keys
+
+
+class FFHQTrain(FacesBase):
+ def __init__(self, size, keys=None):
+ super().__init__()
+ root = "data/ffhq"
+ with open("data/ffhqtrain.txt", "r") as f:
+ relpaths = f.read().splitlines()
+ paths = [os.path.join(root, relpath) for relpath in relpaths]
+ self.data = ImagePaths(paths=paths, size=size, random_crop=False)
+ self.keys = keys
+
+
+class FFHQValidation(FacesBase):
+ def __init__(self, size, keys=None):
+ super().__init__()
+ root = "data/ffhq"
+ with open("data/ffhqvalidation.txt", "r") as f:
+ relpaths = f.read().splitlines()
+ paths = [os.path.join(root, relpath) for relpath in relpaths]
+ self.data = ImagePaths(paths=paths, size=size, random_crop=False)
+ self.keys = keys
+
+
+class FacesHQTrain(Dataset):
+ # CelebAHQ [0] + FFHQ [1]
+ def __init__(self, size, keys=None, crop_size=None, coord=False):
+ d1 = CelebAHQTrain(size=size, keys=keys)
+ d2 = FFHQTrain(size=size, keys=keys)
+ self.data = ConcatDatasetWithIndex([d1, d2])
+ self.coord = coord
+ if crop_size is not None:
+ self.cropper = albumentations.RandomCrop(height=crop_size,width=crop_size)
+ if self.coord:
+ self.cropper = albumentations.Compose([self.cropper],
+ additional_targets={"coord": "image"})
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, i):
+ ex, y = self.data[i]
+ if hasattr(self, "cropper"):
+ if not self.coord:
+ out = self.cropper(image=ex["image"])
+ ex["image"] = out["image"]
+ else:
+ h,w,_ = ex["image"].shape
+ coord = np.arange(h*w).reshape(h,w,1)/(h*w)
+ out = self.cropper(image=ex["image"], coord=coord)
+ ex["image"] = out["image"]
+ ex["coord"] = out["coord"]
+ ex["class"] = y
+ return ex
+
+
+class FacesHQValidation(Dataset):
+ # CelebAHQ [0] + FFHQ [1]
+ def __init__(self, size, keys=None, crop_size=None, coord=False):
+ d1 = CelebAHQValidation(size=size, keys=keys)
+ d2 = FFHQValidation(size=size, keys=keys)
+ self.data = ConcatDatasetWithIndex([d1, d2])
+ self.coord = coord
+ if crop_size is not None:
+ self.cropper = albumentations.CenterCrop(height=crop_size,width=crop_size)
+ if self.coord:
+ self.cropper = albumentations.Compose([self.cropper],
+ additional_targets={"coord": "image"})
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, i):
+ ex, y = self.data[i]
+ if hasattr(self, "cropper"):
+ if not self.coord:
+ out = self.cropper(image=ex["image"])
+ ex["image"] = out["image"]
+ else:
+ h,w,_ = ex["image"].shape
+ coord = np.arange(h*w).reshape(h,w,1)/(h*w)
+ out = self.cropper(image=ex["image"], coord=coord)
+ ex["image"] = out["image"]
+ ex["coord"] = out["coord"]
+ ex["class"] = y
+ return ex
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/helper_types.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/helper_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5eadaa0aee77671b52ea91af79ba6dd87f18439
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/helper_types.py
@@ -0,0 +1,49 @@
+from typing import Dict, Tuple, Optional, NamedTuple, Union
+from PIL.Image import Image as pil_image
+from torch import Tensor
+
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal
+
+Image = Union[Tensor, pil_image]
+BoundingBox = Tuple[float, float, float, float] # x0, y0, w, h
+ CropMethodType = Literal['none', 'random-1d', 'center', 'random-2d']
+SplitType = Literal['train', 'validation', 'test']
+
+
+class ImageDescription(NamedTuple):
+ id: int
+ file_name: str
+ original_size: Tuple[int, int] # w, h
+ url: Optional[str] = None
+ license: Optional[int] = None
+ coco_url: Optional[str] = None
+ date_captured: Optional[str] = None
+ flickr_url: Optional[str] = None
+ flickr_id: Optional[str] = None
+ coco_id: Optional[str] = None
+
+
+class Category(NamedTuple):
+ id: str
+ super_category: Optional[str]
+ name: str
+
+
+class Annotation(NamedTuple):
+ area: float
+ image_id: str
+ bbox: BoundingBox
+ category_no: int
+ category_id: str
+ id: Optional[int] = None
+ source: Optional[str] = None
+ confidence: Optional[float] = None
+ is_group_of: Optional[bool] = None
+ is_truncated: Optional[bool] = None
+ is_occluded: Optional[bool] = None
+ is_depiction: Optional[bool] = None
+ is_inside: Optional[bool] = None
+ segmentation: Optional[Dict] = None
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/image_transforms.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/image_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..d32fe231e5dedb37f1b5f1762501514e1be4cc33
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/image_transforms.py
@@ -0,0 +1,132 @@
+import random
+import warnings
+from typing import Union
+
+import torch
+from torch import Tensor
+from torchvision.transforms import RandomCrop, functional as F, CenterCrop, RandomHorizontalFlip, PILToTensor
+ try:
+ from torchvision.transforms.functional import get_image_size
+ except ImportError: # older torchvision only exposes the private helper
+ from torchvision.transforms.functional import _get_image_size as get_image_size
+
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import BoundingBox, Image
+
+pil_to_tensor = PILToTensor()
+
+
+def convert_pil_to_tensor(image: Image) -> Tensor:
+ with warnings.catch_warnings():
+ # to filter PyTorch UserWarning as described here: https://github.com/pytorch/vision/issues/2194
+ warnings.simplefilter("ignore")
+ return pil_to_tensor(image)
+
+
+class RandomCrop1dReturnCoordinates(RandomCrop):
+ def forward(self, img: Image) -> (BoundingBox, Image):
+ """
+ Additionally to cropping, returns the relative coordinates of the crop bounding box.
+ Args:
+ img (PIL Image or Tensor): Image to be cropped.
+
+ Returns:
+ Bounding box: x0, y0, w, h
+ PIL Image or Tensor: Cropped image.
+
+ Based on:
+ torchvision.transforms.RandomCrop, torchvision 1.7.0
+ """
+ if self.padding is not None:
+ img = F.pad(img, self.padding, self.fill, self.padding_mode)
+
+ width, height = get_image_size(img)
+ # pad the width if needed
+ if self.pad_if_needed and width < self.size[1]:
+ padding = [self.size[1] - width, 0]
+ img = F.pad(img, padding, self.fill, self.padding_mode)
+ # pad the height if needed
+ if self.pad_if_needed and height < self.size[0]:
+ padding = [0, self.size[0] - height]
+ img = F.pad(img, padding, self.fill, self.padding_mode)
+
+ i, j, h, w = self.get_params(img, self.size)
+ bbox = (j / width, i / height, w / width, h / height) # x0, y0, w, h
+ return bbox, F.crop(img, i, j, h, w)
+
+
+class Random2dCropReturnCoordinates(torch.nn.Module):
+ """
+ Additionally to cropping, returns the relative coordinates of the crop bounding box.
+ Args:
+ img (PIL Image or Tensor): Image to be cropped.
+
+ Returns:
+ Bounding box: x0, y0, w, h
+ PIL Image or Tensor: Cropped image.
+
+ Based on:
+ torchvision.transforms.RandomCrop, torchvision 1.7.0
+ """
+
+ def __init__(self, min_size: int):
+ super().__init__()
+ self.min_size = min_size
+
+ def forward(self, img: Image) -> (BoundingBox, Image):
+ width, height = get_image_size(img)
+ max_size = min(width, height)
+ if max_size <= self.min_size:
+ size = max_size
+ else:
+ size = random.randint(self.min_size, max_size)
+ top = random.randint(0, height - size)
+ left = random.randint(0, width - size)
+ bbox = left / width, top / height, size / width, size / height
+ return bbox, F.crop(img, top, left, size, size)
+
+
+class CenterCropReturnCoordinates(CenterCrop):
+ @staticmethod
+ def get_bbox_of_center_crop(width: int, height: int) -> BoundingBox:
+ if width > height:
+ w = height / width
+ h = 1.0
+ x0 = 0.5 - w / 2
+ y0 = 0.
+ else:
+ w = 1.0
+ h = width / height
+ x0 = 0.
+ y0 = 0.5 - h / 2
+ return x0, y0, w, h
+
+ def forward(self, img: Union[Image, Tensor]) -> (BoundingBox, Union[Image, Tensor]):
+ """
+ Additionally to cropping, returns the relative coordinates of the crop bounding box.
+ Args:
+ img (PIL Image or Tensor): Image to be cropped.
+
+ Returns:
+ Bounding box: x0, y0, w, h
+ PIL Image or Tensor: Cropped image.
+ Based on:
+ torchvision.transforms.RandomHorizontalFlip (version 1.7.0)
+ """
+ width, height = get_image_size(img)
+ return self.get_bbox_of_center_crop(width, height), F.center_crop(img, self.size)
+
+
+class RandomHorizontalFlipReturn(RandomHorizontalFlip):
+ def forward(self, img: Image) -> (bool, Image):
+ """
+ Additionally to flipping, returns a boolean whether it was flipped or not.
+ Args:
+ img (PIL Image or Tensor): Image to be flipped.
+
+ Returns:
+ flipped: whether the image was flipped or not
+ PIL Image or Tensor: Randomly flipped image.
+
+ Based on:
+ torchvision.transforms.RandomHorizontalFlip (version 1.7.0)
+ """
+ if torch.rand(1) < self.p:
+ return True, F.hflip(img)
+ return False, img
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/imagenet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/imagenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..97138d05e8f3a5a3f08b483e6a4a83ed58e3f456
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/imagenet.py
@@ -0,0 +1,558 @@
+import os, tarfile, glob, shutil
+import yaml
+import numpy as np
+from tqdm import tqdm
+from PIL import Image
+import custom_albumentations as albumentations
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset
+
+from custom_controlnet_aux.diffusion_edge.taming.data.base import ImagePaths
+from custom_controlnet_aux.diffusion_edge.taming.util import download, retrieve
+ import custom_controlnet_aux.diffusion_edge.taming.data.utils as bdu
+
+
+def give_synsets_from_indices(indices, path_to_yaml="data/imagenet_idx_to_synset.yaml"):
+ synsets = []
+ with open(path_to_yaml) as f:
+ di2s = yaml.load(f, Loader=yaml.SafeLoader)
+ for idx in indices:
+ synsets.append(str(di2s[idx]))
+ print("Using {} different synsets for construction of Restriced Imagenet.".format(len(synsets)))
+ return synsets
+
+
+def str_to_indices(string):
+ """Expects a string in the format '32-123, 256, 280-321'"""
+ assert not string.endswith(","), "provided string '{}' ends with a comma, pls remove it".format(string)
+ subs = string.split(",")
+ indices = []
+ for sub in subs:
+ subsubs = sub.split("-")
+ assert len(subsubs) > 0
+ if len(subsubs) == 1:
+ indices.append(int(subsubs[0]))
+ else:
+ rang = [j for j in range(int(subsubs[0]), int(subsubs[1]))]
+ indices.extend(rang)
+ return sorted(indices)
+
+
+class ImageNetBase(Dataset):
+ def __init__(self, config=None):
+ self.config = config or OmegaConf.create()
+ if not isinstance(self.config, dict):
+ self.config = OmegaConf.to_container(self.config)
+ self._prepare()
+ self._prepare_synset_to_human()
+ self._prepare_idx_to_synset()
+ self._load()
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, i):
+ return self.data[i]
+
+ def _prepare(self):
+ raise NotImplementedError()
+
+ def _filter_relpaths(self, relpaths):
+ ignore = set([
+ "n06596364_9591.JPEG",
+ ])
+ relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
+ if "sub_indices" in self.config:
+ indices = str_to_indices(self.config["sub_indices"])
+ synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings
+ files = []
+ for rpath in relpaths:
+ syn = rpath.split("/")[0]
+ if syn in synsets:
+ files.append(rpath)
+ return files
+ else:
+ return relpaths
+
+ def _prepare_synset_to_human(self):
+ SIZE = 2655750
+ URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
+ self.human_dict = os.path.join(self.root, "synset_human.txt")
+ if (not os.path.exists(self.human_dict) or
+ not os.path.getsize(self.human_dict)==SIZE):
+ download(URL, self.human_dict)
+
+ def _prepare_idx_to_synset(self):
+ URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
+ self.idx2syn = os.path.join(self.root, "index_synset.yaml")
+ if (not os.path.exists(self.idx2syn)):
+ download(URL, self.idx2syn)
+
+ def _load(self):
+ with open(self.txt_filelist, "r") as f:
+ self.relpaths = f.read().splitlines()
+ l1 = len(self.relpaths)
+ self.relpaths = self._filter_relpaths(self.relpaths)
+ print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
+
+ self.synsets = [p.split("/")[0] for p in self.relpaths]
+ self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
+
+ unique_synsets = np.unique(self.synsets)
+ class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
+ self.class_labels = [class_dict[s] for s in self.synsets]
+
+ with open(self.human_dict, "r") as f:
+ human_dict = f.read().splitlines()
+ human_dict = dict(line.split(maxsplit=1) for line in human_dict)
+
+ self.human_labels = [human_dict[s] for s in self.synsets]
+
+ labels = {
+ "relpath": np.array(self.relpaths),
+ "synsets": np.array(self.synsets),
+ "class_label": np.array(self.class_labels),
+ "human_label": np.array(self.human_labels),
+ }
+ self.data = ImagePaths(self.abspaths,
+ labels=labels,
+ size=retrieve(self.config, "size", default=0),
+ random_crop=self.random_crop)
+
+
+class ImageNetTrain(ImageNetBase):
+ NAME = "ILSVRC2012_train"
+ URL = "http://www.image-net.org/challenges/LSVRC/2012/"
+ AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
+ FILES = [
+ "ILSVRC2012_img_train.tar",
+ ]
+ SIZES = [
+ 147897477120,
+ ]
+
+ def _prepare(self):
+ self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
+ default=True)
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
+ self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
+ self.datadir = os.path.join(self.root, "data")
+ self.txt_filelist = os.path.join(self.root, "filelist.txt")
+ self.expected_length = 1281167
+ if not bdu.is_prepared(self.root):
+ # prep
+ print("Preparing dataset {} in {}".format(self.NAME, self.root))
+
+ datadir = self.datadir
+ if not os.path.exists(datadir):
+ path = os.path.join(self.root, self.FILES[0])
+ if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+ import academictorrents as at
+ atpath = at.get(self.AT_HASH, datastore=self.root)
+ assert atpath == path
+
+ print("Extracting {} to {}".format(path, datadir))
+ os.makedirs(datadir, exist_ok=True)
+ with tarfile.open(path, "r:") as tar:
+ tar.extractall(path=datadir)
+
+ print("Extracting sub-tars.")
+ subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
+ for subpath in tqdm(subpaths):
+ subdir = subpath[:-len(".tar")]
+ os.makedirs(subdir, exist_ok=True)
+ with tarfile.open(subpath, "r:") as tar:
+ tar.extractall(path=subdir)
+
+
+ filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+ filelist = [os.path.relpath(p, start=datadir) for p in filelist]
+ filelist = sorted(filelist)
+ filelist = "\n".join(filelist)+"\n"
+ with open(self.txt_filelist, "w") as f:
+ f.write(filelist)
+
+ bdu.mark_prepared(self.root)
+
+
+class ImageNetValidation(ImageNetBase):
+ NAME = "ILSVRC2012_validation"
+ URL = "http://www.image-net.org/challenges/LSVRC/2012/"
+ AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
+ VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
+ FILES = [
+ "ILSVRC2012_img_val.tar",
+ "validation_synset.txt",
+ ]
+ SIZES = [
+ 6744924160,
+ 1950000,
+ ]
+
+ def _prepare(self):
+ self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
+ default=False)
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
+ self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
+ self.datadir = os.path.join(self.root, "data")
+ self.txt_filelist = os.path.join(self.root, "filelist.txt")
+ self.expected_length = 50000
+ if not bdu.is_prepared(self.root):
+ # prep
+ print("Preparing dataset {} in {}".format(self.NAME, self.root))
+
+ datadir = self.datadir
+ if not os.path.exists(datadir):
+ path = os.path.join(self.root, self.FILES[0])
+ if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+ import academictorrents as at
+ atpath = at.get(self.AT_HASH, datastore=self.root)
+ assert atpath == path
+
+ print("Extracting {} to {}".format(path, datadir))
+ os.makedirs(datadir, exist_ok=True)
+ with tarfile.open(path, "r:") as tar:
+ tar.extractall(path=datadir)
+
+ vspath = os.path.join(self.root, self.FILES[1])
+ if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
+ download(self.VS_URL, vspath)
+
+ with open(vspath, "r") as f:
+ synset_dict = f.read().splitlines()
+ synset_dict = dict(line.split() for line in synset_dict)
+
+ print("Reorganizing into synset folders")
+ synsets = np.unique(list(synset_dict.values()))
+ for s in synsets:
+ os.makedirs(os.path.join(datadir, s), exist_ok=True)
+ for k, v in synset_dict.items():
+ src = os.path.join(datadir, k)
+ dst = os.path.join(datadir, v)
+ shutil.move(src, dst)
+
+ filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+ filelist = [os.path.relpath(p, start=datadir) for p in filelist]
+ filelist = sorted(filelist)
+ filelist = "\n".join(filelist)+"\n"
+ with open(self.txt_filelist, "w") as f:
+ f.write(filelist)
+
+ bdu.mark_prepared(self.root)
+
+
+def get_preprocessor(size=None, random_crop=False, additional_targets=None,
+ crop_size=None):
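+    # size > 0: rescale the shortest side to `size`, then (center or random) crop and a random horizontal flip;
+    # crop_size > 0 only: crop without rescaling; otherwise the identity transform is returned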
+ if size is not None and size > 0:
+ transforms = list()
+ rescaler = albumentations.SmallestMaxSize(max_size = size)
+ transforms.append(rescaler)
+ if not random_crop:
+ cropper = albumentations.CenterCrop(height=size,width=size)
+ transforms.append(cropper)
+ else:
+ cropper = albumentations.RandomCrop(height=size,width=size)
+ transforms.append(cropper)
+ flipper = albumentations.HorizontalFlip()
+ transforms.append(flipper)
+ preprocessor = albumentations.Compose(transforms,
+ additional_targets=additional_targets)
+ elif crop_size is not None and crop_size > 0:
+ if not random_crop:
+ cropper = albumentations.CenterCrop(height=crop_size,width=crop_size)
+ else:
+ cropper = albumentations.RandomCrop(height=crop_size,width=crop_size)
+ transforms = [cropper]
+ preprocessor = albumentations.Compose(transforms,
+ additional_targets=additional_targets)
+ else:
+ preprocessor = lambda **kwargs: kwargs
+ return preprocessor
+
+
+def rgba_to_depth(x):
+ assert x.dtype == np.uint8
+ assert len(x.shape) == 3 and x.shape[2] == 4
+ y = x.copy()
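+    # reinterpret the four RGBA uint8 bytes of each pixel as a single float32 (a bitwise view, not a numeric cast)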
+ y.dtype = np.float32
+ y = y.reshape(x.shape[:2])
+ return np.ascontiguousarray(y)
+
+
+class BaseWithDepth(Dataset):
+ DEFAULT_DEPTH_ROOT="data/imagenet_depth"
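+    # pairs base ImageNet examples with depth maps read as RGBA-encoded float32 PNGs
+    # from DEFAULT_DEPTH_ROOT/{train,val}; depth is min-max normalized to [-1, 1] per image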
+
+ def __init__(self, config=None, size=None, random_crop=False,
+ crop_size=None, root=None):
+ self.config = config
+ self.base_dset = self.get_base_dset()
+ self.preprocessor = get_preprocessor(
+ size=size,
+ crop_size=crop_size,
+ random_crop=random_crop,
+ additional_targets={"depth": "image"})
+ self.crop_size = crop_size
+ if self.crop_size is not None:
+ self.rescaler = albumentations.Compose(
+ [albumentations.SmallestMaxSize(max_size = self.crop_size)],
+ additional_targets={"depth": "image"})
+ if root is not None:
+ self.DEFAULT_DEPTH_ROOT = root
+
+ def __len__(self):
+ return len(self.base_dset)
+
+ def preprocess_depth(self, path):
+ rgba = np.array(Image.open(path))
+ depth = rgba_to_depth(rgba)
+ depth = (depth - depth.min())/max(1e-8, depth.max()-depth.min())
+ depth = 2.0*depth-1.0
+ return depth
+
+ def __getitem__(self, i):
+ e = self.base_dset[i]
+ e["depth"] = self.preprocess_depth(self.get_depth_path(e))
+ # up if necessary
+ h,w,c = e["image"].shape
+ if self.crop_size and min(h,w) < self.crop_size:
+ # have to upscale to be able to crop - this just uses bilinear
+ out = self.rescaler(image=e["image"], depth=e["depth"])
+ e["image"] = out["image"]
+ e["depth"] = out["depth"]
+ transformed = self.preprocessor(image=e["image"], depth=e["depth"])
+ e["image"] = transformed["image"]
+ e["depth"] = transformed["depth"]
+ return e
+
+
+class ImageNetTrainWithDepth(BaseWithDepth):
+ # default to random_crop=True
+ def __init__(self, random_crop=True, sub_indices=None, **kwargs):
+ self.sub_indices = sub_indices
+ super().__init__(random_crop=random_crop, **kwargs)
+
+ def get_base_dset(self):
+ if self.sub_indices is None:
+ return ImageNetTrain()
+ else:
+ return ImageNetTrain({"sub_indices": self.sub_indices})
+
+ def get_depth_path(self, e):
+ fid = os.path.splitext(e["relpath"])[0]+".png"
+ fid = os.path.join(self.DEFAULT_DEPTH_ROOT, "train", fid)
+ return fid
+
+
+class ImageNetValidationWithDepth(BaseWithDepth):
+ def __init__(self, sub_indices=None, **kwargs):
+ self.sub_indices = sub_indices
+ super().__init__(**kwargs)
+
+ def get_base_dset(self):
+ if self.sub_indices is None:
+ return ImageNetValidation()
+ else:
+ return ImageNetValidation({"sub_indices": self.sub_indices})
+
+ def get_depth_path(self, e):
+ fid = os.path.splitext(e["relpath"])[0]+".png"
+ fid = os.path.join(self.DEFAULT_DEPTH_ROOT, "val", fid)
+ return fid
+
+
+class RINTrainWithDepth(ImageNetTrainWithDepth):
+ def __init__(self, config=None, size=None, random_crop=True, crop_size=None):
+ sub_indices = "30-32, 33-37, 151-268, 281-285, 80-100, 365-382, 389-397, 118-121, 300-319"
+ super().__init__(config=config, size=size, random_crop=random_crop,
+ sub_indices=sub_indices, crop_size=crop_size)
+
+
+class RINValidationWithDepth(ImageNetValidationWithDepth):
+ def __init__(self, config=None, size=None, random_crop=False, crop_size=None):
+ sub_indices = "30-32, 33-37, 151-268, 281-285, 80-100, 365-382, 389-397, 118-121, 300-319"
+ super().__init__(config=config, size=size, random_crop=random_crop,
+ sub_indices=sub_indices, crop_size=crop_size)
+
+
+class DRINExamples(Dataset):
+ def __init__(self):
+ self.preprocessor = get_preprocessor(size=256, additional_targets={"depth": "image"})
+ with open("data/drin_examples.txt", "r") as f:
+ relpaths = f.read().splitlines()
+ self.image_paths = [os.path.join("data/drin_images",
+ relpath) for relpath in relpaths]
+ self.depth_paths = [os.path.join("data/drin_depth",
+ relpath.replace(".JPEG", ".png")) for relpath in relpaths]
+
+ def __len__(self):
+ return len(self.image_paths)
+
+ def preprocess_image(self, image_path):
+ image = Image.open(image_path)
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+ image = np.array(image).astype(np.uint8)
+ image = self.preprocessor(image=image)["image"]
+ image = (image/127.5 - 1.0).astype(np.float32)
+ return image
+
+ def preprocess_depth(self, path):
+ rgba = np.array(Image.open(path))
+ depth = rgba_to_depth(rgba)
+ depth = (depth - depth.min())/max(1e-8, depth.max()-depth.min())
+ depth = 2.0*depth-1.0
+ return depth
+
+ def __getitem__(self, i):
+ e = dict()
+ e["image"] = self.preprocess_image(self.image_paths[i])
+ e["depth"] = self.preprocess_depth(self.depth_paths[i])
+ transformed = self.preprocessor(image=e["image"], depth=e["depth"])
+ e["image"] = transformed["image"]
+ e["depth"] = transformed["depth"]
+ return e
+
+
+def imscale(x, factor, keepshapes=False, keepmode="bicubic"):
+ if factor is None or factor==1:
+ return x
+
+ dtype = x.dtype
+ assert dtype in [np.float32, np.float64]
+ assert x.min() >= -1
+ assert x.max() <= 1
+
+ keepmode = {"nearest": Image.NEAREST, "bilinear": Image.BILINEAR,
+ "bicubic": Image.BICUBIC}[keepmode]
+
+ lr = (x+1.0)*127.5
+ lr = lr.clip(0,255).astype(np.uint8)
+ lr = Image.fromarray(lr)
+
+ h, w, _ = x.shape
+ nh = h//factor
+ nw = w//factor
+ assert nh > 0 and nw > 0, (nh, nw)
+
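+    # downscale by `factor` (bicubic); with keepshapes=True the result is resized
+    # back to the original (w, h) using `keepmode` so it matches the input shape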
+ lr = lr.resize((nw,nh), Image.BICUBIC)
+ if keepshapes:
+ lr = lr.resize((w,h), keepmode)
+ lr = np.array(lr)/127.5-1.0
+ lr = lr.astype(dtype)
+
+ return lr
+
+
+class ImageNetScale(Dataset):
+ def __init__(self, size=None, crop_size=None, random_crop=False,
+ up_factor=None, hr_factor=None, keep_mode="bicubic"):
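+        # hr_factor optionally downscales the base image (the returned "image");
+        # up_factor additionally produces a matching low-resolution "lr" input
+        # via imscale(..., keepshapes=True)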
+ self.base = self.get_base()
+
+ self.size = size
+ self.crop_size = crop_size if crop_size is not None else self.size
+ self.random_crop = random_crop
+ self.up_factor = up_factor
+ self.hr_factor = hr_factor
+ self.keep_mode = keep_mode
+
+ transforms = list()
+
+ if self.size is not None and self.size > 0:
+ rescaler = albumentations.SmallestMaxSize(max_size = self.size)
+ self.rescaler = rescaler
+ transforms.append(rescaler)
+
+ if self.crop_size is not None and self.crop_size > 0:
+ if len(transforms) == 0:
+ self.rescaler = albumentations.SmallestMaxSize(max_size = self.crop_size)
+
+ if not self.random_crop:
+ cropper = albumentations.CenterCrop(height=self.crop_size,width=self.crop_size)
+ else:
+ cropper = albumentations.RandomCrop(height=self.crop_size,width=self.crop_size)
+ transforms.append(cropper)
+
+ if len(transforms) > 0:
+ if self.up_factor is not None:
+ additional_targets = {"lr": "image"}
+ else:
+ additional_targets = None
+ self.preprocessor = albumentations.Compose(transforms,
+ additional_targets=additional_targets)
+ else:
+ self.preprocessor = lambda **kwargs: kwargs
+
+ def __len__(self):
+ return len(self.base)
+
+ def __getitem__(self, i):
+ example = self.base[i]
+ image = example["image"]
+ # adjust resolution
+ image = imscale(image, self.hr_factor, keepshapes=False)
+ h,w,c = image.shape
+ if self.crop_size and min(h,w) < self.crop_size:
+ # have to upscale to be able to crop - this just uses bilinear
+ image = self.rescaler(image=image)["image"]
+ if self.up_factor is None:
+ image = self.preprocessor(image=image)["image"]
+ example["image"] = image
+ else:
+ lr = imscale(image, self.up_factor, keepshapes=True,
+ keepmode=self.keep_mode)
+
+ out = self.preprocessor(image=image, lr=lr)
+ example["image"] = out["image"]
+ example["lr"] = out["lr"]
+
+ return example
+
+class ImageNetScaleTrain(ImageNetScale):
+ def __init__(self, random_crop=True, **kwargs):
+ super().__init__(random_crop=random_crop, **kwargs)
+
+ def get_base(self):
+ return ImageNetTrain()
+
+class ImageNetScaleValidation(ImageNetScale):
+ def get_base(self):
+ return ImageNetValidation()
+
+
+from skimage.feature import canny
+from skimage.color import rgb2gray
+
+
+class ImageNetEdges(ImageNetScale):
+ def __init__(self, up_factor=1, **kwargs):
+        super().__init__(up_factor=up_factor, **kwargs)
+
+ def __getitem__(self, i):
+ example = self.base[i]
+ image = example["image"]
+ h,w,c = image.shape
+ if self.crop_size and min(h,w) < self.crop_size:
+ # have to upscale to be able to crop - this just uses bilinear
+ image = self.rescaler(image=image)["image"]
+
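+        # the "lr" conditioning is a Canny edge map (sigma=2) replicated to 3 channels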
+ lr = canny(rgb2gray(image), sigma=2)
+ lr = lr.astype(np.float32)
+ lr = lr[:,:,None][:,:,[0,0,0]]
+
+ out = self.preprocessor(image=image, lr=lr)
+ example["image"] = out["image"]
+ example["lr"] = out["lr"]
+
+ return example
+
+
+class ImageNetEdgesTrain(ImageNetEdges):
+ def __init__(self, random_crop=True, **kwargs):
+ super().__init__(random_crop=random_crop, **kwargs)
+
+ def get_base(self):
+ return ImageNetTrain()
+
+class ImageNetEdgesValidation(ImageNetEdges):
+ def get_base(self):
+ return ImageNetValidation()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/open_images_helper.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/open_images_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba023bb416d7e59f62d24bc4a2e7dbe54061fa17
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/open_images_helper.py
@@ -0,0 +1,379 @@
+open_images_unify_categories_for_coco = {
+ '/m/03bt1vf': '/m/01g317',
+ '/m/04yx4': '/m/01g317',
+ '/m/05r655': '/m/01g317',
+ '/m/01bl7v': '/m/01g317',
+ '/m/0cnyhnx': '/m/01xq0k1',
+ '/m/01226z': '/m/018xm',
+ '/m/05ctyq': '/m/018xm',
+ '/m/058qzx': '/m/04ctx',
+ '/m/06pcq': '/m/0l515',
+ '/m/03m3pdh': '/m/02crq1',
+ '/m/046dlr': '/m/01x3z',
+ '/m/0h8mzrc': '/m/01x3z',
+}
+
+
+top_300_classes_plus_coco_compatibility = [
+ ('Man', 1060962),
+ ('Clothing', 986610),
+ ('Tree', 748162),
+ ('Woman', 611896),
+ ('Person', 610294),
+ ('Human face', 442948),
+ ('Girl', 175399),
+ ('Building', 162147),
+ ('Car', 159135),
+ ('Plant', 155704),
+ ('Human body', 137073),
+ ('Flower', 133128),
+ ('Window', 127485),
+ ('Human arm', 118380),
+ ('House', 114365),
+ ('Wheel', 111684),
+ ('Suit', 99054),
+ ('Human hair', 98089),
+ ('Human head', 92763),
+ ('Chair', 88624),
+ ('Boy', 79849),
+ ('Table', 73699),
+ ('Jeans', 57200),
+ ('Tire', 55725),
+ ('Skyscraper', 53321),
+ ('Food', 52400),
+ ('Footwear', 50335),
+ ('Dress', 50236),
+ ('Human leg', 47124),
+ ('Toy', 46636),
+ ('Tower', 45605),
+ ('Boat', 43486),
+ ('Land vehicle', 40541),
+ ('Bicycle wheel', 34646),
+ ('Palm tree', 33729),
+ ('Fashion accessory', 32914),
+ ('Glasses', 31940),
+ ('Bicycle', 31409),
+ ('Furniture', 30656),
+ ('Sculpture', 29643),
+ ('Bottle', 27558),
+ ('Dog', 26980),
+ ('Snack', 26796),
+ ('Human hand', 26664),
+ ('Bird', 25791),
+ ('Book', 25415),
+ ('Guitar', 24386),
+ ('Jacket', 23998),
+ ('Poster', 22192),
+ ('Dessert', 21284),
+ ('Baked goods', 20657),
+ ('Drink', 19754),
+ ('Flag', 18588),
+ ('Houseplant', 18205),
+ ('Tableware', 17613),
+ ('Airplane', 17218),
+ ('Door', 17195),
+ ('Sports uniform', 17068),
+ ('Shelf', 16865),
+ ('Drum', 16612),
+ ('Vehicle', 16542),
+ ('Microphone', 15269),
+ ('Street light', 14957),
+ ('Cat', 14879),
+ ('Fruit', 13684),
+ ('Fast food', 13536),
+ ('Animal', 12932),
+ ('Vegetable', 12534),
+ ('Train', 12358),
+ ('Horse', 11948),
+ ('Flowerpot', 11728),
+ ('Motorcycle', 11621),
+ ('Fish', 11517),
+ ('Desk', 11405),
+ ('Helmet', 10996),
+ ('Truck', 10915),
+ ('Bus', 10695),
+ ('Hat', 10532),
+ ('Auto part', 10488),
+ ('Musical instrument', 10303),
+ ('Sunglasses', 10207),
+ ('Picture frame', 10096),
+ ('Sports equipment', 10015),
+ ('Shorts', 9999),
+ ('Wine glass', 9632),
+ ('Duck', 9242),
+ ('Wine', 9032),
+ ('Rose', 8781),
+ ('Tie', 8693),
+ ('Butterfly', 8436),
+ ('Beer', 7978),
+ ('Cabinetry', 7956),
+ ('Laptop', 7907),
+ ('Insect', 7497),
+ ('Goggles', 7363),
+ ('Shirt', 7098),
+ ('Dairy Product', 7021),
+ ('Marine invertebrates', 7014),
+ ('Cattle', 7006),
+ ('Trousers', 6903),
+ ('Van', 6843),
+ ('Billboard', 6777),
+ ('Balloon', 6367),
+ ('Human nose', 6103),
+ ('Tent', 6073),
+ ('Camera', 6014),
+ ('Doll', 6002),
+ ('Coat', 5951),
+ ('Mobile phone', 5758),
+ ('Swimwear', 5729),
+ ('Strawberry', 5691),
+ ('Stairs', 5643),
+ ('Goose', 5599),
+ ('Umbrella', 5536),
+ ('Cake', 5508),
+ ('Sun hat', 5475),
+ ('Bench', 5310),
+ ('Bookcase', 5163),
+ ('Bee', 5140),
+ ('Computer monitor', 5078),
+ ('Hiking equipment', 4983),
+ ('Office building', 4981),
+ ('Coffee cup', 4748),
+ ('Curtain', 4685),
+ ('Plate', 4651),
+ ('Box', 4621),
+ ('Tomato', 4595),
+ ('Coffee table', 4529),
+ ('Office supplies', 4473),
+ ('Maple', 4416),
+ ('Muffin', 4365),
+ ('Cocktail', 4234),
+ ('Castle', 4197),
+ ('Couch', 4134),
+ ('Pumpkin', 3983),
+ ('Computer keyboard', 3960),
+ ('Human mouth', 3926),
+ ('Christmas tree', 3893),
+ ('Mushroom', 3883),
+ ('Swimming pool', 3809),
+ ('Pastry', 3799),
+ ('Lavender (Plant)', 3769),
+ ('Football helmet', 3732),
+ ('Bread', 3648),
+ ('Traffic sign', 3628),
+ ('Common sunflower', 3597),
+ ('Television', 3550),
+ ('Bed', 3525),
+ ('Cookie', 3485),
+ ('Fountain', 3484),
+ ('Paddle', 3447),
+ ('Bicycle helmet', 3429),
+ ('Porch', 3420),
+ ('Deer', 3387),
+ ('Fedora', 3339),
+ ('Canoe', 3338),
+ ('Carnivore', 3266),
+ ('Bowl', 3202),
+ ('Human eye', 3166),
+ ('Ball', 3118),
+ ('Pillow', 3077),
+ ('Salad', 3061),
+ ('Beetle', 3060),
+ ('Orange', 3050),
+ ('Drawer', 2958),
+ ('Platter', 2937),
+ ('Elephant', 2921),
+ ('Seafood', 2921),
+ ('Monkey', 2915),
+ ('Countertop', 2879),
+ ('Watercraft', 2831),
+ ('Helicopter', 2805),
+ ('Kitchen appliance', 2797),
+ ('Personal flotation device', 2781),
+ ('Swan', 2739),
+ ('Lamp', 2711),
+ ('Boot', 2695),
+ ('Bronze sculpture', 2693),
+ ('Chicken', 2677),
+ ('Taxi', 2643),
+ ('Juice', 2615),
+ ('Cowboy hat', 2604),
+ ('Apple', 2600),
+ ('Tin can', 2590),
+ ('Necklace', 2564),
+ ('Ice cream', 2560),
+ ('Human beard', 2539),
+ ('Coin', 2536),
+ ('Candle', 2515),
+ ('Cart', 2512),
+ ('High heels', 2441),
+ ('Weapon', 2433),
+ ('Handbag', 2406),
+ ('Penguin', 2396),
+ ('Rifle', 2352),
+ ('Violin', 2336),
+ ('Skull', 2304),
+ ('Lantern', 2285),
+ ('Scarf', 2269),
+ ('Saucer', 2225),
+ ('Sheep', 2215),
+ ('Vase', 2189),
+ ('Lily', 2180),
+ ('Mug', 2154),
+ ('Parrot', 2140),
+ ('Human ear', 2137),
+ ('Sandal', 2115),
+ ('Lizard', 2100),
+ ('Kitchen & dining room table', 2063),
+ ('Spider', 1977),
+ ('Coffee', 1974),
+ ('Goat', 1926),
+ ('Squirrel', 1922),
+ ('Cello', 1913),
+ ('Sushi', 1881),
+ ('Tortoise', 1876),
+ ('Pizza', 1870),
+ ('Studio couch', 1864),
+ ('Barrel', 1862),
+ ('Cosmetics', 1841),
+ ('Moths and butterflies', 1841),
+ ('Convenience store', 1817),
+ ('Watch', 1792),
+ ('Home appliance', 1786),
+ ('Harbor seal', 1780),
+ ('Luggage and bags', 1756),
+ ('Vehicle registration plate', 1754),
+ ('Shrimp', 1751),
+ ('Jellyfish', 1730),
+ ('French fries', 1723),
+ ('Egg (Food)', 1698),
+ ('Football', 1697),
+ ('Musical keyboard', 1683),
+ ('Falcon', 1674),
+ ('Candy', 1660),
+ ('Medical equipment', 1654),
+ ('Eagle', 1651),
+ ('Dinosaur', 1634),
+ ('Surfboard', 1630),
+ ('Tank', 1628),
+ ('Grape', 1624),
+ ('Lion', 1624),
+ ('Owl', 1622),
+ ('Ski', 1613),
+ ('Waste container', 1606),
+ ('Frog', 1591),
+ ('Sparrow', 1585),
+ ('Rabbit', 1581),
+ ('Pen', 1546),
+ ('Sea lion', 1537),
+ ('Spoon', 1521),
+ ('Sink', 1512),
+ ('Teddy bear', 1507),
+ ('Bull', 1495),
+ ('Sofa bed', 1490),
+ ('Dragonfly', 1479),
+ ('Brassiere', 1478),
+ ('Chest of drawers', 1472),
+ ('Aircraft', 1466),
+ ('Human foot', 1463),
+ ('Pig', 1455),
+ ('Fork', 1454),
+ ('Antelope', 1438),
+ ('Tripod', 1427),
+ ('Tool', 1424),
+ ('Cheese', 1422),
+ ('Lemon', 1397),
+ ('Hamburger', 1393),
+ ('Dolphin', 1390),
+ ('Mirror', 1390),
+ ('Marine mammal', 1387),
+ ('Giraffe', 1385),
+ ('Snake', 1368),
+ ('Gondola', 1364),
+ ('Wheelchair', 1360),
+ ('Piano', 1358),
+ ('Cupboard', 1348),
+ ('Banana', 1345),
+ ('Trumpet', 1335),
+ ('Lighthouse', 1333),
+ ('Invertebrate', 1317),
+ ('Carrot', 1268),
+ ('Sock', 1260),
+ ('Tiger', 1241),
+ ('Camel', 1224),
+ ('Parachute', 1224),
+ ('Bathroom accessory', 1223),
+ ('Earrings', 1221),
+ ('Headphones', 1218),
+ ('Skirt', 1198),
+ ('Skateboard', 1190),
+ ('Sandwich', 1148),
+ ('Saxophone', 1141),
+ ('Goldfish', 1136),
+ ('Stool', 1104),
+ ('Traffic light', 1097),
+ ('Shellfish', 1081),
+ ('Backpack', 1079),
+ ('Sea turtle', 1078),
+ ('Cucumber', 1075),
+ ('Tea', 1051),
+ ('Toilet', 1047),
+ ('Roller skates', 1040),
+ ('Mule', 1039),
+ ('Bust', 1031),
+ ('Broccoli', 1030),
+ ('Crab', 1020),
+ ('Oyster', 1019),
+ ('Cannon', 1012),
+ ('Zebra', 1012),
+ ('French horn', 1008),
+ ('Grapefruit', 998),
+ ('Whiteboard', 997),
+ ('Zucchini', 997),
+ ('Crocodile', 992),
+
+ ('Clock', 960),
+ ('Wall clock', 958),
+
+ ('Doughnut', 869),
+ ('Snail', 868),
+
+ ('Baseball glove', 859),
+
+ ('Panda', 830),
+ ('Tennis racket', 830),
+
+ ('Pear', 652),
+
+ ('Bagel', 617),
+ ('Oven', 616),
+ ('Ladybug', 615),
+ ('Shark', 615),
+ ('Polar bear', 614),
+ ('Ostrich', 609),
+
+ ('Hot dog', 473),
+ ('Microwave oven', 467),
+ ('Fire hydrant', 20),
+ ('Stop sign', 20),
+ ('Parking meter', 20),
+ ('Bear', 20),
+ ('Flying disc', 20),
+ ('Snowboard', 20),
+ ('Tennis ball', 20),
+ ('Kite', 20),
+ ('Baseball bat', 20),
+ ('Kitchen knife', 20),
+ ('Knife', 20),
+ ('Submarine sandwich', 20),
+ ('Computer mouse', 20),
+ ('Remote control', 20),
+ ('Toaster', 20),
+ ('Sink', 20),
+ ('Refrigerator', 20),
+ ('Alarm clock', 20),
+ ('Wall clock', 20),
+ ('Scissors', 20),
+ ('Hair dryer', 20),
+ ('Toothbrush', 20),
+ ('Suitcase', 20)
+]
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/sflckr.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/sflckr.py
new file mode 100644
index 0000000000000000000000000000000000000000..282364e26ed3de3317f30f95fce4d4b3e3bae055
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/sflckr.py
@@ -0,0 +1,91 @@
+import os
+import numpy as np
+import cv2
+import custom_albumentations as albumentations
+from PIL import Image
+from torch.utils.data import Dataset
+
+
+class SegmentationBase(Dataset):
+ def __init__(self,
+ data_csv, data_root, segmentation_root,
+ size=None, random_crop=False, interpolation="bicubic",
+ n_labels=182, shift_segmentation=False,
+ ):
+ self.n_labels = n_labels
+ self.shift_segmentation = shift_segmentation
+ self.data_csv = data_csv
+ self.data_root = data_root
+ self.segmentation_root = segmentation_root
+ with open(self.data_csv, "r") as f:
+ self.image_paths = f.read().splitlines()
+ self._length = len(self.image_paths)
+ self.labels = {
+ "relative_file_path_": [l for l in self.image_paths],
+ "file_path_": [os.path.join(self.data_root, l)
+ for l in self.image_paths],
+ "segmentation_path_": [os.path.join(self.segmentation_root, l.replace(".jpg", ".png"))
+ for l in self.image_paths]
+ }
+
+ size = None if size is not None and size<=0 else size
+ self.size = size
+ if self.size is not None:
+ self.interpolation = interpolation
+ self.interpolation = {
+ "nearest": cv2.INTER_NEAREST,
+ "bilinear": cv2.INTER_LINEAR,
+ "bicubic": cv2.INTER_CUBIC,
+ "area": cv2.INTER_AREA,
+ "lanczos": cv2.INTER_LANCZOS4}[self.interpolation]
+ self.image_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
+ interpolation=self.interpolation)
+ self.segmentation_rescaler = albumentations.SmallestMaxSize(max_size=self.size,
+ interpolation=cv2.INTER_NEAREST)
+ self.center_crop = not random_crop
+ if self.center_crop:
+ self.cropper = albumentations.CenterCrop(height=self.size, width=self.size)
+ else:
+ self.cropper = albumentations.RandomCrop(height=self.size, width=self.size)
+ self.preprocessor = self.cropper
+
+ def __len__(self):
+ return self._length
+
+ def __getitem__(self, i):
+ example = dict((k, self.labels[k][i]) for k in self.labels)
+ image = Image.open(example["file_path_"])
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+ image = np.array(image).astype(np.uint8)
+ if self.size is not None:
+ image = self.image_rescaler(image=image)["image"]
+ segmentation = Image.open(example["segmentation_path_"])
+ assert segmentation.mode == "L", segmentation.mode
+ segmentation = np.array(segmentation).astype(np.uint8)
+ if self.shift_segmentation:
+ # used to support segmentations containing unlabeled==255 label
+ segmentation = segmentation+1
+ if self.size is not None:
+ segmentation = self.segmentation_rescaler(image=segmentation)["image"]
+ if self.size is not None:
+ processed = self.preprocessor(image=image,
+ mask=segmentation
+ )
+ else:
+ processed = {"image": image,
+ "mask": segmentation
+ }
+ example["image"] = (processed["image"]/127.5 - 1.0).astype(np.float32)
+ segmentation = processed["mask"]
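+        # one-hot encode the (optionally shifted) label map into n_labels channels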
+ onehot = np.eye(self.n_labels)[segmentation]
+ example["segmentation"] = onehot
+ return example
+
+
+class Examples(SegmentationBase):
+ def __init__(self, size=None, random_crop=False, interpolation="bicubic"):
+ super().__init__(data_csv="data/sflckr_examples.txt",
+ data_root="data/sflckr_images",
+ segmentation_root="data/sflckr_segmentations",
+ size=size, random_crop=random_crop, interpolation=interpolation)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..36407f1bda26b9a5314690a3160d0389a1d59f68
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/data/utils.py
@@ -0,0 +1,169 @@
+import collections
+import os
+import tarfile
+import urllib
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import torch
+from custom_controlnet_aux.diffusion_edge.taming.data.helper_types import Annotation
+try:
+    from torch._six import string_classes
+except ImportError:  # torch._six was removed in PyTorch 2.0
+    string_classes = (str, bytes)
+from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format
+from tqdm import tqdm
+
+
+def unpack(path):
+ if path.endswith("tar.gz"):
+ with tarfile.open(path, "r:gz") as tar:
+ tar.extractall(path=os.path.split(path)[0])
+ elif path.endswith("tar"):
+ with tarfile.open(path, "r:") as tar:
+ tar.extractall(path=os.path.split(path)[0])
+ elif path.endswith("zip"):
+ with zipfile.ZipFile(path, "r") as f:
+ f.extractall(path=os.path.split(path)[0])
+ else:
+ raise NotImplementedError(
+ "Unknown file extension: {}".format(os.path.splitext(path)[1])
+ )
+
+
+def reporthook(bar):
+ """tqdm progress bar for downloads."""
+
+ def hook(b=1, bsize=1, tsize=None):
+ if tsize is not None:
+ bar.total = tsize
+ bar.update(b * bsize - bar.n)
+
+ return hook
+
+
+def get_root(name):
+ base = "data/"
+ root = os.path.join(base, name)
+ os.makedirs(root, exist_ok=True)
+ return root
+
+
+def is_prepared(root):
+ return Path(root).joinpath(".ready").exists()
+
+
+def mark_prepared(root):
+ Path(root).joinpath(".ready").touch()
+
+
+def prompt_download(file_, source, target_dir, content_dir=None):
+ targetpath = os.path.join(target_dir, file_)
+ while not os.path.exists(targetpath):
+ if content_dir is not None and os.path.exists(
+ os.path.join(target_dir, content_dir)
+ ):
+ break
+ print(
+ "Please download '{}' from '{}' to '{}'.".format(file_, source, targetpath)
+ )
+ if content_dir is not None:
+ print(
+ "Or place its content into '{}'.".format(
+ os.path.join(target_dir, content_dir)
+ )
+ )
+ input("Press Enter when done...")
+ return targetpath
+
+
+def download_url(file_, url, target_dir):
+ targetpath = os.path.join(target_dir, file_)
+ os.makedirs(target_dir, exist_ok=True)
+ with tqdm(
+ unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=file_
+ ) as bar:
+ urllib.request.urlretrieve(url, targetpath, reporthook=reporthook(bar))
+ return targetpath
+
+
+def download_urls(urls, target_dir):
+ paths = dict()
+ for fname, url in urls.items():
+ outpath = download_url(fname, url, target_dir)
+ paths[fname] = outpath
+ return paths
+
+
+def quadratic_crop(x, bbox, alpha=1.0):
+ """bbox is xmin, ymin, xmax, ymax"""
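+    # crops a square of side alpha * max(bbox_width, bbox_height) centered on the
+    # bbox, reflect-padding the image first if the square extends beyond its borders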
+ im_h, im_w = x.shape[:2]
+ bbox = np.array(bbox, dtype=np.float32)
+ bbox = np.clip(bbox, 0, max(im_h, im_w))
+ center = 0.5 * (bbox[0] + bbox[2]), 0.5 * (bbox[1] + bbox[3])
+ w = bbox[2] - bbox[0]
+ h = bbox[3] - bbox[1]
+ l = int(alpha * max(w, h))
+ l = max(l, 2)
+
+ required_padding = -1 * min(
+ center[0] - l, center[1] - l, im_w - (center[0] + l), im_h - (center[1] + l)
+ )
+ required_padding = int(np.ceil(required_padding))
+ if required_padding > 0:
+ padding = [
+ [required_padding, required_padding],
+ [required_padding, required_padding],
+ ]
+ padding += [[0, 0]] * (len(x.shape) - 2)
+ x = np.pad(x, padding, "reflect")
+ center = center[0] + required_padding, center[1] + required_padding
+ xmin = int(center[0] - l / 2)
+ ymin = int(center[1] - l / 2)
+ return np.array(x[ymin : ymin + l, xmin : xmin + l, ...])
+
+
+def custom_collate(batch):
+ r"""source: pytorch 1.9.0, only one modification to original code """
+
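+    # the one modification: a Sequence of Annotation objects is returned as-is (see the branches marked "# added")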
+ elem = batch[0]
+ elem_type = type(elem)
+ if isinstance(elem, torch.Tensor):
+ out = None
+ if torch.utils.data.get_worker_info() is not None:
+ # If we're in a background process, concatenate directly into a
+ # shared memory tensor to avoid an extra copy
+ numel = sum([x.numel() for x in batch])
+ storage = elem.storage()._new_shared(numel)
+ out = elem.new(storage)
+ return torch.stack(batch, 0, out=out)
+ elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
+ and elem_type.__name__ != 'string_':
+ if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+ # array of string classes and object
+ if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+ raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+
+ return custom_collate([torch.as_tensor(b) for b in batch])
+ elif elem.shape == (): # scalars
+ return torch.as_tensor(batch)
+ elif isinstance(elem, float):
+ return torch.tensor(batch, dtype=torch.float64)
+ elif isinstance(elem, int):
+ return torch.tensor(batch)
+ elif isinstance(elem, string_classes):
+ return batch
+ elif isinstance(elem, collections.abc.Mapping):
+ return {key: custom_collate([d[key] for d in batch]) for key in elem}
+ elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
+ return elem_type(*(custom_collate(samples) for samples in zip(*batch)))
+ if isinstance(elem, collections.abc.Sequence) and isinstance(elem[0], Annotation): # added
+ return batch # added
+ elif isinstance(elem, collections.abc.Sequence):
+ # check to make sure that the elements in batch have consistent size
+ it = iter(batch)
+ elem_size = len(next(it))
+ if not all(len(elem) == elem_size for elem in it):
+ raise RuntimeError('each element in list of batch should be of equal size')
+ transposed = zip(*batch)
+ return [custom_collate(samples) for samples in transposed]
+
+ raise TypeError(default_collate_err_msg_format.format(elem_type))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/autoencoder/lpips/vgg.pth b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/autoencoder/lpips/vgg.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f57dcf5cc764d61c8a460365847fb2137ff0a62d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/autoencoder/lpips/vgg.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a78928a0af1e5f0fcb1f3b9e8f8c3a2a5a3de244d830ad5c1feddc79b8432868
+size 7289
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/diffusionmodules/model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/diffusionmodules/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f142487e5e12cdf6fde616146a271f0d6eb67f63
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/diffusionmodules/model.py
@@ -0,0 +1,776 @@
+# pytorch_diffusion + derived encoder decoder
+import math
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+def get_timestep_embedding(timesteps, embedding_dim):
+    """
+    Build sinusoidal timestep embeddings, as used in Denoising Diffusion
+    Probabilistic Models (ported from the Fairseq implementation). This matches
+    the tensor2tensor implementation, but differs slightly from the description
+    in Section 3.5 of "Attention Is All You Need".
+    """
+ assert len(timesteps.shape) == 1
+
+ half_dim = embedding_dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
+ emb = emb.to(device=timesteps.device)
+ emb = timesteps.float()[:, None] * emb[None, :]
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+ if embedding_dim % 2 == 1: # zero pad
+ emb = torch.nn.functional.pad(emb, (0,1,0,0))
+ return emb
+
+
+def nonlinearity(x):
+ # swish
+ return x*torch.sigmoid(x)
+
+
+def Normalize(in_channels):
+ return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class Upsample(nn.Module):
+ def __init__(self, in_channels, with_conv):
+ super().__init__()
+ self.with_conv = with_conv
+ if self.with_conv:
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+ if self.with_conv:
+ x = self.conv(x)
+ return x
+
+
+class Downsample(nn.Module):
+ def __init__(self, in_channels, with_conv):
+ super().__init__()
+ self.with_conv = with_conv
+ if self.with_conv:
+ # no asymmetric padding in torch conv, must do it ourselves
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=2,
+ padding=0)
+
+ def forward(self, x):
+ if self.with_conv:
+ pad = (0,1,0,1)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ else:
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+ return x
+
+
+class ResnetBlock(nn.Module):
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+ dropout, temb_channels=512):
+ super().__init__()
+ self.in_channels = in_channels
+ out_channels = in_channels if out_channels is None else out_channels
+ self.out_channels = out_channels
+ self.use_conv_shortcut = conv_shortcut
+
+ self.norm1 = Normalize(in_channels)
+ self.conv1 = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ if temb_channels > 0:
+ self.temb_proj = torch.nn.Linear(temb_channels,
+ out_channels)
+ self.norm2 = Normalize(out_channels)
+ self.dropout = torch.nn.Dropout(dropout)
+ self.conv2 = torch.nn.Conv2d(out_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ else:
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ def forward(self, x, temb):
+ h = x
+ h = self.norm1(h)
+ h = nonlinearity(h)
+ h = self.conv1(h)
+
+ if temb is not None:
+ h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+
+ h = self.norm2(h)
+ h = nonlinearity(h)
+ h = self.dropout(h)
+ h = self.conv2(h)
+
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ x = self.conv_shortcut(x)
+ else:
+ x = self.nin_shortcut(x)
+
+ return x+h
+
+
+class AttnBlock(nn.Module):
+ def __init__(self, in_channels):
+ super().__init__()
+ self.in_channels = in_channels
+
+ self.norm = Normalize(in_channels)
+ self.q = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.k = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.v = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.proj_out = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+
+ def forward(self, x):
+ h_ = x
+ h_ = self.norm(h_)
+ q = self.q(h_)
+ k = self.k(h_)
+ v = self.v(h_)
+
+ # compute attention
+ b,c,h,w = q.shape
+ q = q.reshape(b,c,h*w)
+ q = q.permute(0,2,1) # b,hw,c
+ k = k.reshape(b,c,h*w) # b,c,hw
+ w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+ w_ = w_ * (int(c)**(-0.5))
+ w_ = torch.nn.functional.softmax(w_, dim=2)
+
+ # attend to values
+ v = v.reshape(b,c,h*w)
+ w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
+ h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+ h_ = h_.reshape(b,c,h,w)
+
+ h_ = self.proj_out(h_)
+
+ return x+h_
+
+
+class Model(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, use_timestep=True):
+ super().__init__()
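+        # DDPM-style UNet: conv_in, a stack of downsampling stages (ResnetBlocks with
+        # optional attention), a middle block, and mirrored upsampling stages that
+        # consume the skip connections collected on the way down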
+ self.ch = ch
+ self.temb_ch = self.ch*4
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+
+ self.use_timestep = use_timestep
+ if self.use_timestep:
+ # timestep embedding
+ self.temb = nn.Module()
+ self.temb.dense = nn.ModuleList([
+ torch.nn.Linear(self.ch,
+ self.temb_ch),
+ torch.nn.Linear(self.temb_ch,
+ self.temb_ch),
+ ])
+
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(in_channels,
+ self.ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ curr_res = resolution
+ in_ch_mult = (1,)+tuple(ch_mult)
+ self.down = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(AttnBlock(block_in))
+ down = nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions-1:
+ down.downsample = Downsample(block_in, resamp_with_conv)
+ curr_res = curr_res // 2
+ self.down.append(down)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = AttnBlock(block_in)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # upsampling
+ self.up = nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_out = ch*ch_mult[i_level]
+ skip_in = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks+1):
+ if i_block == self.num_res_blocks:
+ skip_in = ch*in_ch_mult[i_level]
+ block.append(ResnetBlock(in_channels=block_in+skip_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(AttnBlock(block_in))
+ up = nn.Module()
+ up.block = block
+ up.attn = attn
+ if i_level != 0:
+ up.upsample = Upsample(block_in, resamp_with_conv)
+ curr_res = curr_res * 2
+ self.up.insert(0, up) # prepend to get consistent order
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+
+ def forward(self, x, t=None):
+ #assert x.shape[2] == x.shape[3] == self.resolution
+
+ if self.use_timestep:
+ # timestep embedding
+ assert t is not None
+ temb = get_timestep_embedding(t, self.ch)
+ temb = self.temb.dense[0](temb)
+ temb = nonlinearity(temb)
+ temb = self.temb.dense[1](temb)
+ else:
+ temb = None
+
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1], temb)
+ if len(self.down[i_level].attn) > 0:
+ h = self.down[i_level].attn[i_block](h)
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+
+ # middle
+ h = hs[-1]
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h = self.up[i_level].block[i_block](
+ torch.cat([h, hs.pop()], dim=1), temb)
+ if len(self.up[i_level].attn) > 0:
+ h = self.up[i_level].attn[i_block](h)
+ if i_level != 0:
+ h = self.up[i_level].upsample(h)
+
+ # end
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+
+class Encoder(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, z_channels, double_z=True, **ignore_kwargs):
+ super().__init__()
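+        # same downsampling trunk as Model, but without timestep embeddings; maps an
+        # image to a latent with z_channels (or 2*z_channels when double_z=True)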
+ self.ch = ch
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(in_channels,
+ self.ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ curr_res = resolution
+ in_ch_mult = (1,)+tuple(ch_mult)
+ self.down = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(AttnBlock(block_in))
+ down = nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions-1:
+ down.downsample = Downsample(block_in, resamp_with_conv)
+ curr_res = curr_res // 2
+ self.down.append(down)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = AttnBlock(block_in)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ 2*z_channels if double_z else z_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+
+ def forward(self, x):
+ #assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
+
+ # timestep embedding
+ temb = None
+
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1], temb)
+ if len(self.down[i_level].attn) > 0:
+ h = self.down[i_level].attn[i_block](h)
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+
+ # middle
+ h = hs[-1]
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # end
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+
+class Decoder(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, z_channels, give_pre_end=False, **ignorekwargs):
+ super().__init__()
+ self.ch = ch
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+ self.give_pre_end = give_pre_end
+
+ # compute in_ch_mult, block_in and curr_res at lowest res
+ in_ch_mult = (1,)+tuple(ch_mult)
+ block_in = ch*ch_mult[self.num_resolutions-1]
+ curr_res = resolution // 2**(self.num_resolutions-1)
+ self.z_shape = (1,z_channels,curr_res,curr_res)
+ print("Working with z of shape {} = {} dimensions.".format(
+ self.z_shape, np.prod(self.z_shape)))
+
+ # z to block_in
+ self.conv_in = torch.nn.Conv2d(z_channels,
+ block_in,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = AttnBlock(block_in)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # upsampling
+ self.up = nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks+1):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(AttnBlock(block_in))
+ up = nn.Module()
+ up.block = block
+ up.attn = attn
+ if i_level != 0:
+ up.upsample = Upsample(block_in, resamp_with_conv)
+ curr_res = curr_res * 2
+ self.up.insert(0, up) # prepend to get consistent order
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, z):
+ #assert z.shape[1:] == self.z_shape[1:]
+ self.last_z_shape = z.shape
+
+ # timestep embedding
+ temb = None
+
+ # z to block_in
+ h = self.conv_in(z)
+
+ # middle
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h = self.up[i_level].block[i_block](h, temb)
+ if len(self.up[i_level].attn) > 0:
+ h = self.up[i_level].attn[i_block](h)
+ if i_level != 0:
+ h = self.up[i_level].upsample(h)
+
+ # end
+ if self.give_pre_end:
+ return h
+
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+
+class VUNet(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True,
+ in_channels, c_channels,
+ resolution, z_channels, use_timestep=False, **ignore_kwargs):
+ super().__init__()
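+        # UNet that encodes a conditioning input x and injects a latent z at the
+        # bottleneck (projected by self.z_in and concatenated) before decoding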
+ self.ch = ch
+ self.temb_ch = self.ch*4
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+
+ self.use_timestep = use_timestep
+ if self.use_timestep:
+ # timestep embedding
+ self.temb = nn.Module()
+ self.temb.dense = nn.ModuleList([
+ torch.nn.Linear(self.ch,
+ self.temb_ch),
+ torch.nn.Linear(self.temb_ch,
+ self.temb_ch),
+ ])
+
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(c_channels,
+ self.ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ curr_res = resolution
+ in_ch_mult = (1,)+tuple(ch_mult)
+ self.down = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(AttnBlock(block_in))
+ down = nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions-1:
+ down.downsample = Downsample(block_in, resamp_with_conv)
+ curr_res = curr_res // 2
+ self.down.append(down)
+
+ self.z_in = torch.nn.Conv2d(z_channels,
+ block_in,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=2*block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = AttnBlock(block_in)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # upsampling
+ self.up = nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_out = ch*ch_mult[i_level]
+ skip_in = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks+1):
+ if i_block == self.num_res_blocks:
+ skip_in = ch*in_ch_mult[i_level]
+ block.append(ResnetBlock(in_channels=block_in+skip_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(AttnBlock(block_in))
+ up = nn.Module()
+ up.block = block
+ up.attn = attn
+ if i_level != 0:
+ up.upsample = Upsample(block_in, resamp_with_conv)
+ curr_res = curr_res * 2
+ self.up.insert(0, up) # prepend to get consistent order
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+
+    def forward(self, x, z, t=None):
+ #assert x.shape[2] == x.shape[3] == self.resolution
+
+ if self.use_timestep:
+ # timestep embedding
+ assert t is not None
+ temb = get_timestep_embedding(t, self.ch)
+ temb = self.temb.dense[0](temb)
+ temb = nonlinearity(temb)
+ temb = self.temb.dense[1](temb)
+ else:
+ temb = None
+
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1], temb)
+ if len(self.down[i_level].attn) > 0:
+ h = self.down[i_level].attn[i_block](h)
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+
+ # middle
+ h = hs[-1]
+ z = self.z_in(z)
+ h = torch.cat((h,z),dim=1)
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h = self.up[i_level].block[i_block](
+ torch.cat([h, hs.pop()], dim=1), temb)
+ if len(self.up[i_level].attn) > 0:
+ h = self.up[i_level].attn[i_block](h)
+ if i_level != 0:
+ h = self.up[i_level].upsample(h)
+
+ # end
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
+
+class SimpleDecoder(nn.Module):
+ def __init__(self, in_channels, out_channels, *args, **kwargs):
+ super().__init__()
+ self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
+ ResnetBlock(in_channels=in_channels,
+ out_channels=2 * in_channels,
+ temb_channels=0, dropout=0.0),
+ ResnetBlock(in_channels=2 * in_channels,
+ out_channels=4 * in_channels,
+ temb_channels=0, dropout=0.0),
+ ResnetBlock(in_channels=4 * in_channels,
+ out_channels=2 * in_channels,
+ temb_channels=0, dropout=0.0),
+ nn.Conv2d(2*in_channels, in_channels, 1),
+ Upsample(in_channels, with_conv=True)])
+ # end
+ self.norm_out = Normalize(in_channels)
+ self.conv_out = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ for i, layer in enumerate(self.model):
+ if i in [1,2,3]:
+ x = layer(x, None)
+ else:
+ x = layer(x)
+
+ h = self.norm_out(x)
+ h = nonlinearity(h)
+ x = self.conv_out(h)
+ return x
+
+
+class UpsampleDecoder(nn.Module):
+ def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
+ ch_mult=(2,2), dropout=0.0):
+ super().__init__()
+ # upsampling
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ block_in = in_channels
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
+ self.res_blocks = nn.ModuleList()
+ self.upsample_blocks = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ res_block = []
+ block_out = ch * ch_mult[i_level]
+ for i_block in range(self.num_res_blocks + 1):
+ res_block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ self.res_blocks.append(nn.ModuleList(res_block))
+ if i_level != self.num_resolutions - 1:
+ self.upsample_blocks.append(Upsample(block_in, True))
+ curr_res = curr_res * 2
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ # upsampling
+ h = x
+ for k, i_level in enumerate(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks + 1):
+ h = self.res_blocks[i_level][i_block](h, None)
+ if i_level != self.num_resolutions - 1:
+ h = self.upsample_blocks[k](h)
+ h = self.norm_out(h)
+ h = nonlinearity(h)
+ h = self.conv_out(h)
+ return h
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/discriminator/model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/discriminator/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec41787d774815da1a11f69b76ddccafab4c9b21
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/discriminator/model.py
@@ -0,0 +1,131 @@
+import functools
+import torch.nn as nn
+
+
+from custom_controlnet_aux.diffusion_edge.taming.modules.util import ActNorm
+
+
+def weights_init(m):
+ classname = m.__class__.__name__
+ if classname.find('Conv') != -1:
+ nn.init.normal_(m.weight.data, 0.0, 0.02)
+ elif classname.find('BatchNorm') != -1:
+ nn.init.normal_(m.weight.data, 1.0, 0.02)
+ nn.init.constant_(m.bias.data, 0)
+
+
+class NLayerDiscriminator(nn.Module):
+ """Defines a PatchGAN discriminator as in Pix2Pix
+ --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
+ """
+ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
+ """Construct a PatchGAN discriminator
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ ndf (int) -- the number of filters in the last conv layer
+ n_layers (int) -- the number of conv layers in the discriminator
+ norm_layer -- normalization layer
+ """
+ super(NLayerDiscriminator, self).__init__()
+ if not use_actnorm:
+ norm_layer = nn.BatchNorm2d
+ else:
+ norm_layer = ActNorm
+ if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
+ use_bias = norm_layer.func != nn.BatchNorm2d
+ else:
+ use_bias = norm_layer != nn.BatchNorm2d
+
+ kw = 4
+ padw = 1
+ sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
+ nf_mult = 1
+ nf_mult_prev = 1
+ for n in range(1, n_layers): # gradually increase the number of filters
+ nf_mult_prev = nf_mult
+ nf_mult = min(2 ** n, 8)
+ sequence += [
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
+ norm_layer(ndf * nf_mult),
+ nn.LeakyReLU(0.2, True)
+ ]
+
+ nf_mult_prev = nf_mult
+ nf_mult = min(2 ** n_layers, 8)
+ sequence += [
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
+ norm_layer(ndf * nf_mult),
+ nn.LeakyReLU(0.2, True)
+ ]
+
+ sequence += [
+ nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
+ self.main = nn.Sequential(*sequence)
+
+ def forward(self, input):
+ """Standard forward."""
+ return self.main(input)
+
+class NLayerDiscriminator2(nn.Module):
+    """3D (volumetric) variant of the Pix2Pix PatchGAN discriminator, built from
+    Conv3d/BatchNorm3d layers with grouped convolutions
+        --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
+    """
+ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
+ """Construct a PatchGAN discriminator
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ ndf (int) -- the number of filters in the last conv layer
+ n_layers (int) -- the number of conv layers in the discriminator
+ norm_layer -- normalization layer
+ """
+ super(NLayerDiscriminator2, self).__init__()
+ if not use_actnorm:
+ norm_layer = nn.BatchNorm3d
+ else:
+ norm_layer = ActNorm
+        if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm3d has affine parameters
+ use_bias = norm_layer.func != nn.BatchNorm3d
+ else:
+ use_bias = norm_layer != nn.BatchNorm3d
+
+ kw = 4
+ padw = 1
+ sequence = [nn.Conv3d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
+ nf_mult = 1
+ nf_mult_prev = 1
+ for n in range(1, n_layers): # gradually increase the number of filters
+ nf_mult_prev = nf_mult
+ nf_mult = min(2 ** n, 8)
+ sequence += [
+ nn.Conv3d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2,
+ padding=padw, bias=use_bias, groups=8),
+ norm_layer(ndf * nf_mult),
+ nn.LeakyReLU(0.2, True)
+ ]
+
+ nf_mult_prev = nf_mult
+ nf_mult = min(2 ** n_layers, 8)
+ sequence += [
+ nn.Conv3d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1,
+ padding=padw, bias=use_bias, groups=8),
+ norm_layer(ndf * nf_mult),
+ nn.LeakyReLU(0.2, True)
+ ]
+
+ sequence += [
+ nn.Conv3d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw),
+ # nn.Sigmoid()
+ ] # output 1 channel prediction map
+ self.main = nn.Sequential(*sequence)
+
+ def forward(self, input):
+ """Standard forward."""
+ return self.main(input)
+
+if __name__ == "__main__":
+ import torch
+ model = NLayerDiscriminator2(input_nc=3, ndf=64, n_layers=3)
+ x = torch.rand(1, 3, 64, 64, 64)
+ with torch.no_grad():
+ y = model(x)
+ pause = 0
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..020d5bb9e55e6503a162bb3b8b44640f351d2ff8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/__init__.py
@@ -0,0 +1,2 @@
+from custom_controlnet_aux.diffusion_edge.taming.modules.losses.vqperceptual import DummyLoss
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/lpips.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/lpips.py
new file mode 100644
index 0000000000000000000000000000000000000000..1388605a93ae94a95f734f8cdce0f464896d923d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/lpips.py
@@ -0,0 +1,126 @@
+"""Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
+
+import torch
+import torch.nn as nn
+from torchvision import models
+from collections import namedtuple
+
+from .util import get_ckpt_path
+
+from custom_controlnet_aux.util import custom_torch_download
+
+class LPIPS(nn.Module):
+ # Learned perceptual metric
+ def __init__(self, use_dropout=True):
+ super().__init__()
+ self.scaling_layer = ScalingLayer()
+        self.chns = [64, 128, 256, 512, 512] # vgg16 feature channel widths
+ self.net = vgg16(pretrained=False, requires_grad=False)
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
+ self.load_from_pretrained()
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def load_from_pretrained(self, name="vgg_lpips"):
+ ckpt = get_ckpt_path(name, "taming/modules/autoencoder/lpips")
+ self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
+ print("loaded pretrained LPIPS loss from {}".format(ckpt))
+
+ @classmethod
+ def from_pretrained(cls, name="vgg_lpips"):
+ if name != "vgg_lpips":
+ raise NotImplementedError
+ model = cls()
+ ckpt = get_ckpt_path(name)
+ model.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
+ return model
+
+ def forward(self, input, target):
+ in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
+ outs0, outs1 = self.net(in0_input), self.net(in1_input)
+ feats0, feats1, diffs = {}, {}, {}
+ lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
+ for kk in range(len(self.chns)):
+ feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
+
+ res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
+ val = res[0]
+ for l in range(1, len(self.chns)):
+ val += res[l]
+ return val
+
+
+class ScalingLayer(nn.Module):
+ def __init__(self):
+ super(ScalingLayer, self).__init__()
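+        # fixed per-channel shift/scale constants carried over from the reference LPIPS
+        # implementation, applied to the inputs before the VGG feature extractor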
+ self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
+ self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])
+
+ def forward(self, inp):
+ return (inp - self.shift) / self.scale
+
+
+class NetLinLayer(nn.Module):
+ """ A single linear layer which does a 1x1 conv """
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
+ super(NetLinLayer, self).__init__()
+ layers = [nn.Dropout(), ] if (use_dropout) else []
+ layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
+ self.model = nn.Sequential(*layers)
+
+
+class vgg16(torch.nn.Module):
+ def __init__(self, requires_grad=False, pretrained=False):
+ super(vgg16, self).__init__()
+ vgg16_model = models.vgg16(pretrained=pretrained)
+ vgg16_model.load_state_dict(torch.load(custom_torch_download(filename="vgg16-397923af.pth")), strict=True)
+ vgg_pretrained_features = vgg16_model.features
+ self.slice1 = torch.nn.Sequential()
+ self.slice2 = torch.nn.Sequential()
+ self.slice3 = torch.nn.Sequential()
+ self.slice4 = torch.nn.Sequential()
+ self.slice5 = torch.nn.Sequential()
+ self.N_slices = 5
+ for x in range(4):
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
+ for x in range(4, 9):
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
+ for x in range(9, 16):
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
+ for x in range(16, 23):
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
+ for x in range(23, 30):
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
+ if not requires_grad:
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def forward(self, X):
+ h = self.slice1(X)
+ h_relu1_2 = h
+ h = self.slice2(h)
+ h_relu2_2 = h
+ h = self.slice3(h)
+ h_relu3_3 = h
+ h = self.slice4(h)
+ h_relu4_3 = h
+ h = self.slice5(h)
+ h_relu5_3 = h
+ vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
+ return out
+
+
+def normalize_tensor(x,eps=1e-10):
+ norm_factor = torch.sqrt(torch.sum(x**2,dim=1,keepdim=True))
+ return x/(norm_factor+eps)
+
+
+def spatial_average(x, keepdim=True):
+ return x.mean([2,3],keepdim=keepdim)
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/segmentation.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/segmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..033b73eb50a5662f810f5f013212855da311a3d9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/segmentation.py
@@ -0,0 +1,22 @@
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class BCELoss(nn.Module):
+ def forward(self, prediction, target):
+ loss = F.binary_cross_entropy_with_logits(prediction,target)
+ return loss, {}
+
+
+class BCELossWithQuant(nn.Module):
+ def __init__(self, codebook_weight=1.):
+ super().__init__()
+ self.codebook_weight = codebook_weight
+
+ def forward(self, qloss, target, prediction, split):
+ bce_loss = F.binary_cross_entropy_with_logits(prediction,target)
+ loss = bce_loss + self.codebook_weight*qloss
+ return loss, {"{}/total_loss".format(split): loss.clone().detach().mean(),
+ "{}/bce_loss".format(split): bce_loss.detach().mean(),
+ "{}/quant_loss".format(split): qloss.detach().mean()
+ }
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..64e3a0b782018d38ac5064bad4af3ce4d1024a67
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/util.py
@@ -0,0 +1,157 @@
+import os, hashlib
+import requests
+from tqdm import tqdm
+
+URL_MAP = {
+ "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
+}
+
+CKPT_MAP = {
+ "vgg_lpips": "vgg.pth"
+}
+
+MD5_MAP = {
+ "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
+}
+
+
+def download(url, local_path, chunk_size=1024):
+ os.makedirs(os.path.split(local_path)[0], exist_ok=True)
+ with requests.get(url, stream=True) as r:
+ total_size = int(r.headers.get("content-length", 0))
+ with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
+ with open(local_path, "wb") as f:
+ for data in r.iter_content(chunk_size=chunk_size):
+ if data:
+ f.write(data)
+ pbar.update(chunk_size)
+
+
+def md5_hash(path):
+ with open(path, "rb") as f:
+ content = f.read()
+ return hashlib.md5(content).hexdigest()
+
+
+def get_ckpt_path(name, root, check=False):
+ assert name in URL_MAP
+ path = os.path.join(root, CKPT_MAP[name])
+ if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
+ print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
+ download(URL_MAP[name], path)
+ md5 = md5_hash(path)
+ assert md5 == MD5_MAP[name], md5
+ return path
+
+
+class KeyNotFoundError(Exception):
+ def __init__(self, cause, keys=None, visited=None):
+ self.cause = cause
+ self.keys = keys
+ self.visited = visited
+ messages = list()
+ if keys is not None:
+ messages.append("Key not found: {}".format(keys))
+ if visited is not None:
+ messages.append("Visited: {}".format(visited))
+ messages.append("Cause:\n{}".format(cause))
+ message = "\n".join(messages)
+ super().__init__(message)
+
+
+def retrieve(
+ list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False
+):
+ """Given a nested list or dict return the desired value at key expanding
+ callable nodes if necessary and :attr:`expand` is ``True``. The expansion
+ is done in-place.
+
+ Parameters
+ ----------
+ list_or_dict : list or dict
+ Possibly nested list or dictionary.
+ key : str
+ key/to/value, path like string describing all keys necessary to
+ consider to get to the desired value. List indices can also be
+ passed here.
+ splitval : str
+ String that defines the delimiter between keys of the
+ different depth levels in `key`.
+ default : obj
+ Value returned if :attr:`key` is not found.
+ expand : bool
+ Whether to expand callable nodes on the path or not.
+
+ Returns
+ -------
+ The desired value or if :attr:`default` is not ``None`` and the
+ :attr:`key` is not found returns ``default``.
+
+ Raises
+ ------
+ Exception if ``key`` not in ``list_or_dict`` and :attr:`default` is
+ ``None``.
+ """
+
+ keys = key.split(splitval)
+
+ success = True
+ try:
+ visited = []
+ parent = None
+ last_key = None
+ for key in keys:
+ if callable(list_or_dict):
+ if not expand:
+ raise KeyNotFoundError(
+ ValueError(
+ "Trying to get past callable node with expand=False."
+ ),
+ keys=keys,
+ visited=visited,
+ )
+ list_or_dict = list_or_dict()
+ parent[last_key] = list_or_dict
+
+ last_key = key
+ parent = list_or_dict
+
+ try:
+ if isinstance(list_or_dict, dict):
+ list_or_dict = list_or_dict[key]
+ else:
+ list_or_dict = list_or_dict[int(key)]
+ except (KeyError, IndexError, ValueError) as e:
+ raise KeyNotFoundError(e, keys=keys, visited=visited)
+
+ visited += [key]
+ # final expansion of retrieved value
+ if expand and callable(list_or_dict):
+ list_or_dict = list_or_dict()
+ parent[last_key] = list_or_dict
+ except KeyNotFoundError as e:
+ if default is None:
+ raise e
+ else:
+ list_or_dict = default
+ success = False
+
+ if not pass_success:
+ return list_or_dict
+ else:
+ return list_or_dict, success
+
+
+if __name__ == "__main__":
+ config = {"keya": "a",
+ "keyb": "b",
+ "keyc":
+ {"cc1": 1,
+ "cc2": 2,
+ }
+ }
+ from omegaconf import OmegaConf
+ config = OmegaConf.create(config)
+ print(config)
+ retrieve(config, "keya")
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/vqperceptual.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/vqperceptual.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c25b8a19f85f4d7af222386561210fc74e739f0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/losses/vqperceptual.py
@@ -0,0 +1,136 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_controlnet_aux.diffusion_edge.taming.modules.losses.lpips import LPIPS
+from custom_controlnet_aux.diffusion_edge.taming.modules.discriminator.model import NLayerDiscriminator, weights_init, NLayerDiscriminator2
+
+
+class DummyLoss(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+
+def adopt_weight(weight, global_step, threshold=0, value=0.):
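+    # return `value` (default 0) instead of `weight` until `global_step` reaches `threshold`,
+    # i.e. keep the GAN term switched off during the warm-up phase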
+ if global_step < threshold:
+ weight = value
+ return weight
+
+
+def hinge_d_loss(logits_real, logits_fake):
+ loss_real = torch.mean(F.relu(1. - logits_real))
+ loss_fake = torch.mean(F.relu(1. + logits_fake))
+ d_loss = 0.5 * (loss_real + loss_fake)
+ return d_loss
+
+
+def vanilla_d_loss(logits_real, logits_fake):
+ d_loss = 0.5 * (
+ torch.mean(torch.nn.functional.softplus(-logits_real)) +
+ torch.mean(torch.nn.functional.softplus(logits_fake)))
+ return d_loss
+
+
+class VQLPIPSWithDiscriminator(nn.Module):
+ def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
+ disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
+ perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
+ disc_ndf=64, disc_loss="hinge"):
+ super().__init__()
+ assert disc_loss in ["hinge", "vanilla"]
+ self.codebook_weight = codebook_weight
+ self.pixel_weight = pixelloss_weight
+ self.perceptual_loss = LPIPS().eval()
+ self.perceptual_weight = perceptual_weight
+
+ self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
+ n_layers=disc_num_layers,
+ use_actnorm=use_actnorm,
+ ndf=disc_ndf
+ ).apply(weights_init)
+ self.discriminator_iter_start = disc_start
+ if disc_loss == "hinge":
+ self.disc_loss = hinge_d_loss
+ elif disc_loss == "vanilla":
+ self.disc_loss = vanilla_d_loss
+ else:
+ raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
+ print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
+ self.disc_factor = disc_factor
+ self.discriminator_weight = disc_weight
+ self.disc_conditional = disc_conditional
+
+ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+ if last_layer is not None:
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+ else:
+ nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+
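+        # balance the GAN term against the reconstruction term via the ratio of their
+        # gradient norms at the last decoder layer, then rescale by the configured disc_weight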
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
+ d_weight = d_weight * self.discriminator_weight
+ return d_weight
+
+ def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
+ global_step, last_layer=None, cond=None, split="train"):
+ rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+ if self.perceptual_weight > 0:
+ p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
+ else:
+ p_loss = torch.tensor([0.0])
+
+ nll_loss = rec_loss
+ #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+ nll_loss = torch.mean(nll_loss)
+
+ # now the GAN part
+ if optimizer_idx == 0:
+ # generator update
+ if cond is None:
+ assert not self.disc_conditional
+ logits_fake = self.discriminator(reconstructions.contiguous())
+ else:
+ assert self.disc_conditional
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+ g_loss = -torch.mean(logits_fake)
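+            # (the generator is updated to raise the mean discriminator logit on its reconstructions)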
+
+ try:
+ d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+ except RuntimeError:
+ assert not self.training
+ d_weight = torch.tensor(0.0)
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
+
+ log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
+ "{}/quant_loss".format(split): codebook_loss.detach().mean(),
+ "{}/nll_loss".format(split): nll_loss.detach().mean(),
+ "{}/rec_loss".format(split): rec_loss.detach().mean(),
+ "{}/p_loss".format(split): p_loss.detach().mean(),
+ "{}/d_weight".format(split): d_weight.detach(),
+ "{}/disc_factor".format(split): torch.tensor(disc_factor),
+ "{}/g_loss".format(split): g_loss.detach().mean(),
+ }
+ return loss, log
+
+ if optimizer_idx == 1:
+ # second pass for discriminator update
+ if cond is None:
+ logits_real = self.discriminator(inputs.contiguous().detach())
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
+ else:
+ logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
+
+ log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
+ "{}/logits_real".format(split): logits_real.detach().mean(),
+ "{}/logits_fake".format(split): logits_fake.detach().mean()
+ }
+ return d_loss, log
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/misc/coord.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/misc/coord.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc4d544ca234b463e3d0a2a75e88449828cabd7f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/misc/coord.py
@@ -0,0 +1,31 @@
+import torch
+
+class CoordStage(object):
+ def __init__(self, n_embed, down_factor):
+ self.n_embed = n_embed
+ self.down_factor = down_factor
+
+ def eval(self):
+ return self
+
+ def encode(self, c):
+ """fake vqmodel interface"""
+ assert 0.0 <= c.min() and c.max() <= 1.0
+ b,ch,h,w = c.shape
+ assert ch == 1
+
+ c = torch.nn.functional.interpolate(c, scale_factor=1/self.down_factor,
+ mode="area")
+ c = c.clamp(0.0, 1.0)
+ c = self.n_embed*c
+ c_quant = c.round()
+ c_ind = c_quant.to(dtype=torch.long)
+
+ info = None, None, c_ind
+ return c_quant, None, info
+
+ def decode(self, c):
+ c = c/self.n_embed
+ c = torch.nn.functional.interpolate(c, scale_factor=self.down_factor,
+ mode="nearest")
+ return c
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..49786bb6830d56da3e0a9344a5a4312aa0c438f3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/util.py
@@ -0,0 +1,130 @@
+import torch
+import torch.nn as nn
+
+
+def count_params(model):
+ total_params = sum(p.numel() for p in model.parameters())
+ return total_params
+
+
+class ActNorm(nn.Module):
+ def __init__(self, num_features, logdet=False, affine=True,
+ allow_reverse_init=False):
+ assert affine
+ super().__init__()
+ self.logdet = logdet
+ self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
+ self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
+ self.allow_reverse_init = allow_reverse_init
+
+ self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))
+
+ def initialize(self, input):
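+        # data-dependent initialization: set loc/scale from the first batch so that the
+        # output has roughly zero mean and unit variance per channel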
+ with torch.no_grad():
+ flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
+ mean = (
+ flatten.mean(1)
+ .unsqueeze(1)
+ .unsqueeze(2)
+ .unsqueeze(3)
+ .permute(1, 0, 2, 3)
+ )
+ std = (
+ flatten.std(1)
+ .unsqueeze(1)
+ .unsqueeze(2)
+ .unsqueeze(3)
+ .permute(1, 0, 2, 3)
+ )
+
+ self.loc.data.copy_(-mean)
+ self.scale.data.copy_(1 / (std + 1e-6))
+
+ def forward(self, input, reverse=False):
+ if reverse:
+ return self.reverse(input)
+ if len(input.shape) == 2:
+ input = input[:,:,None,None]
+ squeeze = True
+ else:
+ squeeze = False
+
+ _, _, height, width = input.shape
+
+ if self.training and self.initialized.item() == 0:
+ self.initialize(input)
+ self.initialized.fill_(1)
+
+ h = self.scale * (input + self.loc)
+
+ if squeeze:
+ h = h.squeeze(-1).squeeze(-1)
+
+ if self.logdet:
+ log_abs = torch.log(torch.abs(self.scale))
+ logdet = height*width*torch.sum(log_abs)
+ logdet = logdet * torch.ones(input.shape[0]).to(input)
+ return h, logdet
+
+ return h
+
+ def reverse(self, output):
+ if self.training and self.initialized.item() == 0:
+ if not self.allow_reverse_init:
+ raise RuntimeError(
+ "Initializing ActNorm in reverse direction is "
+ "disabled by default. Use allow_reverse_init=True to enable."
+ )
+ else:
+ self.initialize(output)
+ self.initialized.fill_(1)
+
+ if len(output.shape) == 2:
+ output = output[:,:,None,None]
+ squeeze = True
+ else:
+ squeeze = False
+
+ h = output / self.scale - self.loc
+
+ if squeeze:
+ h = h.squeeze(-1).squeeze(-1)
+ return h
+
+
+class AbstractEncoder(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def encode(self, *args, **kwargs):
+ raise NotImplementedError
+
+
+class Labelator(AbstractEncoder):
+ """Net2Net Interface for Class-Conditional Model"""
+ def __init__(self, n_classes, quantize_interface=True):
+ super().__init__()
+ self.n_classes = n_classes
+ self.quantize_interface = quantize_interface
+
+ def encode(self, c):
+ c = c[:,None]
+ if self.quantize_interface:
+ return c, None, [None, None, c.long()]
+ return c
+
+
+class SOSProvider(AbstractEncoder):
+ # for unconditional training
+ def __init__(self, sos_token, quantize_interface=True):
+ super().__init__()
+ self.sos_token = sos_token
+ self.quantize_interface = quantize_interface
+
+ def encode(self, x):
+ # get batch size from data and replicate sos_token
+ c = torch.ones(x.shape[0], 1)*self.sos_token
+ c = c.long().to(x.device)
+ if self.quantize_interface:
+ return c, None, [None, None, c]
+ return c
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/vqvae/quantize.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/vqvae/quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d0b96a06a4789a7e2e2efd9aa0de4a584f231b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/modules/vqvae/quantize.py
@@ -0,0 +1,445 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from torch import einsum
+from einops import rearrange
+
+
+class VectorQuantizer(nn.Module):
+ """
+ see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
+ ____________________________________________
+ Discretization bottleneck part of the VQ-VAE.
+ Inputs:
+ - n_e : number of embeddings
+ - e_dim : dimension of embedding
+ - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
+ _____________________________________________
+ """
+
+ # NOTE: this class contains a bug regarding beta; see VectorQuantizer2 for
+ # a fix and use legacy=False to apply that fix. VectorQuantizer2 can be
+ # used wherever VectorQuantizer has been used before and is additionally
+ # more efficient.
+ def __init__(self, n_e, e_dim, beta):
+ super(VectorQuantizer, self).__init__()
+ self.n_e = n_e
+ self.e_dim = e_dim
+ self.beta = beta
+
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+
+ def forward(self, z):
+ """
+ Inputs the output of the encoder network z and maps it to a discrete
+ one-hot vector that is the index of the closest embedding vector e_j
+ z (continuous) -> z_q (discrete)
+ z.shape = (batch, channel, height, width)
+ quantization pipeline:
+ 1. get encoder input (B,C,H,W)
+ 2. flatten input to (B*H*W,C)
+ """
+ # reshape z -> (batch, height, width, channel) and flatten
+ z = z.permute(0, 2, 3, 1).contiguous()
+ z_flattened = z.view(-1, self.e_dim)
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
+ torch.matmul(z_flattened, self.embedding.weight.t())
+
+ ## could possible replace this here
+ # #\start...
+ # find closest encodings
+ min_encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
+
+ min_encodings = torch.zeros(
+ min_encoding_indices.shape[0], self.n_e).to(z)
+ min_encodings.scatter_(1, min_encoding_indices, 1)
+
+ # dtype min encodings: torch.float32
+ # min_encodings shape: torch.Size([2048, 512])
+ # min_encoding_indices.shape: torch.Size([2048, 1])
+
+ # get quantized latent vectors
+ z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
+ #.........\end
+
+ # with:
+ # .........\start
+ #min_encoding_indices = torch.argmin(d, dim=1)
+ #z_q = self.embedding(min_encoding_indices)
+ # ......\end......... (TODO)
+
+ # compute loss for embedding
+ loss = torch.mean((z_q.detach()-z)**2) + self.beta * \
+ torch.mean((z_q - z.detach()) ** 2)
+
+ # preserve gradients
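+        # (straight-through estimator: the forward pass uses the quantized codes while the
+        # gradient w.r.t. the encoder output z passes through unchanged)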
+ z_q = z + (z_q - z).detach()
+
+ # perplexity
+ e_mean = torch.mean(min_encodings, dim=0)
+ perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
+
+ # reshape back to match original input shape
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+ return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
+
+ def get_codebook_entry(self, indices, shape):
+ # shape specifying (batch, height, width, channel)
+ # TODO: check for more easy handling with nn.Embedding
+ min_encodings = torch.zeros(indices.shape[0], self.n_e).to(indices)
+ min_encodings.scatter_(1, indices[:,None], 1)
+
+ # get quantized latent vectors
+ z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
+
+ if shape is not None:
+ z_q = z_q.view(shape)
+
+ # reshape back to match original input shape
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+ return z_q
+
+
+class GumbelQuantize(nn.Module):
+ """
+ credit to @karpathy: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py (thanks!)
+ Gumbel Softmax trick quantizer
+ Categorical Reparameterization with Gumbel-Softmax, Jang et al. 2016
+ https://arxiv.org/abs/1611.01144
+ """
+ def __init__(self, num_hiddens, embedding_dim, n_embed, straight_through=True,
+ kl_weight=5e-4, temp_init=1.0, use_vqinterface=True,
+ remap=None, unknown_index="random"):
+ super().__init__()
+
+ self.embedding_dim = embedding_dim
+ self.n_embed = n_embed
+
+ self.straight_through = straight_through
+ self.temperature = temp_init
+ self.kl_weight = kl_weight
+
+ self.proj = nn.Conv2d(num_hiddens, n_embed, 1)
+ self.embed = nn.Embedding(n_embed, embedding_dim)
+
+ self.use_vqinterface = use_vqinterface
+
+ self.remap = remap
+ if self.remap is not None:
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
+ self.re_embed = self.used.shape[0]
+ self.unknown_index = unknown_index # "random" or "extra" or integer
+ if self.unknown_index == "extra":
+ self.unknown_index = self.re_embed
+ self.re_embed = self.re_embed+1
+ print(f"Remapping {self.n_embed} indices to {self.re_embed} indices. "
+ f"Using {self.unknown_index} for unknown indices.")
+ else:
+ self.re_embed = n_embed
+
+ def remap_to_used(self, inds):
+ ishape = inds.shape
+ assert len(ishape)>1
+ inds = inds.reshape(ishape[0],-1)
+ used = self.used.to(inds)
+ match = (inds[:,:,None]==used[None,None,...]).long()
+ new = match.argmax(-1)
+ unknown = match.sum(2)<1
+ if self.unknown_index == "random":
+ new[unknown]=torch.randint(0,self.re_embed,size=new[unknown].shape).to(device=new.device)
+ else:
+ new[unknown] = self.unknown_index
+ return new.reshape(ishape)
+
+ def unmap_to_all(self, inds):
+ ishape = inds.shape
+ assert len(ishape)>1
+ inds = inds.reshape(ishape[0],-1)
+ used = self.used.to(inds)
+ if self.re_embed > self.used.shape[0]: # extra token
+ inds[inds>=self.used.shape[0]] = 0 # simply set to zero
+ back=torch.gather(used[None,:][inds.shape[0]*[0],:], 1, inds)
+ return back.reshape(ishape)
+
+ def forward(self, z, temp=None, return_logits=False):
+ # force hard = True when we are in eval mode, as we must quantize. actually, always true seems to work
+ hard = self.straight_through if self.training else True
+ temp = self.temperature if temp is None else temp
+
+ logits = self.proj(z)
+ if self.remap is not None:
+ # continue only with used logits
+ full_zeros = torch.zeros_like(logits)
+ logits = logits[:,self.used,...]
+
+ soft_one_hot = F.gumbel_softmax(logits, tau=temp, dim=1, hard=hard)
+ if self.remap is not None:
+ # go back to all entries but unused set to zero
+ full_zeros[:,self.used,...] = soft_one_hot
+ soft_one_hot = full_zeros
+ z_q = einsum('b n h w, n d -> b d h w', soft_one_hot, self.embed.weight)
+
+ # + kl divergence to the prior loss
+ qy = F.softmax(logits, dim=1)
+ diff = self.kl_weight * torch.sum(qy * torch.log(qy * self.n_embed + 1e-10), dim=1).mean()
+
+ ind = soft_one_hot.argmax(dim=1)
+ if self.remap is not None:
+ ind = self.remap_to_used(ind)
+ if self.use_vqinterface:
+ if return_logits:
+ return z_q, diff, (None, None, ind), logits
+ return z_q, diff, (None, None, ind)
+ return z_q, diff, ind
+
+ def get_codebook_entry(self, indices, shape):
+ b, h, w, c = shape
+ assert b*h*w == indices.shape[0]
+ indices = rearrange(indices, '(b h w) -> b h w', b=b, h=h, w=w)
+ if self.remap is not None:
+ indices = self.unmap_to_all(indices)
+ one_hot = F.one_hot(indices, num_classes=self.n_embed).permute(0, 3, 1, 2).float()
+ z_q = einsum('b n h w, n d -> b d h w', one_hot, self.embed.weight)
+ return z_q
+
+
+class VectorQuantizer2(nn.Module):
+ """
+ Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly
+ avoids costly matrix multiplications and allows for post-hoc remapping of indices.
+ """
+ # NOTE: due to a bug the beta term was applied to the wrong term. for
+ # backwards compatibility we use the buggy version by default, but you can
+ # specify legacy=False to fix it.
+ def __init__(self, n_e, e_dim, beta, remap=None, unknown_index="random",
+ sane_index_shape=False, legacy=True):
+ super().__init__()
+ self.n_e = n_e
+ self.e_dim = e_dim
+ self.beta = beta
+ self.legacy = legacy
+
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+
+ self.remap = remap
+ if self.remap is not None:
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
+ self.re_embed = self.used.shape[0]
+ self.unknown_index = unknown_index # "random" or "extra" or integer
+ if self.unknown_index == "extra":
+ self.unknown_index = self.re_embed
+ self.re_embed = self.re_embed+1
+ print(f"Remapping {self.n_e} indices to {self.re_embed} indices. "
+ f"Using {self.unknown_index} for unknown indices.")
+ else:
+ self.re_embed = n_e
+
+ self.sane_index_shape = sane_index_shape
+
+ def remap_to_used(self, inds):
+ ishape = inds.shape
+ assert len(ishape)>1
+ inds = inds.reshape(ishape[0],-1)
+ used = self.used.to(inds)
+ match = (inds[:,:,None]==used[None,None,...]).long()
+ new = match.argmax(-1)
+ unknown = match.sum(2)<1
+ if self.unknown_index == "random":
+ new[unknown]=torch.randint(0,self.re_embed,size=new[unknown].shape).to(device=new.device)
+ else:
+ new[unknown] = self.unknown_index
+ return new.reshape(ishape)
+
+ def unmap_to_all(self, inds):
+ ishape = inds.shape
+ assert len(ishape)>1
+ inds = inds.reshape(ishape[0],-1)
+ used = self.used.to(inds)
+ if self.re_embed > self.used.shape[0]: # extra token
+ inds[inds>=self.used.shape[0]] = 0 # simply set to zero
+ back=torch.gather(used[None,:][inds.shape[0]*[0],:], 1, inds)
+ return back.reshape(ishape)
+
+ def forward(self, z, temp=None, rescale_logits=False, return_logits=False):
+ assert temp is None or temp==1.0, "Only for interface compatible with Gumbel"
+ assert rescale_logits==False, "Only for interface compatible with Gumbel"
+ assert return_logits==False, "Only for interface compatible with Gumbel"
+ # reshape z -> (batch, height, width, channel) and flatten
+ z = rearrange(z, 'b c h w -> b h w c').contiguous()
+ z_flattened = z.view(-1, self.e_dim)
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
+ torch.einsum('bd,dn->bn', z_flattened, rearrange(self.embedding.weight, 'n d -> d n'))
+
+ min_encoding_indices = torch.argmin(d, dim=1)
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
+ perplexity = None
+ min_encodings = None
+
+ # compute loss for embedding
+ if not self.legacy:
+ loss = self.beta * torch.mean((z_q.detach()-z)**2) + \
+ torch.mean((z_q - z.detach()) ** 2)
+ else:
+ loss = torch.mean((z_q.detach()-z)**2) + self.beta * \
+ torch.mean((z_q - z.detach()) ** 2)
+
+ # preserve gradients
+ z_q = z + (z_q - z).detach()
+
+ # reshape back to match original input shape
+ z_q = rearrange(z_q, 'b h w c -> b c h w').contiguous()
+
+ if self.remap is not None:
+ min_encoding_indices = min_encoding_indices.reshape(z.shape[0],-1) # add batch axis
+ min_encoding_indices = self.remap_to_used(min_encoding_indices)
+ min_encoding_indices = min_encoding_indices.reshape(-1,1) # flatten
+
+ if self.sane_index_shape:
+ min_encoding_indices = min_encoding_indices.reshape(
+ z_q.shape[0], z_q.shape[2], z_q.shape[3])
+
+ return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
+
+ def get_codebook_entry(self, indices, shape):
+ # shape specifying (batch, height, width, channel)
+ if self.remap is not None:
+ indices = indices.reshape(shape[0],-1) # add batch axis
+ indices = self.unmap_to_all(indices)
+ indices = indices.reshape(-1) # flatten again
+
+ # get quantized latent vectors
+ z_q = self.embedding(indices)
+
+ if shape is not None:
+ z_q = z_q.view(shape)
+ # reshape back to match original input shape
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+ return z_q
+
+class EmbeddingEMA(nn.Module):
+ def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5):
+ super().__init__()
+ self.decay = decay
+ self.eps = eps
+ weight = torch.randn(num_tokens, codebook_dim)
+ self.weight = nn.Parameter(weight, requires_grad = False)
+ self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad = False)
+ self.embed_avg = nn.Parameter(weight.clone(), requires_grad = False)
+ self.update = True
+
+ def forward(self, embed_id):
+ return F.embedding(embed_id, self.weight)
+
+ def cluster_size_ema_update(self, new_cluster_size):
+ self.cluster_size.data.mul_(self.decay).add_(new_cluster_size, alpha=1 - self.decay)
+
+ def embed_avg_ema_update(self, new_embed_avg):
+ self.embed_avg.data.mul_(self.decay).add_(new_embed_avg, alpha=1 - self.decay)
+
+ def weight_update(self, num_tokens):
+ n = self.cluster_size.sum()
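+        # Laplace-smooth the EMA cluster sizes so that rarely used codes do not cause a division by zero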
+ smoothed_cluster_size = (
+ (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n
+ )
+ #normalize embedding average with smoothed cluster size
+ embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1)
+ self.weight.data.copy_(embed_normalized)
+
+
+class EMAVectorQuantizer(nn.Module):
+ def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5,
+ remap=None, unknown_index="random"):
+ super().__init__()
+        self.num_tokens = n_embed
+        self.n_embed = n_embed  # kept so the remap log message below can reference it
+        self.codebook_dim = embedding_dim
+ self.beta = beta
+ self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps)
+
+ self.remap = remap
+ if self.remap is not None:
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
+ self.re_embed = self.used.shape[0]
+ self.unknown_index = unknown_index # "random" or "extra" or integer
+ if self.unknown_index == "extra":
+ self.unknown_index = self.re_embed
+ self.re_embed = self.re_embed+1
+ print(f"Remapping {self.n_embed} indices to {self.re_embed} indices. "
+ f"Using {self.unknown_index} for unknown indices.")
+ else:
+ self.re_embed = n_embed
+
+ def remap_to_used(self, inds):
+ ishape = inds.shape
+ assert len(ishape)>1
+ inds = inds.reshape(ishape[0],-1)
+ used = self.used.to(inds)
+ match = (inds[:,:,None]==used[None,None,...]).long()
+ new = match.argmax(-1)
+ unknown = match.sum(2)<1
+ if self.unknown_index == "random":
+ new[unknown]=torch.randint(0,self.re_embed,size=new[unknown].shape).to(device=new.device)
+ else:
+ new[unknown] = self.unknown_index
+ return new.reshape(ishape)
+
+ def unmap_to_all(self, inds):
+ ishape = inds.shape
+ assert len(ishape)>1
+ inds = inds.reshape(ishape[0],-1)
+ used = self.used.to(inds)
+ if self.re_embed > self.used.shape[0]: # extra token
+ inds[inds>=self.used.shape[0]] = 0 # simply set to zero
+ back=torch.gather(used[None,:][inds.shape[0]*[0],:], 1, inds)
+ return back.reshape(ishape)
+
+ def forward(self, z):
+ # reshape z -> (batch, height, width, channel) and flatten
+ #z, 'b c h w -> b h w c'
+ z = rearrange(z, 'b c h w -> b h w c')
+ z_flattened = z.reshape(-1, self.codebook_dim)
+
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+ d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \
+ self.embedding.weight.pow(2).sum(dim=1) - 2 * \
+ torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n'
+
+
+ encoding_indices = torch.argmin(d, dim=1)
+
+ z_q = self.embedding(encoding_indices).view(z.shape)
+ encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)
+ avg_probs = torch.mean(encodings, dim=0)
+ perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
+
+ if self.training and self.embedding.update:
+ #EMA cluster size
+ encodings_sum = encodings.sum(0)
+ self.embedding.cluster_size_ema_update(encodings_sum)
+ #EMA embedding average
+ embed_sum = encodings.transpose(0,1) @ z_flattened
+ self.embedding.embed_avg_ema_update(embed_sum)
+ #normalize embed_avg and update weight
+ self.embedding.weight_update(self.num_tokens)
+
+ # compute loss for embedding
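+        # only the commitment term is needed here; the codebook weights are updated
+        # through the EMA statistics above rather than by gradients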
+ loss = self.beta * F.mse_loss(z_q.detach(), z)
+
+ # preserve gradients
+ z_q = z + (z_q - z).detach()
+
+ # reshape back to match original input shape
+ #z_q, 'b h w c -> b c h w'
+ z_q = rearrange(z_q, 'b h w c -> b c h w')
+ return z_q, loss, (perplexity, encodings, encoding_indices)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..64e3a0b782018d38ac5064bad4af3ce4d1024a67
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/diffusion_edge/taming/util.py
@@ -0,0 +1,157 @@
+import os, hashlib
+import requests
+from tqdm import tqdm
+
+URL_MAP = {
+ "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
+}
+
+CKPT_MAP = {
+ "vgg_lpips": "vgg.pth"
+}
+
+MD5_MAP = {
+ "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
+}
+
+
+def download(url, local_path, chunk_size=1024):
+ os.makedirs(os.path.split(local_path)[0], exist_ok=True)
+ with requests.get(url, stream=True) as r:
+ total_size = int(r.headers.get("content-length", 0))
+ with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
+ with open(local_path, "wb") as f:
+ for data in r.iter_content(chunk_size=chunk_size):
+ if data:
+ f.write(data)
+ pbar.update(chunk_size)
+
+
+def md5_hash(path):
+ with open(path, "rb") as f:
+ content = f.read()
+ return hashlib.md5(content).hexdigest()
+
+
+def get_ckpt_path(name, root, check=False):
+ assert name in URL_MAP
+ path = os.path.join(root, CKPT_MAP[name])
+ if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
+ print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
+ download(URL_MAP[name], path)
+ md5 = md5_hash(path)
+ assert md5 == MD5_MAP[name], md5
+ return path
+
+
+class KeyNotFoundError(Exception):
+ def __init__(self, cause, keys=None, visited=None):
+ self.cause = cause
+ self.keys = keys
+ self.visited = visited
+ messages = list()
+ if keys is not None:
+ messages.append("Key not found: {}".format(keys))
+ if visited is not None:
+ messages.append("Visited: {}".format(visited))
+ messages.append("Cause:\n{}".format(cause))
+ message = "\n".join(messages)
+ super().__init__(message)
+
+
+def retrieve(
+ list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False
+):
+ """Given a nested list or dict return the desired value at key expanding
+ callable nodes if necessary and :attr:`expand` is ``True``. The expansion
+ is done in-place.
+
+ Parameters
+ ----------
+ list_or_dict : list or dict
+ Possibly nested list or dictionary.
+ key : str
+ key/to/value, path like string describing all keys necessary to
+ consider to get to the desired value. List indices can also be
+ passed here.
+ splitval : str
+ String that defines the delimiter between keys of the
+ different depth levels in `key`.
+ default : obj
+ Value returned if :attr:`key` is not found.
+ expand : bool
+ Whether to expand callable nodes on the path or not.
+
+ Returns
+ -------
+ The desired value or if :attr:`default` is not ``None`` and the
+ :attr:`key` is not found returns ``default``.
+
+ Raises
+ ------
+ Exception if ``key`` not in ``list_or_dict`` and :attr:`default` is
+ ``None``.
+ """
+
+ keys = key.split(splitval)
+
+ success = True
+ try:
+ visited = []
+ parent = None
+ last_key = None
+ for key in keys:
+ if callable(list_or_dict):
+ if not expand:
+ raise KeyNotFoundError(
+ ValueError(
+ "Trying to get past callable node with expand=False."
+ ),
+ keys=keys,
+ visited=visited,
+ )
+ list_or_dict = list_or_dict()
+ parent[last_key] = list_or_dict
+
+ last_key = key
+ parent = list_or_dict
+
+ try:
+ if isinstance(list_or_dict, dict):
+ list_or_dict = list_or_dict[key]
+ else:
+ list_or_dict = list_or_dict[int(key)]
+ except (KeyError, IndexError, ValueError) as e:
+ raise KeyNotFoundError(e, keys=keys, visited=visited)
+
+ visited += [key]
+ # final expansion of retrieved value
+ if expand and callable(list_or_dict):
+ list_or_dict = list_or_dict()
+ parent[last_key] = list_or_dict
+ except KeyNotFoundError as e:
+ if default is None:
+ raise e
+ else:
+ list_or_dict = default
+ success = False
+
+ if not pass_success:
+ return list_or_dict
+ else:
+ return list_or_dict, success
+
+
+if __name__ == "__main__":
+ config = {"keya": "a",
+ "keyb": "b",
+ "keyc":
+ {"cc1": 1,
+ "cc2": 2,
+ }
+ }
+ from omegaconf import OmegaConf
+ config = OmegaConf.create(config)
+ print(config)
+ retrieve(config, "keya")
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..4f364f4b07c7dc45666dc77809008c5951a56f96
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/LICENSE
@@ -0,0 +1,230 @@
+DSINE SOFTWARE
+
+LICENCE AGREEMENT
+
+WE (Imperial College of Science, Technology and Medicine, (“Imperial College
+London”)) ARE WILLING TO LICENSE THIS SOFTWARE TO YOU (a licensee “You”) ONLY
+ON THE CONDITION THAT YOU ACCEPT ALL OF THE TERMS CONTAINED IN THE FOLLOWING
+AGREEMENT. PLEASE READ THE AGREEMENT CAREFULLY BEFORE DOWNLOADING THE SOFTWARE.
+BY EXERCISING THE OPTION TO DOWNLOAD THE SOFTWARE YOU AGREE TO BE BOUND BY THE
+TERMS OF THE AGREEMENT.
+
+SOFTWARE LICENCE AGREEMENT (EXCLUDING BSD COMPONENTS)
+
+1. This Agreement pertains to a worldwide, non-exclusive, temporary, fully
+paid-up, royalty free, non-transferable, non-sub- licensable licence (the
+“Licence”) to use the elastic fusion source code, including any modification,
+part or derivative (the “Software”).
+
+Ownership and Licence. Your rights to use and download the Software onto your
+computer, and all other copies that You are authorised to make, are specified
+in this Agreement. However, we (or our licensors) retain all rights, including
+but not limited to all copyright and other intellectual property rights
+anywhere in the world, in the Software not expressly granted to You in this
+Agreement.
+
+2. Permitted use of the Licence:
+
+(a) You may download and install the Software onto one computer or server for
+use in accordance with Clause 2(b) of this Agreement provided that You ensure
+that the Software is not accessible by other users unless they have themselves
+accepted the terms of this licence agreement.
+
+(b) You may use the Software solely for non-commercial, internal or academic
+research purposes and only in accordance with the terms of this Agreement. You
+may not use the Software for commercial purposes, including but not limited to
+(1) integration of all or part of the source code or the Software into a
+product for sale or licence by or on behalf of You to third parties or (2) use
+of the Software or any derivative of it for research to develop software
+products for sale or licence to a third party or (3) use of the Software or any
+derivative of it for research to develop non-software products for sale or
+licence to a third party, or (4) use of the Software to provide any service to
+an external organisation for which payment is received.
+
+Should You wish to use the Software for commercial purposes, You shall
+email researchcontracts.engineering@imperial.ac.uk .
+
+(c) Right to Copy. You may copy the Software for back-up and archival purposes,
+provided that each copy is kept in your possession and provided You reproduce
+our copyright notice (set out in Schedule 1) on each copy.
+
+(d) Transfer and sub-licensing. You may not rent, lend, or lease the Software
+and You may not transmit, transfer or sub-license this licence to use the
+Software or any of your rights or obligations under this Agreement to another
+party.
+
+(e) Identity of Licensee. The licence granted herein is personal to You. You
+shall not permit any third party to access, modify or otherwise use the
+Software nor shall You access modify or otherwise use the Software on behalf of
+any third party. If You wish to obtain a licence for multiple users or a site
+licence for the Software please contact us
+at researchcontracts.engineering@imperial.ac.uk .
+
+(f) Publications and presentations. You may make public, results or data
+obtained from, dependent on or arising from research carried out using the
+Software, provided that any such presentation or publication identifies the
+Software as the source of the results or the data, including the Copyright
+Notice given in each element of the Software, and stating that the Software has
+been made available for use by You under licence from Imperial College London
+and You provide a copy of any such publication to Imperial College London.
+
+3. Prohibited Uses. You may not, without written permission from us
+at researchcontracts.engineering@imperial.ac.uk :
+
+(a) Use, copy, modify, merge, or transfer copies of the Software or any
+documentation provided by us which relates to the Software except as provided
+in this Agreement;
+
+(b) Use any back-up or archival copies of the Software (or allow anyone else to
+use such copies) for any purpose other than to replace the original copy in the
+event it is destroyed or becomes defective; or
+
+(c) Disassemble, decompile or "unlock", reverse translate, or in any manner
+decode the Software for any reason.
+
+4. Warranty Disclaimer
+
+(a) Disclaimer. The Software has been developed for research purposes only. You
+acknowledge that we are providing the Software to You under this licence
+agreement free of charge and on condition that the disclaimer set out below
+shall apply. We do not represent or warrant that the Software as to: (i) the
+quality, accuracy or reliability of the Software; (ii) the suitability of the
+Software for any particular use or for use under any specific conditions; and
+(iii) whether use of the Software will infringe third-party rights.
+
+You acknowledge that You have reviewed and evaluated the Software to determine
+that it meets your needs and that You assume all responsibility and liability
+for determining the suitability of the Software as fit for your particular
+purposes and requirements. Subject to Clause 4(b), we exclude and expressly
+disclaim all express and implied representations, warranties, conditions and
+terms not stated herein (including the implied conditions or warranties of
+satisfactory quality, merchantable quality, merchantability and fitness for
+purpose).
+
+(b) Savings. Some jurisdictions may imply warranties, conditions or terms or
+impose obligations upon us which cannot, in whole or in part, be excluded,
+restricted or modified or otherwise do not allow the exclusion of implied
+warranties, conditions or terms, in which case the above warranty disclaimer
+and exclusion will only apply to You to the extent permitted in the relevant
+jurisdiction and does not in any event exclude any implied warranties,
+conditions or terms which may not under applicable law be excluded.
+
+(c) Imperial College London disclaims all responsibility for the use which is
+made of the Software and any liability for the outcomes arising from using the
+Software.
+
+5. Limitation of Liability
+
+(a) You acknowledge that we are providing the Software to You under this
+licence agreement free of charge and on condition that the limitation of
+liability set out below shall apply. Accordingly, subject to Clause 5(b), we
+exclude all liability whether in contract, tort, negligence or otherwise, in
+respect of the Software and/or any related documentation provided to You by us
+including, but not limited to, liability for loss or corruption of data, loss
+of contracts, loss of income, loss of profits, loss of cover and any
+consequential or indirect loss or damage of any kind arising out of or in
+connection with this licence agreement, however caused. This exclusion shall
+apply even if we have been advised of the possibility of such loss or damage.
+
+(b) You agree to indemnify Imperial College London and hold it harmless from
+and against any and all claims, damages and liabilities asserted by third
+parties (including claims for negligence) which arise directly or indirectly
+from the use of the Software or any derivative of it or the sale of any
+products based on the Software. You undertake to make no liability claim
+against any employee, student, agent or appointee of Imperial College London,
+in connection with this Licence or the Software.
+
+(c) Nothing in this Agreement shall have the effect of excluding or limiting
+our statutory liability.
+
+(d) Some jurisdictions do not allow these limitations or exclusions either
+wholly or in part, and, to that extent, they may not apply to you. Nothing in
+this licence agreement will affect your statutory rights or other relevant
+statutory provisions which cannot be excluded, restricted or modified, and its
+terms and conditions must be read and construed subject to any such statutory
+rights and/or provisions.
+
+6. Confidentiality. You agree not to disclose any confidential information
+provided to You by us pursuant to this Agreement to any third party without our
+prior written consent. The obligations in this Clause 6 shall survive the
+termination of this Agreement for any reason.
+
+7. Termination.
+
+(a) We may terminate this licence agreement and your right to use the Software
+at any time with immediate effect upon written notice to You.
+
+(b) This licence agreement and your right to use the Software automatically
+terminate if You:
+
+ (i) fail to comply with any provisions of this Agreement; or
+
+ (ii) destroy the copies of the Software in your possession, or voluntarily
+ return the Software to us.
+
+(c) Upon termination You will destroy all copies of the Software.
+
+(d) Otherwise, the restrictions on your rights to use the Software will expire
+10 (ten) years after first use of the Software under this licence agreement.
+
+8. Miscellaneous Provisions.
+
+(a) This Agreement will be governed by and construed in accordance with the
+substantive laws of England and Wales whose courts shall have exclusive
+jurisdiction over all disputes which may arise between us.
+
+(b) This is the entire agreement between us relating to the Software, and
+supersedes any prior purchase order, communications, advertising or
+representations concerning the Software.
+
+(c) No change or modification of this Agreement will be valid unless it is in
+writing, and is signed by us.
+
+(d) The unenforceability or invalidity of any part of this Agreement will not
+affect the enforceability or validity of the remaining parts.
+
+BSD Elements of the Software
+
+For BSD elements of the Software, the following terms shall apply:
+Copyright as indicated in the header of the individual element of the Software.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SCHEDULE 1
+
+The Software
+
+DSINE is a framework for estimating surface normals from a single image. It is based on the techniques described in the following publication:
+
+ • Gwangbin Bae, Andrew J. Davison. Rethinking Inductive Biases for Surface Normal Estimation. CVPR, 2024
+_________________________
+
+Acknowledgments
+
+If you use the software, you should reference the following paper in any publication:
+
+ • Gwangbin Bae, Andrew J. Davison. Rethinking Inductive Biases for Surface Normal Estimation. CVPR, 2024
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a133ce8d64cd31c4c0e6bf4bc87d7e24534ab1ad
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/__init__.py
@@ -0,0 +1,100 @@
+import os
+import types
+import warnings
+
+import cv2
+import numpy as np
+import torch
+import torchvision.transforms as transforms
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, DIFFUSION_EDGE_MODEL_NAME
+from .models.dsine_arch import DSINE
+from custom_controlnet_aux.dsine.utils.utils import get_intrins_from_fov
+
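+# Example usage (a minimal sketch, assuming the default Hugging Face checkpoint
+# can be downloaded and a CUDA device is available):
+#
+#   detector = DsineDetector.from_pretrained().to("cuda")
+#   # input_image may be a PIL image or an HxWx3 uint8 numpy array
+#   normal_map = detector(input_image, fov=60.0, iterations=5, output_type="pil")
+#   normal_map.save("normal_map.png")
+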
+# load model
+def load_checkpoint(fpath, model):
+ ckpt = torch.load(fpath, map_location='cpu')['model']
+
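+    # strip the 'module.' prefix that (Distributed)DataParallel adds to parameter names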
+ load_dict = {}
+ for k, v in ckpt.items():
+ if k.startswith('module.'):
+ k_ = k.replace('module.', '')
+ load_dict[k_] = v
+ else:
+ load_dict[k] = v
+
+ model.load_state_dict(load_dict)
+ return model
+
+def get_pad(orig_H, orig_W):
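+    # pad width and height up to the next multiple of 64, splitting the padding
+    # as evenly as possible between left/right and top/bottom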
+ if orig_W % 64 == 0:
+ l = 0
+ r = 0
+ else:
+ new_W = 64 * ((orig_W // 64) + 1)
+ l = (new_W - orig_W) // 2
+ r = (new_W - orig_W) - l
+
+ if orig_H % 64 == 0:
+ t = 0
+ b = 0
+ else:
+ new_H = 64 * ((orig_H // 64) + 1)
+ t = (new_H - orig_H) // 2
+ b = (new_H - orig_H) - t
+ return l, r, t, b
+
+class DsineDetector:
+ def __init__(self, model):
+ self.model = model
+ self.norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=DIFFUSION_EDGE_MODEL_NAME, filename="dsine.pt"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+ model = DSINE()
+ model = load_checkpoint(model_path, model)
+ model.eval()
+
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.model.pixel_coords = self.model.pixel_coords.to(device)
+ self.device = device
+ return self
+
+
+ def __call__(self, input_image, fov=60.0, iterations=5, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", **kwargs):
+ self.model.num_iter = iterations
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ orig_H, orig_W = input_image.shape[:2]
+ l, r, t, b = get_pad(orig_H, orig_W)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method, mode="constant")
+ with torch.no_grad():
+ input_image = torch.from_numpy(input_image).float().to(self.device)
+ input_image = input_image / 255.0
+ input_image = rearrange(input_image, 'h w c -> 1 c h w')
+ input_image = self.norm(input_image)
+
+ intrins = get_intrins_from_fov(new_fov=fov, H=orig_H, W=orig_W, device=self.device).unsqueeze(0)
+ intrins[:, 0, 2] += l
+ intrins[:, 1, 2] += t
+
+ normal = self.model(input_image, intrins)
+ normal = normal[-1][0]
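+            # map unit normals from [-1, 1] to [0, 1] so they can be rendered as an RGB image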
+ normal = ((normal + 1) * 0.5).clip(0, 1)
+
+ normal = rearrange(normal, 'c h w -> h w c').cpu().numpy()
+ normal_image = (normal * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = HWC3(normal_image)
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/dsine_arch.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/dsine_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..f21963e621501da6d850aafb6c58d89d41b8cbe7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/dsine_arch.py
@@ -0,0 +1,232 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_controlnet_aux.dsine.models.submodules import Encoder, ConvGRU, UpSampleBN, UpSampleGN, RayReLU, \
+ convex_upsampling, get_unfold, get_prediction_head, \
+ INPUT_CHANNELS_DICT
+from custom_controlnet_aux.dsine.utils.rotation import axis_angle_to_matrix
+
+
+class Decoder(nn.Module):
+ def __init__(self, output_dims, B=5, NF=2048, BN=False, downsample_ratio=8):
+ super(Decoder, self).__init__()
+ input_channels = INPUT_CHANNELS_DICT[B]
+ output_dim, feature_dim, hidden_dim = output_dims
+ features = bottleneck_features = NF
+ self.downsample_ratio = downsample_ratio
+
+ UpSample = UpSampleBN if BN else UpSampleGN
+ self.conv2 = nn.Conv2d(bottleneck_features + 2, features, kernel_size=1, stride=1, padding=0)
+ self.up1 = UpSample(skip_input=features // 1 + input_channels[1] + 2, output_features=features // 2, align_corners=False)
+ self.up2 = UpSample(skip_input=features // 2 + input_channels[2] + 2, output_features=features // 4, align_corners=False)
+
+ # prediction heads
+ i_dim = features // 4
+ h_dim = 128
+ self.normal_head = get_prediction_head(i_dim+2, h_dim, output_dim)
+ self.feature_head = get_prediction_head(i_dim+2, h_dim, feature_dim)
+ self.hidden_head = get_prediction_head(i_dim+2, h_dim, hidden_dim)
+
+ def forward(self, features, uvs):
+ _, _, x_block2, x_block3, x_block4 = features[4], features[5], features[6], features[8], features[11]
+ uv_32, uv_16, uv_8 = uvs
+
+ x_d0 = self.conv2(torch.cat([x_block4, uv_32], dim=1))
+ x_d1 = self.up1(x_d0, torch.cat([x_block3, uv_16], dim=1))
+ x_feat = self.up2(x_d1, torch.cat([x_block2, uv_8], dim=1))
+ x_feat = torch.cat([x_feat, uv_8], dim=1)
+
+ normal = self.normal_head(x_feat)
+ normal = F.normalize(normal, dim=1)
+ f = self.feature_head(x_feat)
+ h = self.hidden_head(x_feat)
+ return normal, f, h
+
+
+class DSINE(nn.Module):
+ def __init__(self):
+ super(DSINE, self).__init__()
+ self.downsample_ratio = 8
+ self.ps = 5 # patch size
+ self.num_iter = 5 # num iterations
+
+ # define encoder
+ self.encoder = Encoder(B=5, pretrained=True)
+
+ # define decoder
+ self.output_dim = output_dim = 3
+ self.feature_dim = feature_dim = 64
+ self.hidden_dim = hidden_dim = 64
+ self.decoder = Decoder([output_dim, feature_dim, hidden_dim], B=5, NF=2048, BN=False)
+
+ # ray direction-based ReLU
+ self.ray_relu = RayReLU(eps=1e-2)
+
+ # pixel_coords (1, 3, H, W)
+        # NOTE: this is set to an arbitrarily large size;
+        # if your input is more than 2000 pixels wide/tall, increase these values
+ h = 2000
+ w = 2000
+ pixel_coords = np.ones((3, h, w)).astype(np.float32)
+ x_range = np.concatenate([np.arange(w).reshape(1, w)] * h, axis=0)
+ y_range = np.concatenate([np.arange(h).reshape(h, 1)] * w, axis=1)
+ pixel_coords[0, :, :] = x_range + 0.5
+ pixel_coords[1, :, :] = y_range + 0.5
+ self.pixel_coords = torch.from_numpy(pixel_coords).unsqueeze(0)
+
+ # define ConvGRU cell
+ self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=feature_dim+2, ks=self.ps)
+
+ # padding used during NRN
+ self.pad = (self.ps - 1) // 2
+
+ # prediction heads
+ self.prob_head = get_prediction_head(self.hidden_dim+2, 64, self.ps*self.ps) # weights assigned for each nghbr pixel
+ self.xy_head = get_prediction_head(self.hidden_dim+2, 64, self.ps*self.ps*2) # rotation axis for each nghbr pixel
+ self.angle_head = get_prediction_head(self.hidden_dim+2, 64, self.ps*self.ps) # rotation angle for each nghbr pixel
+
+ # prediction heads - weights used for upsampling the coarse resolution output
+ self.up_prob_head = get_prediction_head(self.hidden_dim+2, 64, 9 * self.downsample_ratio * self.downsample_ratio)
+
+ def get_ray(self, intrins, H, W, orig_H, orig_W, return_uv=False):
+ B, _, _ = intrins.shape
+ fu = intrins[:, 0, 0][:,None,None] * (W / orig_W)
+ cu = intrins[:, 0, 2][:,None,None] * (W / orig_W)
+ fv = intrins[:, 1, 1][:,None,None] * (H / orig_H)
+ cv = intrins[:, 1, 2][:,None,None] * (H / orig_H)
+
+ # (B, 2, H, W)
+ ray = self.pixel_coords[:, :, :H, :W].repeat(B, 1, 1, 1)
+ ray[:, 0, :, :] = (ray[:, 0, :, :] - cu) / fu
+ ray[:, 1, :, :] = (ray[:, 1, :, :] - cv) / fv
+
+ if return_uv:
+ return ray[:, :2, :, :]
+ else:
+ return F.normalize(ray, dim=1)
+
+ def upsample(self, h, pred_norm, uv_8):
+ up_mask = self.up_prob_head(torch.cat([h, uv_8], dim=1))
+ up_pred_norm = convex_upsampling(pred_norm, up_mask, self.downsample_ratio)
+ up_pred_norm = F.normalize(up_pred_norm, dim=1)
+ return up_pred_norm
+
+ def refine(self, h, feat_map, pred_norm, intrins, orig_H, orig_W, uv_8, ray_8):
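+        # one refinement (NRN) step: rotate each pixel's neighbouring normals about the
+        # predicted per-neighbour rotation axes/angles, then blend them with the predicted weights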
+ B, C, H, W = pred_norm.shape
+ fu = intrins[:, 0, 0][:,None,None,None] * (W / orig_W) # (B, 1, 1, 1)
+ cu = intrins[:, 0, 2][:,None,None,None] * (W / orig_W)
+ fv = intrins[:, 1, 1][:,None,None,None] * (H / orig_H)
+ cv = intrins[:, 1, 2][:,None,None,None] * (H / orig_H)
+
+ h_new = self.gru(h, feat_map)
+
+ # get nghbr prob (B, 1, ps*ps, h, w)
+ nghbr_prob = self.prob_head(torch.cat([h_new, uv_8], dim=1)).unsqueeze(1)
+ nghbr_prob = torch.sigmoid(nghbr_prob)
+
+ # get nghbr normals (B, 3, ps*ps, h, w)
+ nghbr_normals = get_unfold(pred_norm, ps=self.ps, pad=self.pad)
+
+ # get nghbr xy (B, 2, ps*ps, h, w)
+ nghbr_xys = self.xy_head(torch.cat([h_new, uv_8], dim=1))
+ nghbr_xs, nghbr_ys = torch.split(nghbr_xys, [self.ps*self.ps, self.ps*self.ps], dim=1)
+ nghbr_xys = torch.cat([nghbr_xs.unsqueeze(1), nghbr_ys.unsqueeze(1)], dim=1)
+ nghbr_xys = F.normalize(nghbr_xys, dim=1)
+
+ # get nghbr theta (B, 1, ps*ps, h, w)
+ nghbr_angle = self.angle_head(torch.cat([h_new, uv_8], dim=1)).unsqueeze(1)
+ nghbr_angle = torch.sigmoid(nghbr_angle) * np.pi
+
+ # get nghbr pixel coord (1, 3, ps*ps, h, w)
+ nghbr_pixel_coord = get_unfold(self.pixel_coords[:, :, :H, :W], ps=self.ps, pad=self.pad)
+
+ # nghbr axes (B, 3, ps*ps, h, w)
+ nghbr_axes = torch.zeros_like(nghbr_normals)
+
+ du_over_fu = nghbr_xys[:, 0, ...] / fu # (B, ps*ps, h, w)
+ dv_over_fv = nghbr_xys[:, 1, ...] / fv # (B, ps*ps, h, w)
+
+ term_u = (nghbr_pixel_coord[:, 0, ...] + nghbr_xys[:, 0, ...] - cu) / fu # (B, ps*ps, h, w)
+ term_v = (nghbr_pixel_coord[:, 1, ...] + nghbr_xys[:, 1, ...] - cv) / fv # (B, ps*ps, h, w)
+
+ nx = nghbr_normals[:, 0, ...] # (B, ps*ps, h, w)
+ ny = nghbr_normals[:, 1, ...] # (B, ps*ps, h, w)
+ nz = nghbr_normals[:, 2, ...] # (B, ps*ps, h, w)
+
+ nghbr_delta_z_num = - (du_over_fu * nx + dv_over_fv * ny)
+ nghbr_delta_z_denom = (term_u * nx + term_v * ny + nz)
+ nghbr_delta_z_denom[torch.abs(nghbr_delta_z_denom) < 1e-8] = 1e-8 * torch.sign(nghbr_delta_z_denom[torch.abs(nghbr_delta_z_denom) < 1e-8])
+ nghbr_delta_z = nghbr_delta_z_num / nghbr_delta_z_denom
+
+ nghbr_axes[:, 0, ...] = du_over_fu + nghbr_delta_z * term_u
+ nghbr_axes[:, 1, ...] = dv_over_fv + nghbr_delta_z * term_v
+ nghbr_axes[:, 2, ...] = nghbr_delta_z
+ nghbr_axes = F.normalize(nghbr_axes, dim=1) # (B, 3, ps*ps, h, w)
+
+ # make sure axes are all valid
+ invalid = torch.sum(torch.logical_or(torch.isnan(nghbr_axes), torch.isinf(nghbr_axes)).float(), dim=1) > 0.5 # (B, ps*ps, h, w)
+ nghbr_axes[:, 0, ...][invalid] = 0.0
+ nghbr_axes[:, 1, ...][invalid] = 0.0
+ nghbr_axes[:, 2, ...][invalid] = 0.0
+
+ # nghbr_axes_angle (B, 3, ps*ps, h, w)
+ nghbr_axes_angle = nghbr_axes * nghbr_angle
+ nghbr_axes_angle = nghbr_axes_angle.permute(0, 2, 3, 4, 1) # (B, ps*ps, h, w, 3)
+ nghbr_R = axis_angle_to_matrix(nghbr_axes_angle) # (B, ps*ps, h, w, 3, 3)
+
+ # (B, 3, ps*ps, h, w)
+ nghbr_normals_rot = torch.bmm(
+ nghbr_R.reshape(B * self.ps * self.ps * H * W, 3, 3),
+ nghbr_normals.permute(0, 2, 3, 4, 1).reshape(B * self.ps * self.ps * H * W, 3).unsqueeze(-1)
+ ).reshape(B, self.ps*self.ps, H, W, 3, 1).squeeze(-1).permute(0, 4, 1, 2, 3) # (B, 3, ps*ps, h, w)
+ nghbr_normals_rot = F.normalize(nghbr_normals_rot, dim=1)
+
+ # ray ReLU
+ nghbr_normals_rot = torch.cat([
+ self.ray_relu(nghbr_normals_rot[:, :, i, :, :], ray_8).unsqueeze(2)
+ for i in range(nghbr_normals_rot.size(2))
+ ], dim=2)
+
+ # (B, 1, ps*ps, h, w) * (B, 3, ps*ps, h, w)
+ pred_norm = torch.sum(nghbr_prob * nghbr_normals_rot, dim=2) # (B, C, H, W)
+ pred_norm = F.normalize(pred_norm, dim=1)
+
+ up_mask = self.up_prob_head(torch.cat([h_new, uv_8], dim=1))
+ up_pred_norm = convex_upsampling(pred_norm, up_mask, self.downsample_ratio)
+ up_pred_norm = F.normalize(up_pred_norm, dim=1)
+
+ return h_new, pred_norm, up_pred_norm
+
+
+ def forward(self, img, intrins=None):
+ # Step 1. encoder
+ features = self.encoder(img)
+
+ # Step 2. get uv encoding
+ B, _, orig_H, orig_W = img.shape
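+        # the principal point is shifted by half a pixel (presumably to match the +0.5 pixel-centre convention used for self.pixel_coords)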
+ intrins[:, 0, 2] += 0.5
+ intrins[:, 1, 2] += 0.5
+ uv_32 = self.get_ray(intrins, orig_H//32, orig_W//32, orig_H, orig_W, return_uv=True)
+ uv_16 = self.get_ray(intrins, orig_H//16, orig_W//16, orig_H, orig_W, return_uv=True)
+ uv_8 = self.get_ray(intrins, orig_H//8, orig_W//8, orig_H, orig_W, return_uv=True)
+ ray_8 = self.get_ray(intrins, orig_H//8, orig_W//8, orig_H, orig_W)
+
+ # Step 3. decoder - initial prediction
+ pred_norm, feat_map, h = self.decoder(features, uvs=(uv_32, uv_16, uv_8))
+ pred_norm = self.ray_relu(pred_norm, ray_8)
+
+ # Step 4. add ray direction encoding
+ feat_map = torch.cat([feat_map, uv_8], dim=1)
+
+ # iterative refinement
+ up_pred_norm = self.upsample(h, pred_norm, uv_8)
+ pred_list = [up_pred_norm]
+ for i in range(self.num_iter):
+ h, pred_norm, up_pred_norm = self.refine(h, feat_map,
+ pred_norm.detach(),
+ intrins, orig_H, orig_W, uv_8, ray_8)
+ pred_list.append(up_pred_norm)
+ return pred_list
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b15b2364fa29f84910bfda02f076831166f83b21
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/__init__.py
@@ -0,0 +1,199 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+
+
+INPUT_CHANNELS_DICT = {
+ 0: [1280, 112, 40, 24, 16],
+ 1: [1280, 112, 40, 24, 16],
+ 2: [1408, 120, 48, 24, 16],
+ 3: [1536, 136, 48, 32, 24],
+ 4: [1792, 160, 56, 32, 24],
+ 5: [2048, 176, 64, 40, 24],
+ 6: [2304, 200, 72, 40, 32],
+ 7: [2560, 224, 80, 48, 32]
+}
+
+
+class Encoder(nn.Module):
+ def __init__(self, B=5, pretrained=True):
+ super(Encoder, self).__init__()
+
+ basemodel_name = 'tf_efficientnet_b%s_ap' % B
+        print('Loading base model {}...'.format(basemodel_name), end='')
+ repo_path = os.path.join(os.path.dirname(__file__), 'efficientnet_repo')
+ basemodel = torch.hub.load(repo_path, basemodel_name, pretrained=False, source='local')
+ print('Done.')
+
+        # Remove the last two layers (global_pool & classifier)
+ print('Removing last two layers (global_pool & classifier).')
+ basemodel.global_pool = nn.Identity()
+ basemodel.classifier = nn.Identity()
+
+ self.original_model = basemodel
+
+ def forward(self, x):
+ features = [x]
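+        # collect the input followed by each module's output (block by block for 'blocks'); the Decoder indexes into this list by position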
+ for k, v in self.original_model._modules.items():
+ if (k == 'blocks'):
+ for ki, vi in v._modules.items():
+ features.append(vi(features[-1]))
+ else:
+ features.append(v(features[-1]))
+ return features
+
+
+class ConvGRU(nn.Module):
+ def __init__(self, hidden_dim, input_dim, ks=3):
+ super(ConvGRU, self).__init__()
+ p = (ks - 1) // 2
+ self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, ks, padding=p)
+ self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, ks, padding=p)
+ self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, ks, padding=p)
+
+ def forward(self, h, x):
+ hx = torch.cat([h, x], dim=1)
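+        # standard GRU update computed with 2D convolutions: z = update gate, r = reset gate, q = candidate state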
+ z = torch.sigmoid(self.convz(hx))
+ r = torch.sigmoid(self.convr(hx))
+ q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)))
+ h = (1-z) * h + z * q
+ return h
+
+
+class RayReLU(nn.Module):
+ def __init__(self, eps=1e-2):
+ super(RayReLU, self).__init__()
+ self.eps = eps
+
+ def forward(self, pred_norm, ray):
+ # angle between the predicted normal and ray direction
+ cos = torch.cosine_similarity(pred_norm, ray, dim=1).unsqueeze(1) # (B, 1, H, W)
+
+ # component of pred_norm along view
+ norm_along_view = ray * cos
+
+ # cos should be bigger than eps
+ norm_along_view_relu = ray * (torch.relu(cos - self.eps) + self.eps)
+
+ # difference
+ diff = norm_along_view_relu - norm_along_view
+
+ # updated pred_norm
+ new_pred_norm = pred_norm + diff
+ new_pred_norm = F.normalize(new_pred_norm, dim=1)
+
+ return new_pred_norm
+
+
+class UpSampleBN(nn.Module):
+ def __init__(self, skip_input, output_features, align_corners=True):
+ super(UpSampleBN, self).__init__()
+ self._net = nn.Sequential(nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU(),
+ nn.Conv2d(output_features, output_features, kernel_size=3, stride=1, padding=1),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU())
+ self.align_corners = align_corners
+
+ def forward(self, x, concat_with):
+ up_x = F.interpolate(x, size=[concat_with.size(2), concat_with.size(3)], mode='bilinear', align_corners=self.align_corners)
+ f = torch.cat([up_x, concat_with], dim=1)
+ return self._net(f)
+
+
+class Conv2d_WS(nn.Conv2d):
+ """ weight standardization
+ """
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+ padding=0, dilation=1, groups=1, bias=True):
+ super(Conv2d_WS, self).__init__(in_channels, out_channels, kernel_size, stride,
+ padding, dilation, groups, bias)
+
+ def forward(self, x):
+ weight = self.weight
+ weight_mean = weight.mean(dim=1, keepdim=True).mean(dim=2,
+ keepdim=True).mean(dim=3, keepdim=True)
+ weight = weight - weight_mean
+ std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5
+ weight = weight / std.expand_as(weight)
+ return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+class UpSampleGN(nn.Module):
+ """ UpSample with GroupNorm
+ """
+ def __init__(self, skip_input, output_features, align_corners=True):
+ super(UpSampleGN, self).__init__()
+ self._net = nn.Sequential(Conv2d_WS(skip_input, output_features, kernel_size=3, stride=1, padding=1),
+ nn.GroupNorm(8, output_features),
+ nn.LeakyReLU(),
+ Conv2d_WS(output_features, output_features, kernel_size=3, stride=1, padding=1),
+ nn.GroupNorm(8, output_features),
+ nn.LeakyReLU())
+ self.align_corners = align_corners
+
+ def forward(self, x, concat_with):
+ up_x = F.interpolate(x, size=[concat_with.size(2), concat_with.size(3)], mode='bilinear', align_corners=self.align_corners)
+ f = torch.cat([up_x, concat_with], dim=1)
+ return self._net(f)
+
+
+def upsample_via_bilinear(out, up_mask, downsample_ratio):
+ """ bilinear upsampling (up_mask is a dummy variable)
+ """
+ return F.interpolate(out, scale_factor=downsample_ratio, mode='bilinear', align_corners=True)
+
+
+def upsample_via_mask(out, up_mask, downsample_ratio):
+ """ convex upsampling
+ """
+ # out: low-resolution output (B, o_dim, H, W)
+ # up_mask: (B, 9*k*k, H, W)
+ k = downsample_ratio
+
+ N, o_dim, H, W = out.shape
+ up_mask = up_mask.view(N, 1, 9, k, k, H, W)
+ up_mask = torch.softmax(up_mask, dim=2) # (B, 1, 9, k, k, H, W)
+
+    up_out = F.unfold(out, [3, 3], padding=1) # (B, o_dim, H, W) -> (B, o_dim * 3*3, H*W)
+    up_out = up_out.view(N, o_dim, 9, 1, 1, H, W) # (B, o_dim, 3*3, 1, 1, H, W)
+    up_out = torch.sum(up_mask * up_out, dim=2) # (B, o_dim, k, k, H, W)
+
+    up_out = up_out.permute(0, 1, 4, 2, 5, 3) # (B, o_dim, H, k, W, k)
+    return up_out.reshape(N, o_dim, k*H, k*W) # (B, o_dim, kH, kW)
+
+
+def convex_upsampling(out, up_mask, k):
+ # out: low-resolution output (B, C, H, W)
+ # up_mask: (B, 9*k*k, H, W)
+ B, C, H, W = out.shape
+ up_mask = up_mask.view(B, 1, 9, k, k, H, W)
+ up_mask = torch.softmax(up_mask, dim=2) # (B, 1, 9, k, k, H, W)
+
+ out = F.pad(out, pad=(1,1,1,1), mode='replicate')
+ up_out = F.unfold(out, [3, 3], padding=0) # (B, C, H, W) -> (B, C X 3*3, H*W)
+ up_out = up_out.view(B, C, 9, 1, 1, H, W) # (B, C, 9, 1, 1, H, W)
+
+ up_out = torch.sum(up_mask * up_out, dim=2) # (B, C, k, k, H, W)
+ up_out = up_out.permute(0, 1, 4, 2, 5, 3) # (B, C, H, k, W, k)
+ return up_out.reshape(B, C, k*H, k*W) # (B, C, kH, kW)
+
+
+def get_unfold(pred_norm, ps, pad):
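+    # gather the ps x ps neighbourhood of every pixel (replicate padding followed by unfold)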
+ B, C, H, W = pred_norm.shape
+ pred_norm = F.pad(pred_norm, pad=(pad,pad,pad,pad), mode='replicate') # (B, C, h, w)
+ pred_norm_unfold = F.unfold(pred_norm, [ps, ps], padding=0) # (B, C X ps*ps, h*w)
+ pred_norm_unfold = pred_norm_unfold.view(B, C, ps*ps, H, W) # (B, C, ps*ps, h, w)
+ return pred_norm_unfold
+
+
+def get_prediction_head(input_dim, hidden_dim, output_dim):
+ return nn.Sequential(
+ nn.Conv2d(input_dim, hidden_dim, 3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(hidden_dim, hidden_dim, 1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(hidden_dim, output_dim, 1),
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/.gitignore b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b44ec1da9c1b744d54735281b9509ac7aa8cbbcf
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/.gitignore
@@ -0,0 +1,109 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# pytorch stuff
+*.pth
+*.onnx
+*.pb
+
+trained_models/
+.fuse_hidden*
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/BENCHMARK.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/BENCHMARK.md
new file mode 100644
index 0000000000000000000000000000000000000000..d0491e2398cbe65b358dcaf7b020d5b599e18d21
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/BENCHMARK.md
@@ -0,0 +1,555 @@
+# Model Performance Benchmarks
+
+All benchmarks run as per:
+
+```
+python onnx_export.py --model mobilenetv3_100 ./mobilenetv3_100.onnx
+python onnx_optimize.py ./mobilenetv3_100.onnx --output mobilenetv3_100-opt.onnx
+python onnx_to_caffe.py ./mobilenetv3_100.onnx --c2-prefix mobilenetv3
+python onnx_to_caffe.py ./mobilenetv3_100-opt.onnx --c2-prefix mobilenetv3-opt
+python caffe2_benchmark.py --c2-init ./mobilenetv3.init.pb --c2-predict ./mobilenetv3.predict.pb
+python caffe2_benchmark.py --c2-init ./mobilenetv3-opt.init.pb --c2-predict ./mobilenetv3-opt.predict.pb
+```
+
+## EfficientNet-B0
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 49.2862. Iters per second: 20.2897
+Time per operator type:
+ 29.7378 ms. 60.5145%. Conv
+ 12.1785 ms. 24.7824%. Sigmoid
+ 3.62811 ms. 7.38297%. SpatialBN
+ 2.98444 ms. 6.07314%. Mul
+ 0.326902 ms. 0.665225%. AveragePool
+ 0.197317 ms. 0.401528%. FC
+ 0.0852877 ms. 0.173555%. Add
+ 0.0032607 ms. 0.00663532%. Squeeze
+ 49.1416 ms in Total
+FLOP per operator type:
+ 0.76907 GFLOP. 95.2696%. Conv
+ 0.0269508 GFLOP. 3.33857%. SpatialBN
+ 0.00846444 GFLOP. 1.04855%. Mul
+ 0.002561 GFLOP. 0.317248%. FC
+ 0.000210112 GFLOP. 0.0260279%. Add
+ 0.807256 GFLOP in Total
+Feature Memory Read per operator type:
+ 58.5253 MB. 43.0891%. Mul
+ 43.2015 MB. 31.807%. Conv
+ 27.2869 MB. 20.0899%. SpatialBN
+ 5.12912 MB. 3.77631%. FC
+ 1.6809 MB. 1.23756%. Add
+ 135.824 MB in Total
+Feature Memory Written per operator type:
+ 33.8578 MB. 38.1965%. Mul
+ 26.9881 MB. 30.4465%. Conv
+ 26.9508 MB. 30.4044%. SpatialBN
+ 0.840448 MB. 0.948147%. Add
+ 0.004 MB. 0.00451258%. FC
+ 88.6412 MB in Total
+Parameter Memory per operator type:
+ 15.8248 MB. 74.9391%. Conv
+ 5.124 MB. 24.265%. FC
+ 0.168064 MB. 0.795877%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 21.1168 MB in Total
+```
+### Optimized
+```
+Main run finished. Milliseconds per iter: 46.0838. Iters per second: 21.6996
+Time per operator type:
+ 29.776 ms. 65.002%. Conv
+ 12.2803 ms. 26.8084%. Sigmoid
+ 3.15073 ms. 6.87815%. Mul
+ 0.328651 ms. 0.717456%. AveragePool
+ 0.186237 ms. 0.406563%. FC
+ 0.0832429 ms. 0.181722%. Add
+ 0.0026184 ms. 0.00571606%. Squeeze
+ 45.8078 ms in Total
+FLOP per operator type:
+ 0.76907 GFLOP. 98.5601%. Conv
+ 0.00846444 GFLOP. 1.08476%. Mul
+ 0.002561 GFLOP. 0.328205%. FC
+ 0.000210112 GFLOP. 0.0269269%. Add
+ 0.780305 GFLOP in Total
+Feature Memory Read per operator type:
+ 58.5253 MB. 53.8803%. Mul
+ 43.2855 MB. 39.8501%. Conv
+ 5.12912 MB. 4.72204%. FC
+ 1.6809 MB. 1.54749%. Add
+ 108.621 MB in Total
+Feature Memory Written per operator type:
+ 33.8578 MB. 54.8834%. Mul
+ 26.9881 MB. 43.7477%. Conv
+ 0.840448 MB. 1.36237%. Add
+ 0.004 MB. 0.00648399%. FC
+ 61.6904 MB in Total
+Parameter Memory per operator type:
+ 15.8248 MB. 75.5403%. Conv
+ 5.124 MB. 24.4597%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 20.9488 MB in Total
+```
+
+## EfficientNet-B1
+### Optimized
+```
+Main run finished. Milliseconds per iter: 71.8102. Iters per second: 13.9256
+Time per operator type:
+ 45.7915 ms. 66.3206%. Conv
+ 17.8718 ms. 25.8841%. Sigmoid
+ 4.44132 ms. 6.43244%. Mul
+ 0.51001 ms. 0.738658%. AveragePool
+ 0.233283 ms. 0.337868%. Add
+ 0.194986 ms. 0.282402%. FC
+ 0.00268255 ms. 0.00388519%. Squeeze
+ 69.0456 ms in Total
+FLOP per operator type:
+ 1.37105 GFLOP. 98.7673%. Conv
+ 0.0138759 GFLOP. 0.99959%. Mul
+ 0.002561 GFLOP. 0.184489%. FC
+ 0.000674432 GFLOP. 0.0485847%. Add
+ 1.38816 GFLOP in Total
+Feature Memory Read per operator type:
+ 94.624 MB. 54.0789%. Mul
+ 69.8255 MB. 39.9062%. Conv
+ 5.39546 MB. 3.08357%. Add
+ 5.12912 MB. 2.93136%. FC
+ 174.974 MB in Total
+Feature Memory Written per operator type:
+ 55.5035 MB. 54.555%. Mul
+ 43.5333 MB. 42.7894%. Conv
+ 2.69773 MB. 2.65163%. Add
+ 0.004 MB. 0.00393165%. FC
+ 101.739 MB in Total
+Parameter Memory per operator type:
+ 25.7479 MB. 83.4024%. Conv
+ 5.124 MB. 16.5976%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 30.8719 MB in Total
+```
+
+## EfficientNet-B2
+### Optimized
+```
+Main run finished. Milliseconds per iter: 92.28. Iters per second: 10.8366
+Time per operator type:
+ 61.4627 ms. 67.5845%. Conv
+ 22.7458 ms. 25.0113%. Sigmoid
+ 5.59931 ms. 6.15701%. Mul
+ 0.642567 ms. 0.706568%. AveragePool
+ 0.272795 ms. 0.299965%. Add
+ 0.216178 ms. 0.237709%. FC
+ 0.00268895 ms. 0.00295677%. Squeeze
+ 90.942 ms in Total
+FLOP per operator type:
+ 1.98431 GFLOP. 98.9343%. Conv
+ 0.0177039 GFLOP. 0.882686%. Mul
+ 0.002817 GFLOP. 0.140451%. FC
+ 0.000853984 GFLOP. 0.0425782%. Add
+ 2.00568 GFLOP in Total
+Feature Memory Read per operator type:
+ 120.609 MB. 54.9637%. Mul
+ 86.3512 MB. 39.3519%. Conv
+ 6.83187 MB. 3.11341%. Add
+ 5.64163 MB. 2.571%. FC
+ 219.433 MB in Total
+Feature Memory Written per operator type:
+ 70.8155 MB. 54.6573%. Mul
+ 55.3273 MB. 42.7031%. Conv
+ 3.41594 MB. 2.63651%. Add
+ 0.004 MB. 0.00308731%. FC
+ 129.563 MB in Total
+Parameter Memory per operator type:
+ 30.4721 MB. 84.3913%. Conv
+ 5.636 MB. 15.6087%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 36.1081 MB in Total
+```
+
+## MixNet-M
+### Optimized
+```
+Main run finished. Milliseconds per iter: 63.1122. Iters per second: 15.8448
+Time per operator type:
+ 48.1139 ms. 75.2052%. Conv
+ 7.1341 ms. 11.1511%. Sigmoid
+ 2.63706 ms. 4.12189%. SpatialBN
+ 1.73186 ms. 2.70701%. Mul
+ 1.38707 ms. 2.16809%. Split
+ 1.29322 ms. 2.02139%. Concat
+ 1.00093 ms. 1.56452%. Relu
+ 0.235309 ms. 0.367803%. Add
+ 0.221579 ms. 0.346343%. FC
+ 0.219315 ms. 0.342803%. AveragePool
+ 0.00250145 ms. 0.00390993%. Squeeze
+ 63.9768 ms in Total
+FLOP per operator type:
+ 0.675273 GFLOP. 95.5827%. Conv
+ 0.0221072 GFLOP. 3.12921%. SpatialBN
+ 0.00538445 GFLOP. 0.762152%. Mul
+ 0.003073 GFLOP. 0.434973%. FC
+ 0.000642488 GFLOP. 0.0909421%. Add
+ 0 GFLOP. 0%. Concat
+ 0 GFLOP. 0%. Relu
+ 0.70648 GFLOP in Total
+Feature Memory Read per operator type:
+ 46.8424 MB. 30.502%. Conv
+ 36.8626 MB. 24.0036%. Mul
+ 22.3152 MB. 14.5309%. SpatialBN
+ 22.1074 MB. 14.3955%. Concat
+ 14.1496 MB. 9.21372%. Relu
+ 6.15414 MB. 4.00735%. FC
+ 5.1399 MB. 3.34692%. Add
+ 153.571 MB in Total
+Feature Memory Written per operator type:
+ 32.7672 MB. 28.4331%. Conv
+ 22.1072 MB. 19.1831%. Concat
+ 22.1072 MB. 19.1831%. SpatialBN
+ 21.5378 MB. 18.689%. Mul
+ 14.1496 MB. 12.2781%. Relu
+ 2.56995 MB. 2.23003%. Add
+ 0.004 MB. 0.00347092%. FC
+ 115.243 MB in Total
+Parameter Memory per operator type:
+ 13.7059 MB. 68.674%. Conv
+ 6.148 MB. 30.8049%. FC
+ 0.104 MB. 0.521097%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Concat
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 19.9579 MB in Total
+```
+
+## TF MobileNet-V3 Large 1.0
+
+### Optimized
+```
+Main run finished. Milliseconds per iter: 22.0495. Iters per second: 45.3525
+Time per operator type:
+ 17.437 ms. 80.0087%. Conv
+ 1.27662 ms. 5.8577%. Add
+ 1.12759 ms. 5.17387%. Div
+ 0.701155 ms. 3.21721%. Mul
+ 0.562654 ms. 2.58171%. Relu
+ 0.431144 ms. 1.97828%. Clip
+ 0.156902 ms. 0.719936%. FC
+ 0.0996858 ms. 0.457402%. AveragePool
+ 0.00112455 ms. 0.00515993%. Flatten
+ 21.7939 ms in Total
+FLOP per operator type:
+ 0.43062 GFLOP. 98.1484%. Conv
+ 0.002561 GFLOP. 0.583713%. FC
+ 0.00210867 GFLOP. 0.480616%. Mul
+ 0.00193868 GFLOP. 0.441871%. Add
+ 0.00151532 GFLOP. 0.345377%. Div
+ 0 GFLOP. 0%. Relu
+ 0.438743 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.7967 MB. 43.9391%. Conv
+ 14.496 MB. 18.3046%. Mul
+ 9.44828 MB. 11.9307%. Add
+ 9.26157 MB. 11.6949%. Relu
+ 6.0614 MB. 7.65395%. Div
+ 5.12912 MB. 6.47673%. FC
+ 79.193 MB in Total
+Feature Memory Written per operator type:
+ 17.6247 MB. 35.8656%. Conv
+ 9.26157 MB. 18.847%. Relu
+ 8.43469 MB. 17.1643%. Mul
+ 7.75472 MB. 15.7806%. Add
+ 6.06128 MB. 12.3345%. Div
+ 0.004 MB. 0.00813985%. FC
+ 49.1409 MB in Total
+Parameter Memory per operator type:
+ 16.6851 MB. 76.5052%. Conv
+ 5.124 MB. 23.4948%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Div
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 21.8091 MB in Total
+```
+
+## MobileNet-V3 (RW)
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 24.8316. Iters per second: 40.2712
+Time per operator type:
+ 15.9266 ms. 69.2624%. Conv
+ 2.36551 ms. 10.2873%. SpatialBN
+ 1.39102 ms. 6.04936%. Add
+ 1.30327 ms. 5.66773%. Div
+ 0.737014 ms. 3.20517%. Mul
+ 0.639697 ms. 2.78195%. Relu
+ 0.375681 ms. 1.63378%. Clip
+ 0.153126 ms. 0.665921%. FC
+ 0.0993787 ms. 0.432184%. AveragePool
+ 0.0032632 ms. 0.0141912%. Squeeze
+ 22.9946 ms in Total
+FLOP per operator type:
+ 0.430616 GFLOP. 94.4041%. Conv
+ 0.0175992 GFLOP. 3.85829%. SpatialBN
+ 0.002561 GFLOP. 0.561449%. FC
+ 0.00210961 GFLOP. 0.46249%. Mul
+ 0.00173891 GFLOP. 0.381223%. Add
+ 0.00151626 GFLOP. 0.33241%. Div
+ 0 GFLOP. 0%. Relu
+ 0.456141 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.7354 MB. 36.4363%. Conv
+ 17.7944 MB. 18.6658%. SpatialBN
+ 14.5035 MB. 15.2137%. Mul
+ 9.25778 MB. 9.71113%. Relu
+ 7.84641 MB. 8.23064%. Add
+ 6.06516 MB. 6.36216%. Div
+ 5.12912 MB. 5.38029%. FC
+ 95.3317 MB in Total
+Feature Memory Written per operator type:
+ 17.6246 MB. 26.7264%. Conv
+ 17.5992 MB. 26.6878%. SpatialBN
+ 9.25778 MB. 14.0387%. Relu
+ 8.43843 MB. 12.7962%. Mul
+ 6.95565 MB. 10.5477%. Add
+ 6.06502 MB. 9.19713%. Div
+ 0.004 MB. 0.00606568%. FC
+ 65.9447 MB in Total
+Parameter Memory per operator type:
+ 16.6778 MB. 76.1564%. Conv
+ 5.124 MB. 23.3979%. FC
+ 0.0976 MB. 0.445674%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Div
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 21.8994 MB in Total
+
+```
+### Optimized
+
+```
+Main run finished. Milliseconds per iter: 22.0981. Iters per second: 45.2527
+Time per operator type:
+ 17.146 ms. 78.8965%. Conv
+ 1.38453 ms. 6.37084%. Add
+ 1.30991 ms. 6.02749%. Div
+ 0.685417 ms. 3.15391%. Mul
+ 0.532589 ms. 2.45068%. Relu
+ 0.418263 ms. 1.92461%. Clip
+ 0.15128 ms. 0.696106%. FC
+ 0.102065 ms. 0.469648%. AveragePool
+ 0.0022143 ms. 0.010189%. Squeeze
+ 21.7323 ms in Total
+FLOP per operator type:
+ 0.430616 GFLOP. 98.1927%. Conv
+ 0.002561 GFLOP. 0.583981%. FC
+ 0.00210961 GFLOP. 0.481051%. Mul
+ 0.00173891 GFLOP. 0.396522%. Add
+ 0.00151626 GFLOP. 0.34575%. Div
+ 0 GFLOP. 0%. Relu
+ 0.438542 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.7842 MB. 44.833%. Conv
+ 14.5035 MB. 18.6934%. Mul
+ 9.25778 MB. 11.9323%. Relu
+ 7.84641 MB. 10.1132%. Add
+ 6.06516 MB. 7.81733%. Div
+ 5.12912 MB. 6.61087%. FC
+ 77.5861 MB in Total
+Feature Memory Written per operator type:
+ 17.6246 MB. 36.4556%. Conv
+ 9.25778 MB. 19.1492%. Relu
+ 8.43843 MB. 17.4544%. Mul
+ 6.95565 MB. 14.3874%. Add
+ 6.06502 MB. 12.5452%. Div
+ 0.004 MB. 0.00827378%. FC
+ 48.3455 MB in Total
+Parameter Memory per operator type:
+ 16.6778 MB. 76.4973%. Conv
+ 5.124 MB. 23.5027%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Div
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 21.8018 MB in Total
+
+```
+
+## MnasNet-A1
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 30.0892. Iters per second: 33.2345
+Time per operator type:
+ 24.4656 ms. 79.0905%. Conv
+ 4.14958 ms. 13.4144%. SpatialBN
+ 1.60598 ms. 5.19169%. Relu
+ 0.295219 ms. 0.95436%. Mul
+ 0.187609 ms. 0.606486%. FC
+ 0.120556 ms. 0.389724%. AveragePool
+ 0.09036 ms. 0.292109%. Add
+ 0.015727 ms. 0.050841%. Sigmoid
+ 0.00306205 ms. 0.00989875%. Squeeze
+ 30.9337 ms in Total
+FLOP per operator type:
+ 0.620598 GFLOP. 95.6434%. Conv
+ 0.0248873 GFLOP. 3.8355%. SpatialBN
+ 0.002561 GFLOP. 0.394688%. FC
+ 0.000597408 GFLOP. 0.0920695%. Mul
+ 0.000222656 GFLOP. 0.0343146%. Add
+ 0 GFLOP. 0%. Relu
+ 0.648867 GFLOP in Total
+Feature Memory Read per operator type:
+ 35.5457 MB. 38.4109%. Conv
+ 25.1552 MB. 27.1829%. SpatialBN
+ 22.5235 MB. 24.339%. Relu
+ 5.12912 MB. 5.54256%. FC
+ 2.40586 MB. 2.59978%. Mul
+ 1.78125 MB. 1.92483%. Add
+ 92.5406 MB in Total
+Feature Memory Written per operator type:
+ 24.9042 MB. 32.9424%. Conv
+ 24.8873 MB. 32.92%. SpatialBN
+ 22.5235 MB. 29.7932%. Relu
+ 2.38963 MB. 3.16092%. Mul
+ 0.890624 MB. 1.17809%. Add
+ 0.004 MB. 0.00529106%. FC
+ 75.5993 MB in Total
+Parameter Memory per operator type:
+ 10.2732 MB. 66.1459%. Conv
+ 5.124 MB. 32.9917%. FC
+ 0.133952 MB. 0.86247%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 15.5312 MB in Total
+```
+
+### Optimized
+```
+Main run finished. Milliseconds per iter: 24.2367. Iters per second: 41.2597
+Time per operator type:
+ 22.0547 ms. 91.1375%. Conv
+ 1.49096 ms. 6.16116%. Relu
+ 0.253417 ms. 1.0472%. Mul
+ 0.18506 ms. 0.76473%. FC
+ 0.112942 ms. 0.466717%. AveragePool
+ 0.086769 ms. 0.358559%. Add
+ 0.0127889 ms. 0.0528479%. Sigmoid
+ 0.0027346 ms. 0.0113003%. Squeeze
+ 24.1994 ms in Total
+FLOP per operator type:
+ 0.620598 GFLOP. 99.4581%. Conv
+ 0.002561 GFLOP. 0.41043%. FC
+ 0.000597408 GFLOP. 0.0957417%. Mul
+ 0.000222656 GFLOP. 0.0356832%. Add
+ 0 GFLOP. 0%. Relu
+ 0.623979 GFLOP in Total
+Feature Memory Read per operator type:
+ 35.6127 MB. 52.7968%. Conv
+ 22.5235 MB. 33.3917%. Relu
+ 5.12912 MB. 7.60406%. FC
+ 2.40586 MB. 3.56675%. Mul
+ 1.78125 MB. 2.64075%. Add
+ 67.4524 MB in Total
+Feature Memory Written per operator type:
+ 24.9042 MB. 49.1092%. Conv
+ 22.5235 MB. 44.4145%. Relu
+ 2.38963 MB. 4.71216%. Mul
+ 0.890624 MB. 1.75624%. Add
+ 0.004 MB. 0.00788768%. FC
+ 50.712 MB in Total
+Parameter Memory per operator type:
+ 10.2732 MB. 66.7213%. Conv
+ 5.124 MB. 33.2787%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 15.3972 MB in Total
+```
+## MnasNet-B1
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 28.3109. Iters per second: 35.322
+Time per operator type:
+ 29.1121 ms. 83.3081%. Conv
+ 4.14959 ms. 11.8746%. SpatialBN
+ 1.35823 ms. 3.88675%. Relu
+ 0.186188 ms. 0.532802%. FC
+ 0.116244 ms. 0.332647%. Add
+ 0.018641 ms. 0.0533437%. AveragePool
+ 0.0040904 ms. 0.0117052%. Squeeze
+ 34.9451 ms in Total
+FLOP per operator type:
+ 0.626272 GFLOP. 96.2088%. Conv
+ 0.0218266 GFLOP. 3.35303%. SpatialBN
+ 0.002561 GFLOP. 0.393424%. FC
+ 0.000291648 GFLOP. 0.0448034%. Add
+ 0 GFLOP. 0%. Relu
+ 0.650951 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.4354 MB. 41.3788%. Conv
+ 22.1299 MB. 26.5921%. SpatialBN
+ 19.1923 MB. 23.0622%. Relu
+ 5.12912 MB. 6.16333%. FC
+ 2.33318 MB. 2.80364%. Add
+ 83.2199 MB in Total
+Feature Memory Written per operator type:
+ 21.8266 MB. 34.0955%. Conv
+ 21.8266 MB. 34.0955%. SpatialBN
+ 19.1923 MB. 29.9805%. Relu
+ 1.16659 MB. 1.82234%. Add
+ 0.004 MB. 0.00624844%. FC
+ 64.016 MB in Total
+Parameter Memory per operator type:
+ 12.2576 MB. 69.9104%. Conv
+ 5.124 MB. 29.2245%. FC
+ 0.15168 MB. 0.865099%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Relu
+ 17.5332 MB in Total
+```
+
+### Optimized
+```
+Main run finished. Milliseconds per iter: 26.6364. Iters per second: 37.5426
+Time per operator type:
+ 24.9888 ms. 94.0962%. Conv
+ 1.26147 ms. 4.75011%. Relu
+ 0.176234 ms. 0.663619%. FC
+ 0.113309 ms. 0.426672%. Add
+ 0.0138708 ms. 0.0522311%. AveragePool
+ 0.00295685 ms. 0.0111341%. Squeeze
+ 26.5566 ms in Total
+FLOP per operator type:
+ 0.626272 GFLOP. 99.5466%. Conv
+ 0.002561 GFLOP. 0.407074%. FC
+ 0.000291648 GFLOP. 0.0463578%. Add
+ 0 GFLOP. 0%. Relu
+ 0.629124 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.5112 MB. 56.4224%. Conv
+ 19.1923 MB. 31.3775%. Relu
+ 5.12912 MB. 8.3856%. FC
+ 2.33318 MB. 3.81452%. Add
+ 61.1658 MB in Total
+Feature Memory Written per operator type:
+ 21.8266 MB. 51.7346%. Conv
+ 19.1923 MB. 45.4908%. Relu
+ 1.16659 MB. 2.76513%. Add
+ 0.004 MB. 0.00948104%. FC
+ 42.1895 MB in Total
+Parameter Memory per operator type:
+ 12.2576 MB. 70.5205%. Conv
+ 5.124 MB. 29.4795%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Relu
+ 17.3816 MB in Total
+```
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..9bd196e16a8e7775f480c8a1c0f5d035f87bbc22
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2020 Ross Wightman
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/README.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..df80c08487a078f40387e0af8633b65ee2af2738
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/README.md
@@ -0,0 +1,323 @@
+# (Generic) EfficientNets for PyTorch
+
+A 'generic' implementation of EfficientNet, MixNet, MobileNetV3, etc. that covers most of the compute/parameter efficient architectures derived from the MobileNet V1/V2 block sequence, including those found via automated neural architecture search.
+
+All models are implemented by GenEfficientNet or MobileNetV3 classes, with string based architecture definitions to configure the block layouts (idea from [here](https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py))
+
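+A model can be built by name in the usual `torch.hub` fashion. Below is a minimal sketch of loading one of the models from a local checkout of this repo (the path is illustrative; this mirrors how the DSINE encoder above consumes this repo):
+
+```
+import torch
+
+# build the model by name from a local copy of this repo; pretrained=False here
+# because the surrounding DSINE code loads its own checkpoint afterwards
+model = torch.hub.load('path/to/efficientnet_repo', 'tf_efficientnet_b5_ap',
+                       pretrained=False, source='local')
+model.eval()
+```
+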
+## What's New
+
+### Aug 19, 2020
+* Add updated PyTorch trained EfficientNet-B3 weights trained by myself with `timm` (82.1 top-1)
+* Add PyTorch trained EfficientNet-Lite0 contributed by [@hal-314](https://github.com/hal-314) (75.5 top-1)
+* Update ONNX and Caffe2 export / utility scripts to work with latest PyTorch / ONNX
+* ONNX runtime based validation script added
+* activations (mostly) brought in sync with `timm` equivalents
+
+
+### April 5, 2020
+* Add some newly trained MobileNet-V2 models trained with latest h-params, rand augment. They compare quite favourably to EfficientNet-Lite
+ * 3.5M param MobileNet-V2 100 @ 73%
+ * 4.5M param MobileNet-V2 110d @ 75%
+ * 6.1M param MobileNet-V2 140 @ 76.5%
+ * 5.8M param MobileNet-V2 120d @ 77.3%
+
+### March 23, 2020
+ * Add EfficientNet-Lite models w/ weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite)
+ * Add PyTorch trained MobileNet-V3 Large weights with 75.77% top-1
+ * IMPORTANT CHANGE (if training from scratch) - weight init changed to better match the Tensorflow impl; set `fix_group_fanout=False` in `initialize_weight_goog` for the old behavior
+
+### Feb 12, 2020
+ * Add EfficientNet-L2 and B0-B7 NoisyStudent weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet)
+ * Port new EfficientNet-B8 (RandAugment) weights from TF TPU; these differ from the B8 AdvProp weights and use a different input normalization.
+ * Add RandAugment PyTorch trained EfficientNet-ES (EdgeTPU-Small) weights with 78.1 top-1. Trained by [Andrew Lavin](https://github.com/andravin)
+
+### Jan 22, 2020
+ * Update weights for EfficientNet B0, B2, B3 and MixNet-XL with latest RandAugment trained weights. Trained with (https://github.com/rwightman/pytorch-image-models)
+ * Fix torchscript compatibility for PyTorch 1.4, add torchscript support for MixedConv2d using ModuleDict
+ * Test models, torchscript, onnx export with PyTorch 1.4 -- no issues
+
+### Nov 22, 2019
+ * New top-1 high! Ported official TF EfficientNet AdvProp (https://arxiv.org/abs/1911.09665) weights and B8 model spec. Created a new set of `ap` models since they use a different
+ preprocessing (Inception mean/std) from the original EfficientNet base/AA/RA weights.
+
+### Nov 15, 2019
+ * Ported official TF MobileNet-V3 float32 large/small/minimalistic weights
+ * Modifications to MobileNet-V3 model and components to support some additional config needed for differences between TF MobileNet-V3 and mine
+
+### Oct 30, 2019
+ * Many of the models will now work with torch.jit.script, MixNet being the biggest exception
+ * Improved interface for enabling torchscript or ONNX export compatible modes (via config)
+ * Add JIT optimized mem-efficient Swish/Mish autograd.fn in addition to memory-efficient autograd.fn
+ * Activation factory to select best version of activation by name or override one globally
+ * Add pretrained checkpoint load helper that handles input conv and classifier changes
+
+### Oct 27, 2019
+ * Add CondConv EfficientNet variants ported from https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv
+ * Add RandAug weights for TF EfficientNet B5 and B7 from https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet
+ * Bring over MixNet-XL model and depth scaling algo from my pytorch-image-models code base
+ * Switch activations and global pooling to modules
+ * Add memory-efficient Swish/Mish impl
+ * Add as_sequential() method to all models and allow as an argument in entrypoint fns
+ * Move MobileNetV3 into own file since it has a different head
+ * Remove ChamNet, MobileNet V2/V1 since they will likely never be used here
+
+## Models
+
+Implemented models include:
+ * EfficientNet NoisyStudent (B0-B7, L2) (https://arxiv.org/abs/1911.04252)
+ * EfficientNet AdvProp (B0-B8) (https://arxiv.org/abs/1911.09665)
+ * EfficientNet (B0-B8) (https://arxiv.org/abs/1905.11946)
+ * EfficientNet-EdgeTPU (S, M, L) (https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html)
+ * EfficientNet-CondConv (https://arxiv.org/abs/1904.04971)
+ * EfficientNet-Lite (https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite)
+ * MixNet (https://arxiv.org/abs/1907.09595)
+ * MNASNet B1, A1 (Squeeze-Excite), and Small (https://arxiv.org/abs/1807.11626)
+ * MobileNet-V3 (https://arxiv.org/abs/1905.02244)
+ * FBNet-C (https://arxiv.org/abs/1812.03443)
+ * Single-Path NAS (https://arxiv.org/abs/1904.02877)
+
+I originally implemented and trained some of these models with code [here](https://github.com/rwightman/pytorch-image-models); this repository contains just the GenEfficientNet models, validation, and associated ONNX/Caffe2 export code.
+
+## Pretrained
+
+I've managed to train several of the models to accuracies close to or above the originating papers and official impl. My training code is here: https://github.com/rwightman/pytorch-image-models
+
+
+|Model | Prec@1 (Err) | Prec@5 (Err) | Param#(M) | MAdds(M) | Image Scaling | Resolution | Crop |
+|---|---|---|---|---|---|---|---|
+| efficientnet_b3 | 82.240 (17.760) | 96.116 (3.884) | 12.23 | TBD | bicubic | 320 | 1.0 |
+| efficientnet_b3 | 82.076 (17.924) | 96.020 (3.980) | 12.23 | TBD | bicubic | 300 | 0.904 |
+| mixnet_xl | 81.074 (18.926) | 95.282 (4.718) | 11.90 | TBD | bicubic | 256 | 1.0 |
+| efficientnet_b2 | 80.612 (19.388) | 95.318 (4.682) | 9.1 | TBD | bicubic | 288 | 1.0 |
+| mixnet_xl | 80.476 (19.524) | 94.936 (5.064) | 11.90 | TBD | bicubic | 224 | 0.875 |
+| efficientnet_b2 | 80.288 (19.712) | 95.166 (4.834) | 9.1 | 1003 | bicubic | 260 | 0.890 |
+| mixnet_l | 78.976 (21.024) | 94.184 (5.816) | 7.33 | TBD | bicubic | 224 | 0.875 |
+| efficientnet_b1 | 78.692 (21.308) | 94.086 (5.914) | 7.8 | 694 | bicubic | 240 | 0.882 |
+| efficientnet_es | 78.066 (21.934) | 93.926 (6.074) | 5.44 | TBD | bicubic | 224 | 0.875 |
+| efficientnet_b0 | 77.698 (22.302) | 93.532 (6.468) | 5.3 | 390 | bicubic | 224 | 0.875 |
+| mobilenetv2_120d | 77.294 (22.706) | 93.502 (6.498) | 5.8 | TBD | bicubic | 224 | 0.875 |
+| mixnet_m | 77.256 (22.744) | 93.418 (6.582) | 5.01 | 353 | bicubic | 224 | 0.875 |
+| mobilenetv2_140 | 76.524 (23.476) | 92.990 (7.010) | 6.1 | TBD | bicubic | 224 | 0.875 |
+| mixnet_s | 75.988 (24.012) | 92.794 (7.206) | 4.13 | TBD | bicubic | 224 | 0.875 |
+| mobilenetv3_large_100 | 75.766 (24.234) | 92.542 (7.458) | 5.5 | TBD | bicubic | 224 | 0.875 |
+| mobilenetv3_rw | 75.634 (24.366) | 92.708 (7.292) | 5.5 | 219 | bicubic | 224 | 0.875 |
+| efficientnet_lite0 | 75.472 (24.528) | 92.520 (7.480) | 4.65 | TBD | bicubic | 224 | 0.875 |
+| mnasnet_a1 | 75.448 (24.552) | 92.604 (7.396) | 3.9 | 312 | bicubic | 224 | 0.875 |
+| fbnetc_100 | 75.124 (24.876) | 92.386 (7.614) | 5.6 | 385 | bilinear | 224 | 0.875 |
+| mobilenetv2_110d | 75.052 (24.948) | 92.180 (7.820) | 4.5 | TBD | bicubic | 224 | 0.875 |
+| mnasnet_b1 | 74.658 (25.342) | 92.114 (7.886) | 4.4 | 315 | bicubic | 224 | 0.875 |
+| spnasnet_100 | 74.084 (25.916) | 91.818 (8.182) | 4.4 | TBD | bilinear | 224 | 0.875 |
+| mobilenetv2_100 | 72.978 (27.022) | 91.016 (8.984) | 3.5 | TBD | bicubic | 224 | 0.875 |
+
+
+More pretrained models to come...
+
+
+## Ported Weights
+
+The weights ported from Tensorflow checkpoints for the EfficientNet models closely match their Tensorflow accuracy once a SAME convolution padding equivalent is added and the same crop factors, image scaling, etc. (see table) are used via command line args.
+
+**IMPORTANT:**
+* Tensorflow ported weights for EfficientNet AdvProp (AP), EfficientNet EdgeTPU, EfficientNet-CondConv, EfficientNet-Lite, and MobileNet-V3 models use Inception style (0.5, 0.5, 0.5) for mean and std.
+* Enabling the Tensorflow preprocessing pipeline with `--tf-preprocessing` at validation time will improve scores by 0.1-0.5%, bringing them very close to the original TF impl.
+
+To run validation for tf_efficientnet_b5:
+`python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b5 -b 64 --img-size 456 --crop-pct 0.934 --interpolation bicubic`
+
+To run validation w/ TF preprocessing for tf_efficientnet_b5:
+`python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b5 -b 64 --img-size 456 --tf-preprocessing`
+
+To run validation for a model with Inception preprocessing, e.g. EfficientNet-B8 AdvProp:
+`python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b8_ap -b 48 --num-gpu 2 --img-size 672 --crop-pct 0.954 --mean 0.5 --std 0.5`
+
+|Model | Prec@1 (Err) | Prec@5 (Err) | Param # (M) | Image Scaling | Image Size | Crop |
+|---|---|---|---|---|---|---|
+| tf_efficientnet_l2_ns *tfp | 88.352 (11.648) | 98.652 (1.348) | 480 | bicubic | 800 | N/A |
+| tf_efficientnet_l2_ns | TBD | TBD | 480 | bicubic | 800 | 0.961 |
+| tf_efficientnet_l2_ns_475 | 88.234 (11.766) | 98.546 (1.454) | 480 | bicubic | 475 | 0.936 |
+| tf_efficientnet_l2_ns_475 *tfp | 88.172 (11.828) | 98.566 (1.434) | 480 | bicubic | 475 | N/A |
+| tf_efficientnet_b7_ns *tfp | 86.844 (13.156) | 98.084 (1.916) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b7_ns | 86.840 (13.160) | 98.094 (1.906) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b6_ns | 86.452 (13.548) | 97.882 (2.118) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b6_ns *tfp | 86.444 (13.556) | 97.880 (2.120) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b5_ns *tfp | 86.064 (13.936) | 97.746 (2.254) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b5_ns | 86.088 (13.912) | 97.752 (2.248) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b8_ap *tfp | 85.436 (14.564) | 97.272 (2.728) | 87.4 | bicubic | 672 | N/A |
+| tf_efficientnet_b8 *tfp | 85.384 (14.616) | 97.394 (2.606) | 87.4 | bicubic | 672 | N/A |
+| tf_efficientnet_b8 | 85.370 (14.630) | 97.390 (2.610) | 87.4 | bicubic | 672 | 0.954 |
+| tf_efficientnet_b8_ap | 85.368 (14.632) | 97.294 (2.706) | 87.4 | bicubic | 672 | 0.954 |
+| tf_efficientnet_b4_ns *tfp | 85.298 (14.702) | 97.504 (2.496) | 19.34 | bicubic | 380 | N/A |
+| tf_efficientnet_b4_ns | 85.162 (14.838) | 97.470 (2.530) | 19.34 | bicubic | 380 | 0.922 |
+| tf_efficientnet_b7_ap *tfp | 85.154 (14.846) | 97.244 (2.756) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b7_ap | 85.118 (14.882) | 97.252 (2.748) | 66.35 | bicubic | 600 | 0.949 |
+| tf_efficientnet_b7 *tfp | 84.940 (15.060) | 97.214 (2.786) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b7 | 84.932 (15.068) | 97.208 (2.792) | 66.35 | bicubic | 600 | 0.949 |
+| tf_efficientnet_b6_ap | 84.786 (15.214) | 97.138 (2.862) | 43.04 | bicubic | 528 | 0.942 |
+| tf_efficientnet_b6_ap *tfp | 84.760 (15.240) | 97.124 (2.876) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b5_ap *tfp | 84.276 (15.724) | 96.932 (3.068) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b5_ap | 84.254 (15.746) | 96.976 (3.024) | 30.39 | bicubic | 456 | 0.934 |
+| tf_efficientnet_b6 *tfp | 84.140 (15.860) | 96.852 (3.148) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b6 | 84.110 (15.890) | 96.886 (3.114) | 43.04 | bicubic | 528 | 0.942 |
+| tf_efficientnet_b3_ns *tfp | 84.054 (15.946) | 96.918 (3.082) | 12.23 | bicubic | 300 | N/A |
+| tf_efficientnet_b3_ns | 84.048 (15.952) | 96.910 (3.090) | 12.23 | bicubic | 300 | 0.904 |
+| tf_efficientnet_b5 *tfp | 83.822 (16.178) | 96.756 (3.244) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b5 | 83.812 (16.188) | 96.748 (3.252) | 30.39 | bicubic | 456 | 0.934 |
+| tf_efficientnet_b4_ap *tfp | 83.278 (16.722) | 96.376 (3.624) | 19.34 | bicubic | 380 | N/A |
+| tf_efficientnet_b4_ap | 83.248 (16.752) | 96.388 (3.612) | 19.34 | bicubic | 380 | 0.922 |
+| tf_efficientnet_b4 | 83.022 (16.978) | 96.300 (3.700) | 19.34 | bicubic | 380 | 0.922 |
+| tf_efficientnet_b4 *tfp | 82.948 (17.052) | 96.308 (3.692) | 19.34 | bicubic | 380 | N/A |
+| tf_efficientnet_b2_ns *tfp | 82.436 (17.564) | 96.268 (3.732) | 9.11 | bicubic | 260 | N/A |
+| tf_efficientnet_b2_ns | 82.380 (17.620) | 96.248 (3.752) | 9.11 | bicubic | 260 | 0.89 |
+| tf_efficientnet_b3_ap *tfp | 81.882 (18.118) | 95.662 (4.338) | 12.23 | bicubic | 300 | N/A |
+| tf_efficientnet_b3_ap | 81.828 (18.172) | 95.624 (4.376) | 12.23 | bicubic | 300 | 0.904 |
+| tf_efficientnet_b3 | 81.636 (18.364) | 95.718 (4.282) | 12.23 | bicubic | 300 | 0.904 |
+| tf_efficientnet_b3 *tfp | 81.576 (18.424) | 95.662 (4.338) | 12.23 | bicubic | 300 | N/A |
+| tf_efficientnet_lite4 | 81.528 (18.472) | 95.668 (4.332) | 13.00 | bilinear | 380 | 0.92 |
+| tf_efficientnet_b1_ns *tfp | 81.514 (18.486) | 95.776 (4.224) | 7.79 | bicubic | 240 | N/A |
+| tf_efficientnet_lite4 *tfp | 81.502 (18.498) | 95.676 (4.324) | 13.00 | bilinear | 380 | N/A |
+| tf_efficientnet_b1_ns | 81.388 (18.612) | 95.738 (4.262) | 7.79 | bicubic | 240 | 0.88 |
+| tf_efficientnet_el | 80.534 (19.466) | 95.190 (4.810) | 10.59 | bicubic | 300 | 0.904 |
+| tf_efficientnet_el *tfp | 80.476 (19.524) | 95.200 (4.800) | 10.59 | bicubic | 300 | N/A |
+| tf_efficientnet_b2_ap *tfp | 80.420 (19.580) | 95.040 (4.960) | 9.11 | bicubic | 260 | N/A |
+| tf_efficientnet_b2_ap | 80.306 (19.694) | 95.028 (4.972) | 9.11 | bicubic | 260 | 0.890 |
+| tf_efficientnet_b2 *tfp | 80.188 (19.812) | 94.974 (5.026) | 9.11 | bicubic | 260 | N/A |
+| tf_efficientnet_b2 | 80.086 (19.914) | 94.908 (5.092) | 9.11 | bicubic | 260 | 0.890 |
+| tf_efficientnet_lite3 | 79.812 (20.188) | 94.914 (5.086) | 8.20 | bilinear | 300 | 0.904 |
+| tf_efficientnet_lite3 *tfp | 79.734 (20.266) | 94.838 (5.162) | 8.20 | bilinear | 300 | N/A |
+| tf_efficientnet_b1_ap *tfp | 79.532 (20.468) | 94.378 (5.622) | 7.79 | bicubic | 240 | N/A |
+| tf_efficientnet_cc_b1_8e *tfp | 79.464 (20.536) | 94.492 (5.508) | 39.7 | bicubic | 240 | 0.88 |
+| tf_efficientnet_cc_b1_8e | 79.298 (20.702) | 94.364 (5.636) | 39.7 | bicubic | 240 | 0.88 |
+| tf_efficientnet_b1_ap | 79.278 (20.722) | 94.308 (5.692) | 7.79 | bicubic | 240 | 0.88 |
+| tf_efficientnet_b1 *tfp | 79.172 (20.828) | 94.450 (5.550) | 7.79 | bicubic | 240 | N/A |
+| tf_efficientnet_em *tfp | 78.958 (21.042) | 94.458 (5.542) | 6.90 | bicubic | 240 | N/A |
+| tf_efficientnet_b0_ns *tfp | 78.806 (21.194) | 94.496 (5.504) | 5.29 | bicubic | 224 | N/A |
+| tf_mixnet_l *tfp | 78.846 (21.154) | 94.212 (5.788) | 7.33 | bilinear | 224 | N/A |
+| tf_efficientnet_b1 | 78.826 (21.174) | 94.198 (5.802) | 7.79 | bicubic | 240 | 0.88 |
+| tf_mixnet_l | 78.770 (21.230) | 94.004 (5.996) | 7.33 | bicubic | 224 | 0.875 |
+| tf_efficientnet_em | 78.742 (21.258) | 94.332 (5.668) | 6.90 | bicubic | 240 | 0.875 |
+| tf_efficientnet_b0_ns | 78.658 (21.342) | 94.376 (5.624) | 5.29 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_8e *tfp | 78.314 (21.686) | 93.790 (6.210) | 24.0 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_8e | 77.908 (22.092) | 93.656 (6.344) | 24.0 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_4e *tfp | 77.746 (22.254) | 93.552 (6.448) | 13.3 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_4e | 77.304 (22.696) | 93.332 (6.668) | 13.3 | bicubic | 224 | 0.875 |
+| tf_efficientnet_es *tfp | 77.616 (22.384) | 93.750 (6.250) | 5.44 | bicubic | 224 | N/A |
+| tf_efficientnet_lite2 *tfp | 77.544 (22.456) | 93.800 (6.200) | 6.09 | bilinear | 260 | N/A |
+| tf_efficientnet_lite2 | 77.460 (22.540) | 93.746 (6.254) | 6.09 | bicubic | 260 | 0.89 |
+| tf_efficientnet_b0_ap *tfp | 77.514 (22.486) | 93.576 (6.424) | 5.29 | bicubic | 224 | N/A |
+| tf_efficientnet_es | 77.264 (22.736) | 93.600 (6.400) | 5.44 | bicubic | 224 | N/A |
+| tf_efficientnet_b0 *tfp | 77.258 (22.742) | 93.478 (6.522) | 5.29 | bicubic | 224 | N/A |
+| tf_efficientnet_b0_ap | 77.084 (22.916) | 93.254 (6.746) | 5.29 | bicubic | 224 | 0.875 |
+| tf_mixnet_m *tfp | 77.072 (22.928) | 93.368 (6.632) | 5.01 | bilinear | 224 | N/A |
+| tf_mixnet_m | 76.950 (23.050) | 93.156 (6.844) | 5.01 | bicubic | 224 | 0.875 |
+| tf_efficientnet_b0 | 76.848 (23.152) | 93.228 (6.772) | 5.29 | bicubic | 224 | 0.875 |
+| tf_efficientnet_lite1 *tfp | 76.764 (23.236) | 93.326 (6.674) | 5.42 | bilinear | 240 | N/A |
+| tf_efficientnet_lite1 | 76.638 (23.362) | 93.232 (6.768) | 5.42 | bicubic | 240 | 0.882 |
+| tf_mixnet_s *tfp | 75.800 (24.200) | 92.788 (7.212) | 4.13 | bilinear | 224 | N/A |
+| tf_mobilenetv3_large_100 *tfp | 75.768 (24.232) | 92.710 (7.290) | 5.48 | bilinear | 224 | N/A |
+| tf_mixnet_s | 75.648 (24.352) | 92.636 (7.364) | 4.13 | bicubic | 224 | 0.875 |
+| tf_mobilenetv3_large_100 | 75.516 (24.484) | 92.600 (7.400) | 5.48 | bilinear | 224 | 0.875 |
+| tf_efficientnet_lite0 *tfp | 75.074 (24.926) | 92.314 (7.686) | 4.65 | bilinear | 224 | N/A |
+| tf_efficientnet_lite0 | 74.842 (25.158) | 92.170 (7.830) | 4.65 | bicubic | 224 | 0.875 |
+| tf_mobilenetv3_large_075 *tfp | 73.730 (26.270) | 91.616 (8.384) | 3.99 | bilinear | 224 | N/A |
+| tf_mobilenetv3_large_075 | 73.442 (26.558) | 91.352 (8.648) | 3.99 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_large_minimal_100 *tfp | 72.678 (27.322) | 90.860 (9.140) | 3.92 | bilinear | 224 | N/A |
+| tf_mobilenetv3_large_minimal_100 | 72.244 (27.756) | 90.636 (9.364) | 3.92 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_small_100 *tfp | 67.918 (32.082) | 87.958 (12.042) | 2.54 | bilinear | 224 | N/A |
+| tf_mobilenetv3_small_100 | 67.918 (32.082) | 87.662 (12.338) | 2.54 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_small_075 *tfp | 66.142 (33.858) | 86.498 (13.502) | 2.04 | bilinear | 224 | N/A |
+| tf_mobilenetv3_small_075 | 65.718 (34.282) | 86.136 (13.864) | 2.04 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_small_minimal_100 *tfp | 63.378 (36.622) | 84.802 (15.198) | 2.04 | bilinear | 224 | N/A |
+| tf_mobilenetv3_small_minimal_100 | 62.898 (37.102) | 84.230 (15.770) | 2.04 | bilinear | 224 | 0.875 |
+
+
+*tfp models were validated with the `--tf-preprocessing` pipeline
+
+Google TF and TFLite weights were ported from the official Tensorflow repositories:
+* https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+* https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet
+* https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet
+
+## Usage
+
+### Environment
+
+All development and testing has been done in Conda Python 3 environments on Linux x86-64 systems, specifically with Python 3.6.x, 3.7.x, and 3.8.x.
+
+Users have reported that a Python 3 Anaconda install in Windows works. I have not verified this myself.
+
+PyTorch versions 1.4, 1.5, 1.6 have been tested with this code.
+
+I've tried to keep the dependencies minimal; the setup follows the PyTorch default install instructions for Conda:
+```
+conda create -n torch-env
+conda activate torch-env
+conda install -c pytorch pytorch torchvision cudatoolkit=10.2
+```
+
+### PyTorch Hub
+
+Models can be accessed via the PyTorch Hub API:
+
+```
+>>> import torch
+>>> torch.hub.list('rwightman/gen-efficientnet-pytorch')
+['efficientnet_b0', ...]
+>>> model = torch.hub.load('rwightman/gen-efficientnet-pytorch', 'efficientnet_b0', pretrained=True)
+>>> model.eval()
+>>> output = model(torch.randn(1,3,224,224))
+```
+
+### Pip
+This package can be installed via pip.
+
+Install (after conda env/install):
+```
+pip install geffnet
+```
+
+Eval use:
+```
+>>> import geffnet
+>>> m = geffnet.create_model('mobilenetv3_large_100', pretrained=True)
+>>> m.eval()
+```
+
+Train use:
+```
+>>> import geffnet
+>>> # models can also be created by using the entrypoint directly
+>>> m = geffnet.efficientnet_b2(pretrained=True, drop_rate=0.25, drop_connect_rate=0.2)
+>>> m.train()
+```
+
+Create the model in an `nn.Sequential` container, for fast.ai, etc.:
+```
+>>> import geffnet
+>>> m = geffnet.mixnet_l(pretrained=True, drop_rate=0.25, drop_connect_rate=0.2, as_sequential=True)
+```
+
+### Exporting
+
+Scripts are included to:
+* export models to ONNX (`onnx_export.py`)
+* optimize the ONNX graph (`onnx_optimize.py` or `onnx_validate.py` w/ the `--onnx-output-opt` arg)
+* validate with ONNX runtime (`onnx_validate.py`)
+* convert an ONNX model to Caffe2 (`onnx_to_caffe.py`)
+* validate in Caffe2 (`caffe2_validate.py`)
+* benchmark in Caffe2 w/ FLOPs and parameter output (`caffe2_benchmark.py`)
+
+As an example, to export the MobileNet-V3 pretrained model and then run an Imagenet validation:
+```
+python onnx_export.py --model mobilenetv3_large_100 ./mobilenetv3_100.onnx
+python onnx_validate.py /imagenet/validation/ --onnx-input ./mobilenetv3_100.onnx
+```
+
+These scripts were tested and working as of PyTorch 1.6 and ONNX 1.7 w/ ONNX runtime 1.4. Caffe2-compatible
+export now requires additional args mentioned in the export script (not needed in earlier versions).
+
+#### Export Notes
+1. The TF ported weights with the 'SAME' conv padding activated cannot be exported to ONNX unless the `_EXPORTABLE` flag in `config.py` is set to True. Use `config.set_exportable(True)` as in the `onnx_export.py` script (see the sketch below).
+2. TF ported models with 'SAME' padding will have the padding fixed at export time to the resolution used for export. Even though dynamic padding is supported in opset >= 11, I can't get it working.
+3. The ONNX optimize facility doesn't work reliably in PyTorch 1.6 / ONNX 1.7. Fortunately, onnxruntime-based inference works very well now and includes on-the-fly optimization.
+4. ONNX / Caffe2 export/import frequently breaks with different PyTorch and ONNX version releases. Please check their respective issue trackers before filing issues here.
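+
+A minimal sketch of the export flow from notes 1 and 2 (the model name, output file name, and 224x224 export size here are illustrative choices, not the `onnx_export.py` defaults):
+```
+>>> import torch
+>>> import geffnet
+>>> geffnet.config.set_exportable(True)  # swap in ONNX-exportable 'SAME' padding layers
+>>> m = geffnet.create_model('tf_efficientnet_b0', pretrained=True)
+>>> m.eval()
+>>> # per note 2, 'SAME' padding is fixed to this input resolution in the exported graph
+>>> torch.onnx.export(m, torch.randn(1, 3, 224, 224), 'tf_efficientnet_b0.onnx')
+```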
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/caffe2_benchmark.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/caffe2_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cb555acef42578e49430085c553678ca6feb0d1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/caffe2_benchmark.py
@@ -0,0 +1,65 @@
+""" Caffe2 validation script
+
+This script runs Caffe2 benchmark on exported ONNX model.
+It is a useful tool for reporting model FLOPS.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+from caffe2.python import core, workspace, model_helper
+from caffe2.proto import caffe2_pb2
+
+
+parser = argparse.ArgumentParser(description='Caffe2 Model Benchmark')
+parser.add_argument('--c2-prefix', default='', type=str, metavar='NAME',
+ help='caffe2 model pb name prefix')
+parser.add_argument('--c2-init', default='', type=str, metavar='PATH',
+ help='caffe2 model init .pb')
+parser.add_argument('--c2-predict', default='', type=str, metavar='PATH',
+ help='caffe2 model predict .pb')
+parser.add_argument('-b', '--batch-size', default=1, type=int,
+ metavar='N', help='mini-batch size (default: 1)')
+parser.add_argument('--img-size', default=224, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+
+
+def main():
+ args = parser.parse_args()
+ args.gpu_id = 0
+ if args.c2_prefix:
+ args.c2_init = args.c2_prefix + '.init.pb'
+ args.c2_predict = args.c2_prefix + '.predict.pb'
+
+ model = model_helper.ModelHelper(name="le_net", init_params=False)
+
+ # Bring in the init net from init_net.pb
+ init_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_init, "rb") as f:
+ init_net_proto.ParseFromString(f.read())
+ model.param_init_net = core.Net(init_net_proto)
+
+ # bring in the predict net from predict_net.pb
+ predict_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_predict, "rb") as f:
+ predict_net_proto.ParseFromString(f.read())
+ model.net = core.Net(predict_net_proto)
+
+ # CUDA performance not impressive
+ #device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id)
+ #model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+ #model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+
+ input_blob = model.net.external_inputs[0]
+ model.param_init_net.GaussianFill(
+ [],
+ input_blob.GetUnscopedName(),
+ shape=(args.batch_size, 3, args.img_size, args.img_size),
+ mean=0.0,
+ std=1.0)
+ workspace.RunNetOnce(model.param_init_net)
+ workspace.CreateNet(model.net, overwrite=True)
+ workspace.BenchmarkNet(model.net.Proto().name, 5, 20, True)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/caffe2_validate.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/caffe2_validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..2459648ec15c5ec0642ef35418c22c575b9391ac
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/caffe2_validate.py
@@ -0,0 +1,138 @@
+""" Caffe2 validation script
+
+This script is created to verify exported ONNX models running in Caffe2
+It utilizes the same PyTorch dataloader/processing pipeline for a
+fair comparison against the originals.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import numpy as np
+from caffe2.python import core, workspace, model_helper
+from caffe2.proto import caffe2_pb2
+from data import create_loader, resolve_data_config, Dataset
+from utils import AverageMeter
+import time
+
+parser = argparse.ArgumentParser(description='Caffe2 ImageNet Validation')
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--c2-prefix', default='', type=str, metavar='NAME',
+ help='caffe2 model pb name prefix')
+parser.add_argument('--c2-init', default='', type=str, metavar='PATH',
+ help='caffe2 model init .pb')
+parser.add_argument('--c2-predict', default='', type=str, metavar='PATH',
+ help='caffe2 model predict .pb')
+parser.add_argument('-j', '--workers', default=2, type=int, metavar='N',
+ help='number of data loading workers (default: 2)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+ metavar='N', help='mini-batch size (default: 256)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT',
+ help='Override default crop pct of 0.875')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true',
+                    help='use tensorflow mnasnet preprocessing')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+ metavar='N', help='print frequency (default: 10)')
+
+
+def main():
+ args = parser.parse_args()
+ args.gpu_id = 0
+ if args.c2_prefix:
+ args.c2_init = args.c2_prefix + '.init.pb'
+ args.c2_predict = args.c2_prefix + '.predict.pb'
+
+ model = model_helper.ModelHelper(name="validation_net", init_params=False)
+
+ # Bring in the init net from init_net.pb
+ init_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_init, "rb") as f:
+ init_net_proto.ParseFromString(f.read())
+ model.param_init_net = core.Net(init_net_proto)
+
+ # bring in the predict net from predict_net.pb
+ predict_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_predict, "rb") as f:
+ predict_net_proto.ParseFromString(f.read())
+ model.net = core.Net(predict_net_proto)
+
+ data_config = resolve_data_config(None, args)
+ loader = create_loader(
+ Dataset(args.data, load_bytes=args.tf_preprocessing),
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ use_prefetcher=False,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ crop_pct=data_config['crop_pct'],
+ tensorflow_preprocessing=args.tf_preprocessing)
+
+ # this is so obvious, wonderful interface
+ input_blob = model.net.external_inputs[0]
+ output_blob = model.net.external_outputs[0]
+
+ if True:
+ device_opts = None
+ else:
+ # CUDA is crashing, no idea why, awesome error message, give it a try for kicks
+ device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id)
+ model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+ model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+
+ model.param_init_net.GaussianFill(
+ [], input_blob.GetUnscopedName(),
+ shape=(1,) + data_config['input_size'], mean=0.0, std=1.0)
+ workspace.RunNetOnce(model.param_init_net)
+ workspace.CreateNet(model.net, overwrite=True)
+
+ batch_time = AverageMeter()
+ top1 = AverageMeter()
+ top5 = AverageMeter()
+ end = time.time()
+ for i, (input, target) in enumerate(loader):
+ # run the net and return prediction
+ caffe2_in = input.data.numpy()
+ workspace.FeedBlob(input_blob, caffe2_in, device_opts)
+ workspace.RunNet(model.net, num_iter=1)
+ output = workspace.FetchBlob(output_blob)
+
+ # measure accuracy and record loss
+ prec1, prec5 = accuracy_np(output.data, target.numpy())
+ top1.update(prec1.item(), input.size(0))
+ top5.update(prec5.item(), input.size(0))
+
+ # measure elapsed time
+ batch_time.update(time.time() - end)
+ end = time.time()
+
+ if i % args.print_freq == 0:
+ print('Test: [{0}/{1}]\t'
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s, {ms_avg:.3f} ms/sample) \t'
+ 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+ 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+ i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg,
+                    ms_avg=1000 * batch_time.avg / input.size(0), top1=top1, top5=top5))
+
+ print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format(
+ top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg))
+
+
+def accuracy_np(output, target):
+ max_indices = np.argsort(output, axis=1)[:, ::-1]
+ top5 = 100 * np.equal(max_indices[:, :5], target[:, np.newaxis]).sum(axis=1).mean()
+ top1 = 100 * np.equal(max_indices[:, 0], target).mean()
+ return top1, top5
+
+
+if __name__ == '__main__':
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c53c1e046d1a5de6d11400f3b294f834d3a2b3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/__init__.py
@@ -0,0 +1,5 @@
+from .gen_efficientnet import *
+from .mobilenetv3 import *
+from .model_factory import create_model
+from .config import is_exportable, is_scriptable, set_exportable, set_scriptable
+from .activations import *
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2167fd724fe17450444f8e79b12bbdb6b0b37ebd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/__init__.py
@@ -0,0 +1,137 @@
+from geffnet import config
+from geffnet.activations.activations_me import *
+from geffnet.activations.activations_jit import *
+from geffnet.activations.activations import *
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+_has_silu = 'silu' in dir(torch.nn.functional)
+
+_ACT_FN_DEFAULT = dict(
+ silu=F.silu if _has_silu else swish,
+ swish=F.silu if _has_silu else swish,
+ mish=mish,
+ relu=F.relu,
+ relu6=F.relu6,
+ sigmoid=sigmoid,
+ tanh=tanh,
+ hard_sigmoid=hard_sigmoid,
+ hard_swish=hard_swish,
+)
+
+_ACT_FN_JIT = dict(
+ silu=F.silu if _has_silu else swish_jit,
+ swish=F.silu if _has_silu else swish_jit,
+ mish=mish_jit,
+)
+
+_ACT_FN_ME = dict(
+ silu=F.silu if _has_silu else swish_me,
+ swish=F.silu if _has_silu else swish_me,
+ mish=mish_me,
+ hard_swish=hard_swish_me,
+    hard_sigmoid=hard_sigmoid_me,
+)
+
+_ACT_LAYER_DEFAULT = dict(
+ silu=nn.SiLU if _has_silu else Swish,
+ swish=nn.SiLU if _has_silu else Swish,
+ mish=Mish,
+ relu=nn.ReLU,
+ relu6=nn.ReLU6,
+ sigmoid=Sigmoid,
+ tanh=Tanh,
+ hard_sigmoid=HardSigmoid,
+ hard_swish=HardSwish,
+)
+
+_ACT_LAYER_JIT = dict(
+ silu=nn.SiLU if _has_silu else SwishJit,
+ swish=nn.SiLU if _has_silu else SwishJit,
+ mish=MishJit,
+)
+
+_ACT_LAYER_ME = dict(
+ silu=nn.SiLU if _has_silu else SwishMe,
+ swish=nn.SiLU if _has_silu else SwishMe,
+ mish=MishMe,
+ hard_swish=HardSwishMe,
+ hard_sigmoid=HardSigmoidMe
+)
+
+_OVERRIDE_FN = dict()
+_OVERRIDE_LAYER = dict()
+
+
+def add_override_act_fn(name, fn):
+ global _OVERRIDE_FN
+ _OVERRIDE_FN[name] = fn
+
+
+def update_override_act_fn(overrides):
+ assert isinstance(overrides, dict)
+ global _OVERRIDE_FN
+ _OVERRIDE_FN.update(overrides)
+
+
+def clear_override_act_fn():
+ global _OVERRIDE_FN
+ _OVERRIDE_FN = dict()
+
+
+def add_override_act_layer(name, fn):
+ _OVERRIDE_LAYER[name] = fn
+
+
+def update_override_act_layer(overrides):
+ assert isinstance(overrides, dict)
+ global _OVERRIDE_LAYER
+ _OVERRIDE_LAYER.update(overrides)
+
+
+def clear_override_act_layer():
+ global _OVERRIDE_LAYER
+ _OVERRIDE_LAYER = dict()
+
+
+def get_act_fn(name='relu'):
+ """ Activation Function Factory
+ Fetching activation fns by name with this function allows export or torch script friendly
+ functions to be returned dynamically based on current config.
+ """
+ if name in _OVERRIDE_FN:
+ return _OVERRIDE_FN[name]
+ use_me = not (config.is_exportable() or config.is_scriptable() or config.is_no_jit())
+ if use_me and name in _ACT_FN_ME:
+ # If not exporting or scripting the model, first look for a memory optimized version
+ # activation with custom autograd, then fallback to jit scripted, then a Python or Torch builtin
+ return _ACT_FN_ME[name]
+ if config.is_exportable() and name in ('silu', 'swish'):
+ # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack
+ return swish
+ use_jit = not (config.is_exportable() or config.is_no_jit())
+ # NOTE: export tracing should work with jit scripted components, but I keep running into issues
+ if use_jit and name in _ACT_FN_JIT: # jit scripted models should be okay for export/scripting
+ return _ACT_FN_JIT[name]
+ return _ACT_FN_DEFAULT[name]
+
+
+def get_act_layer(name='relu'):
+ """ Activation Layer Factory
+ Fetching activation layers by name with this function allows export or torch script friendly
+ functions to be returned dynamically based on current config.
+ """
+ if name in _OVERRIDE_LAYER:
+ return _OVERRIDE_LAYER[name]
+ use_me = not (config.is_exportable() or config.is_scriptable() or config.is_no_jit())
+ if use_me and name in _ACT_LAYER_ME:
+ return _ACT_LAYER_ME[name]
+ if config.is_exportable() and name in ('silu', 'swish'):
+ # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack
+ return Swish
+ use_jit = not (config.is_exportable() or config.is_no_jit())
+ # NOTE: export tracing should work with jit scripted components, but I keep running into issues
+    if use_jit and name in _ACT_LAYER_JIT: # jit scripted models should be okay for export/scripting
+ return _ACT_LAYER_JIT[name]
+ return _ACT_LAYER_DEFAULT[name]
+
+
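+# Usage sketch (illustrative): resolve activations by name; whether the memory-efficient, jit scripted,
+# or plain variant is returned depends on the current exportable/scriptable/no_jit config flags.
+#
+#   act_layer = get_act_layer('hard_swish')   # e.g. HardSwishMe or HardSwish depending on config
+#   act_fn = get_act_fn('swish')              # e.g. swish_me, swish_jit, swish, or F.silu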
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0ae1758b9537a0c200a1ff9cb4824efb5258ea9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations.py
@@ -0,0 +1,102 @@
+""" Activations
+
+A collection of activation fns and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+Copyright 2020 Ross Wightman
+"""
+from torch import nn as nn
+from torch.nn import functional as F
+
+
+def swish(x, inplace: bool = False):
+ """Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3)
+ and also as Swish (https://arxiv.org/abs/1710.05941).
+
+ TODO Rename to SiLU with addition to PyTorch
+ """
+ return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
+
+
+class Swish(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Swish, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return swish(x, self.inplace)
+
+
+def mish(x, inplace: bool = False):
+ """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+ """
+ return x.mul(F.softplus(x).tanh())
+
+
+class Mish(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Mish, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return mish(x, self.inplace)
+
+
+def sigmoid(x, inplace: bool = False):
+ return x.sigmoid_() if inplace else x.sigmoid()
+
+
+# PyTorch has this, but not with a consistent inplace argument interface
+class Sigmoid(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Sigmoid, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return x.sigmoid_() if self.inplace else x.sigmoid()
+
+
+def tanh(x, inplace: bool = False):
+ return x.tanh_() if inplace else x.tanh()
+
+
+# PyTorch has this, but not with a consistent inplace argument interface
+class Tanh(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Tanh, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return x.tanh_() if self.inplace else x.tanh()
+
+
+def hard_swish(x, inplace: bool = False):
+ inner = F.relu6(x + 3.).div_(6.)
+ return x.mul_(inner) if inplace else x.mul(inner)
+
+
+class HardSwish(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSwish, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return hard_swish(x, self.inplace)
+
+
+def hard_sigmoid(x, inplace: bool = False):
+ if inplace:
+ return x.add_(3.).clamp_(0., 6.).div_(6.)
+ else:
+ return F.relu6(x + 3.) / 6.
+
+
+class HardSigmoid(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSigmoid, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return hard_sigmoid(x, self.inplace)
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations_jit.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations_jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25d0fa87db91b75f1346f5579090687287dd025
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations_jit.py
@@ -0,0 +1,79 @@
+""" Activations (jit)
+
+A collection of jit-scripted activation fns and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+All jit scripted activations intentionally lack in-place variations; scripted kernel fusion does not
+currently work across in-place op boundaries, so performance would be equal to or less than the
+non-scripted versions if they contained in-place ops.
+
+Copyright 2020 Ross Wightman
+"""
+
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+__all__ = ['swish_jit', 'SwishJit', 'mish_jit', 'MishJit',
+ 'hard_sigmoid_jit', 'HardSigmoidJit', 'hard_swish_jit', 'HardSwishJit']
+
+
+@torch.jit.script
+def swish_jit(x, inplace: bool = False):
+ """Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3)
+ and also as Swish (https://arxiv.org/abs/1710.05941).
+
+ TODO Rename to SiLU with addition to PyTorch
+ """
+ return x.mul(x.sigmoid())
+
+
+@torch.jit.script
+def mish_jit(x, _inplace: bool = False):
+ """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+ """
+ return x.mul(F.softplus(x).tanh())
+
+
+class SwishJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(SwishJit, self).__init__()
+
+ def forward(self, x):
+ return swish_jit(x)
+
+
+class MishJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(MishJit, self).__init__()
+
+ def forward(self, x):
+ return mish_jit(x)
+
+
+@torch.jit.script
+def hard_sigmoid_jit(x, inplace: bool = False):
+ # return F.relu6(x + 3.) / 6.
+ return (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster?
+
+
+class HardSigmoidJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSigmoidJit, self).__init__()
+
+ def forward(self, x):
+ return hard_sigmoid_jit(x)
+
+
+@torch.jit.script
+def hard_swish_jit(x, inplace: bool = False):
+ # return x * (F.relu6(x + 3.) / 6)
+ return x * (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster?
+
+
+class HardSwishJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSwishJit, self).__init__()
+
+ def forward(self, x):
+ return hard_swish_jit(x)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations_me.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations_me.py
new file mode 100644
index 0000000000000000000000000000000000000000..45dc472a1f8d3c3539fca746124482ade24c8613
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/activations/activations_me.py
@@ -0,0 +1,174 @@
+""" Activations (memory-efficient w/ custom autograd)
+
+A collection of activation fns and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+These activations are not compatible with jit scripting or ONNX export of the model, please use either
+the JIT or basic versions of the activations.
+
+Copyright 2020 Ross Wightman
+"""
+
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+
+__all__ = ['swish_me', 'SwishMe', 'mish_me', 'MishMe',
+ 'hard_sigmoid_me', 'HardSigmoidMe', 'hard_swish_me', 'HardSwishMe']
+
+
+@torch.jit.script
+def swish_jit_fwd(x):
+ return x.mul(torch.sigmoid(x))
+
+
+@torch.jit.script
+def swish_jit_bwd(x, grad_output):
+ x_sigmoid = torch.sigmoid(x)
+ return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
+
+
+class SwishJitAutoFn(torch.autograd.Function):
+ """ torch.jit.script optimised Swish w/ memory-efficient checkpoint
+    Inspired by conversation btw Jeremy Howard & Adam Paszke
+ https://twitter.com/jeremyphoward/status/1188251041835315200
+
+ Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3)
+ and also as Swish (https://arxiv.org/abs/1710.05941).
+
+ TODO Rename to SiLU with addition to PyTorch
+ """
+
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return swish_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return swish_jit_bwd(x, grad_output)
+
+
+def swish_me(x, inplace=False):
+ return SwishJitAutoFn.apply(x)
+
+
+class SwishMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(SwishMe, self).__init__()
+
+ def forward(self, x):
+ return SwishJitAutoFn.apply(x)
+
+
+@torch.jit.script
+def mish_jit_fwd(x):
+ return x.mul(torch.tanh(F.softplus(x)))
+
+
+@torch.jit.script
+def mish_jit_bwd(x, grad_output):
+ x_sigmoid = torch.sigmoid(x)
+ x_tanh_sp = F.softplus(x).tanh()
+ return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
+
+
+class MishJitAutoFn(torch.autograd.Function):
+ """ Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+ A memory efficient, jit scripted variant of Mish
+ """
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return mish_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return mish_jit_bwd(x, grad_output)
+
+
+def mish_me(x, inplace=False):
+ return MishJitAutoFn.apply(x)
+
+
+class MishMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(MishMe, self).__init__()
+
+ def forward(self, x):
+ return MishJitAutoFn.apply(x)
+
+
+@torch.jit.script
+def hard_sigmoid_jit_fwd(x, inplace: bool = False):
+ return (x + 3).clamp(min=0, max=6).div(6.)
+
+
+@torch.jit.script
+def hard_sigmoid_jit_bwd(x, grad_output):
+ m = torch.ones_like(x) * ((x >= -3.) & (x <= 3.)) / 6.
+ return grad_output * m
+
+
+class HardSigmoidJitAutoFn(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return hard_sigmoid_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return hard_sigmoid_jit_bwd(x, grad_output)
+
+
+def hard_sigmoid_me(x, inplace: bool = False):
+ return HardSigmoidJitAutoFn.apply(x)
+
+
+class HardSigmoidMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSigmoidMe, self).__init__()
+
+ def forward(self, x):
+ return HardSigmoidJitAutoFn.apply(x)
+
+
+@torch.jit.script
+def hard_swish_jit_fwd(x):
+ return x * (x + 3).clamp(min=0, max=6).div(6.)
+
+
+@torch.jit.script
+def hard_swish_jit_bwd(x, grad_output):
+ m = torch.ones_like(x) * (x >= 3.)
+ m = torch.where((x >= -3.) & (x <= 3.), x / 3. + .5, m)
+ return grad_output * m
+
+
+class HardSwishJitAutoFn(torch.autograd.Function):
+ """A memory efficient, jit-scripted HardSwish activation"""
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return hard_swish_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return hard_swish_jit_bwd(x, grad_output)
+
+
+def hard_swish_me(x, inplace=False):
+ return HardSwishJitAutoFn.apply(x)
+
+
+class HardSwishMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSwishMe, self).__init__()
+
+ def forward(self, x):
+ return HardSwishJitAutoFn.apply(x)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/config.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ad21bbcbbc28f1b490b930ec369ccbf87f122d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/config.py
@@ -0,0 +1,123 @@
+""" Global layer config state
+"""
+from typing import Any, Optional
+
+__all__ = [
+ 'is_exportable', 'is_scriptable', 'is_no_jit', 'layer_config_kwargs',
+ 'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config'
+]
+
+# Set to True if you prefer layers with no jit optimization (includes activations)
+_NO_JIT = False
+
+# Set to True if you prefer activation layers with no jit optimization
+# NOTE: not currently used, as there is no difference between no_jit and no_activation_jit while the only
+# layers obeying the jit flags are activations. This will change as more layers are updated and/or added.
+_NO_ACTIVATION_JIT = False
+
+# Set to True if exporting a model with Same padding via ONNX
+_EXPORTABLE = False
+
+# Set to True if wanting to use torch.jit.script on a model
+_SCRIPTABLE = False
+
+
+def is_no_jit():
+ return _NO_JIT
+
+
+class set_no_jit:
+ def __init__(self, mode: bool) -> None:
+ global _NO_JIT
+ self.prev = _NO_JIT
+ _NO_JIT = mode
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _NO_JIT
+ _NO_JIT = self.prev
+ return False
+
+
+def is_exportable():
+ return _EXPORTABLE
+
+
+class set_exportable:
+ def __init__(self, mode: bool) -> None:
+ global _EXPORTABLE
+ self.prev = _EXPORTABLE
+ _EXPORTABLE = mode
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _EXPORTABLE
+ _EXPORTABLE = self.prev
+ return False
+
+
+def is_scriptable():
+ return _SCRIPTABLE
+
+
+class set_scriptable:
+ def __init__(self, mode: bool) -> None:
+ global _SCRIPTABLE
+ self.prev = _SCRIPTABLE
+ _SCRIPTABLE = mode
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _SCRIPTABLE
+ _SCRIPTABLE = self.prev
+ return False
+
+
+class set_layer_config:
+ """ Layer config context manager that allows setting all layer config flags at once.
+ If a flag arg is None, it will not change the current value.
+ """
+ def __init__(
+ self,
+ scriptable: Optional[bool] = None,
+ exportable: Optional[bool] = None,
+ no_jit: Optional[bool] = None,
+ no_activation_jit: Optional[bool] = None):
+ global _SCRIPTABLE
+ global _EXPORTABLE
+ global _NO_JIT
+ global _NO_ACTIVATION_JIT
+ self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT
+ if scriptable is not None:
+ _SCRIPTABLE = scriptable
+ if exportable is not None:
+ _EXPORTABLE = exportable
+ if no_jit is not None:
+ _NO_JIT = no_jit
+ if no_activation_jit is not None:
+ _NO_ACTIVATION_JIT = no_activation_jit
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _SCRIPTABLE
+ global _EXPORTABLE
+ global _NO_JIT
+ global _NO_ACTIVATION_JIT
+ _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev
+ return False
+
+
+def layer_config_kwargs(kwargs):
+ """ Consume config kwargs and return contextmgr obj """
+ return set_layer_config(
+ scriptable=kwargs.pop('scriptable', None),
+ exportable=kwargs.pop('exportable', None),
+ no_jit=kwargs.pop('no_jit', None))
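+
+
+# Usage sketch (illustrative): flags are applied in __init__ and restored on __exit__, so layers
+# constructed inside the block pick up the requested config:
+#
+#   with set_layer_config(exportable=True):
+#       model = geffnet.create_model('efficientnet_b0')  # built with export-friendly layers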
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/conv2d_layers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/conv2d_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..c49afbf0baa39a605cdf78d9e52d5f697b643370
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/conv2d_layers.py
@@ -0,0 +1,304 @@
+""" Conv2D w/ SAME padding, CondConv, MixedConv
+
+A collection of conv layers and padding helpers needed by EfficientNet, MixNet, and
+MobileNetV3 models that maintain weight compatibility with original Tensorflow models.
+
+Copyright 2020 Ross Wightman
+"""
+import collections.abc
+import math
+from functools import partial
+from itertools import repeat
+from typing import Tuple, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .config import *
+
+
+# From PyTorch internals
+def _ntuple(n):
+ def parse(x):
+ if isinstance(x, collections.abc.Iterable):
+ return x
+ return tuple(repeat(x, n))
+ return parse
+
+
+_single = _ntuple(1)
+_pair = _ntuple(2)
+_triple = _ntuple(3)
+_quadruple = _ntuple(4)
+
+
+def _is_static_pad(kernel_size, stride=1, dilation=1, **_):
+ return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0
+
+
+def _get_padding(kernel_size, stride=1, dilation=1, **_):
+ padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
+ return padding
+
+
+def _calc_same_pad(i: int, k: int, s: int, d: int):
+ return max((-(i // -s) - 1) * s + (k - 1) * d + 1 - i, 0)
+
+
+def _same_pad_arg(input_size, kernel_size, stride, dilation):
+ ih, iw = input_size
+ kh, kw = kernel_size
+ pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0])
+ pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1])
+ return [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]
+
+
+def _split_channels(num_chan, num_groups):
+ split = [num_chan // num_groups for _ in range(num_groups)]
+ split[0] += num_chan - sum(split)
+ return split
+
+
+def conv2d_same(
+ x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1),
+ padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1):
+ ih, iw = x.size()[-2:]
+ kh, kw = weight.size()[-2:]
+ pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0])
+ pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1])
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
+ return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups)
+
+
+class Conv2dSame(nn.Conv2d):
+ """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
+ """
+
+ # pylint: disable=unused-argument
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+ padding=0, dilation=1, groups=1, bias=True):
+ super(Conv2dSame, self).__init__(
+ in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
+
+ def forward(self, x):
+ return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+class Conv2dSameExport(nn.Conv2d):
+ """ ONNX export friendly Tensorflow like 'SAME' convolution wrapper for 2D convolutions
+
+ NOTE: This does not currently work with torch.jit.script
+ """
+
+ # pylint: disable=unused-argument
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+ padding=0, dilation=1, groups=1, bias=True):
+ super(Conv2dSameExport, self).__init__(
+ in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
+ self.pad = None
+ self.pad_input_size = (0, 0)
+
+ def forward(self, x):
+ input_size = x.size()[-2:]
+ if self.pad is None:
+ pad_arg = _same_pad_arg(input_size, self.weight.size()[-2:], self.stride, self.dilation)
+ self.pad = nn.ZeroPad2d(pad_arg)
+ self.pad_input_size = input_size
+
+ if self.pad is not None:
+ x = self.pad(x)
+ return F.conv2d(
+ x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+def get_padding_value(padding, kernel_size, **kwargs):
+ dynamic = False
+ if isinstance(padding, str):
+ # for any string padding, the padding will be calculated for you, one of three ways
+ padding = padding.lower()
+ if padding == 'same':
+ # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact
+ if _is_static_pad(kernel_size, **kwargs):
+ # static case, no extra overhead
+ padding = _get_padding(kernel_size, **kwargs)
+ else:
+ # dynamic padding
+ padding = 0
+ dynamic = True
+ elif padding == 'valid':
+ # 'VALID' padding, same as padding=0
+ padding = 0
+ else:
+ # Default to PyTorch style 'same'-ish symmetric padding
+ padding = _get_padding(kernel_size, **kwargs)
+ return padding, dynamic
+
+
+def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
+ padding = kwargs.pop('padding', '')
+ kwargs.setdefault('bias', False)
+ padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs)
+ if is_dynamic:
+ if is_exportable():
+ assert not is_scriptable()
+ return Conv2dSameExport(in_chs, out_chs, kernel_size, **kwargs)
+ else:
+ return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
+ else:
+ return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
+
+
+class MixedConv2d(nn.ModuleDict):
+ """ Mixed Grouped Convolution
+ Based on MDConv and GroupedConv in MixNet impl:
+ https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
+ """
+
+ def __init__(self, in_channels, out_channels, kernel_size=3,
+ stride=1, padding='', dilation=1, depthwise=False, **kwargs):
+ super(MixedConv2d, self).__init__()
+
+ kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size]
+ num_groups = len(kernel_size)
+ in_splits = _split_channels(in_channels, num_groups)
+ out_splits = _split_channels(out_channels, num_groups)
+ self.in_channels = sum(in_splits)
+ self.out_channels = sum(out_splits)
+ for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
+ conv_groups = out_ch if depthwise else 1
+ self.add_module(
+ str(idx),
+ create_conv2d_pad(
+ in_ch, out_ch, k, stride=stride,
+ padding=padding, dilation=dilation, groups=conv_groups, **kwargs)
+ )
+ self.splits = in_splits
+
+ def forward(self, x):
+ x_split = torch.split(x, self.splits, 1)
+ x_out = [conv(x_split[i]) for i, conv in enumerate(self.values())]
+ x = torch.cat(x_out, 1)
+ return x
+
+
+def get_condconv_initializer(initializer, num_experts, expert_shape):
+ def condconv_initializer(weight):
+ """CondConv initializer function."""
+ num_params = np.prod(expert_shape)
+ if (len(weight.shape) != 2 or weight.shape[0] != num_experts or
+ weight.shape[1] != num_params):
+ raise (ValueError(
+ 'CondConv variables must have shape [num_experts, num_params]'))
+ for i in range(num_experts):
+ initializer(weight[i].view(expert_shape))
+ return condconv_initializer
+
+
+class CondConv2d(nn.Module):
+ """ Conditional Convolution
+ Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
+
+ Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
+ https://github.com/pytorch/pytorch/issues/17983
+ """
+ __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
+
+ def __init__(self, in_channels, out_channels, kernel_size=3,
+ stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
+ super(CondConv2d, self).__init__()
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = _pair(kernel_size)
+ self.stride = _pair(stride)
+ padding_val, is_padding_dynamic = get_padding_value(
+ padding, kernel_size, stride=stride, dilation=dilation)
+ self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript
+ self.padding = _pair(padding_val)
+ self.dilation = _pair(dilation)
+ self.groups = groups
+ self.num_experts = num_experts
+
+ self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
+ weight_num_param = 1
+ for wd in self.weight_shape:
+ weight_num_param *= wd
+ self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
+
+ if bias:
+ self.bias_shape = (self.out_channels,)
+ self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
+ else:
+ self.register_parameter('bias', None)
+
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ init_weight = get_condconv_initializer(
+ partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
+ init_weight(self.weight)
+ if self.bias is not None:
+ fan_in = np.prod(self.weight_shape[1:])
+ bound = 1 / math.sqrt(fan_in)
+ init_bias = get_condconv_initializer(
+ partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
+ init_bias(self.bias)
+
+ def forward(self, x, routing_weights):
+ B, C, H, W = x.shape
+ weight = torch.matmul(routing_weights, self.weight)
+ new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
+ weight = weight.view(new_weight_shape)
+ bias = None
+ if self.bias is not None:
+ bias = torch.matmul(routing_weights, self.bias)
+ bias = bias.view(B * self.out_channels)
+ # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
+ x = x.view(1, B * C, H, W)
+ if self.dynamic_padding:
+ out = conv2d_same(
+ x, weight, bias, stride=self.stride, padding=self.padding,
+ dilation=self.dilation, groups=self.groups * B)
+ else:
+ out = F.conv2d(
+ x, weight, bias, stride=self.stride, padding=self.padding,
+ dilation=self.dilation, groups=self.groups * B)
+ out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
+
+ # Literal port (from TF definition)
+ # x = torch.split(x, 1, 0)
+ # weight = torch.split(weight, 1, 0)
+ # if self.bias is not None:
+ # bias = torch.matmul(routing_weights, self.bias)
+ # bias = torch.split(bias, 1, 0)
+ # else:
+ # bias = [None] * B
+ # out = []
+ # for xi, wi, bi in zip(x, weight, bias):
+ # wi = wi.view(*self.weight_shape)
+ # if bi is not None:
+ # bi = bi.view(*self.bias_shape)
+ # out.append(self.conv_fn(
+ # xi, wi, bi, stride=self.stride, padding=self.padding,
+ # dilation=self.dilation, groups=self.groups))
+ # out = torch.cat(out, 0)
+ return out
+
+
+def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
+ assert 'groups' not in kwargs # only use 'depthwise' bool arg
+ if isinstance(kernel_size, list):
+ assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently
+ # We're going to use only lists for defining the MixedConv2d kernel groups,
+ # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
+ m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+ else:
+ depthwise = kwargs.pop('depthwise', False)
+ groups = out_chs if depthwise else 1
+ if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
+ m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+ else:
+ m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+ return m
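+
+
+# Usage sketch (illustrative):
+#   conv = select_conv2d(32, 64, 3, stride=2, padding='same')                 # TF-style 'SAME' padding conv
+#   mixed = select_conv2d(64, 64, [3, 5, 7], padding='same', depthwise=True)  # MixedConv2d, one kernel size per group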
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/efficientnet_builder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/efficientnet_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..56df5ae5a8038ad8cc4965d2e50c7bfc48b2f292
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/efficientnet_builder.py
@@ -0,0 +1,683 @@
+""" EfficientNet / MobileNetV3 Blocks and Builder
+
+Copyright 2020 Ross Wightman
+"""
+import re
+from copy import deepcopy
+
+from .conv2d_layers import *
+from geffnet.activations import *
+
+__all__ = ['get_bn_args_tf', 'resolve_bn_args', 'resolve_se_args', 'resolve_act_layer', 'make_divisible',
+ 'round_channels', 'drop_connect', 'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv',
+ 'InvertedResidual', 'CondConvResidual', 'EdgeResidual', 'EfficientNetBuilder', 'decode_arch_def',
+ 'initialize_weight_default', 'initialize_weight_goog', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT'
+]
+
+# Defaults used for Google/Tensorflow training of mobile networks w/ RMSprop as per
+# papers and TF reference implementations. PT momentum equiv for TF decay is (1 - TF decay)
+# NOTE: momentum varies btw .99 and .9997 depending on source
+# .99 in official TF TPU impl
+# .9997 (/w .999 in search space) for paper
+#
+# PyTorch defaults are momentum = .1, eps = 1e-5
+#
+BN_MOMENTUM_TF_DEFAULT = 1 - 0.99
+BN_EPS_TF_DEFAULT = 1e-3
+_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT)
+
+
+def get_bn_args_tf():
+ return _BN_ARGS_TF.copy()
+
+
+def resolve_bn_args(kwargs):
+ bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {}
+ bn_momentum = kwargs.pop('bn_momentum', None)
+ if bn_momentum is not None:
+ bn_args['momentum'] = bn_momentum
+ bn_eps = kwargs.pop('bn_eps', None)
+ if bn_eps is not None:
+ bn_args['eps'] = bn_eps
+ return bn_args
+
+
+_SE_ARGS_DEFAULT = dict(
+ gate_fn=sigmoid,
+ act_layer=None, # None == use containing block's activation layer
+ reduce_mid=False,
+ divisor=1)
+
+
+def resolve_se_args(kwargs, in_chs, act_layer=None):
+ se_kwargs = kwargs.copy() if kwargs is not None else {}
+ # fill in args that aren't specified with the defaults
+ for k, v in _SE_ARGS_DEFAULT.items():
+ se_kwargs.setdefault(k, v)
+    # some models, like MobileNetV3, calculate SE reduction chs from the containing block's mid_ch instead of in_ch
+ if not se_kwargs.pop('reduce_mid'):
+ se_kwargs['reduced_base_chs'] = in_chs
+ # act_layer override, if it remains None, the containing block's act_layer will be used
+ if se_kwargs['act_layer'] is None:
+ assert act_layer is not None
+ se_kwargs['act_layer'] = act_layer
+ return se_kwargs
+
+
+def resolve_act_layer(kwargs, default='relu'):
+ act_layer = kwargs.pop('act_layer', default)
+ if isinstance(act_layer, str):
+ act_layer = get_act_layer(act_layer)
+ return act_layer
+
+
+def make_divisible(v: int, divisor: int = 8, min_value: int = None):
+ min_value = min_value or divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ if new_v < 0.9 * v: # ensure round down does not go down by more than 10%.
+ new_v += divisor
+ return new_v
+
+
+def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None):
+ """Round number of filters based on depth multiplier."""
+ if not multiplier:
+ return channels
+ channels *= multiplier
+ return make_divisible(channels, divisor, channel_min)
+
+
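+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): channel counts are scaled and then
+# snapped to a multiple of the divisor, never shrinking by more than 10%.
+def _demo_round_channels():
+    assert make_divisible(38.4, divisor=8) == 40
+    assert round_channels(32, multiplier=1.2, divisor=8) == 40
+    assert round_channels(32, multiplier=1.0) == 32
+
+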
+def drop_connect(inputs, training: bool = False, drop_connect_rate: float = 0.):
+ """Apply drop connect."""
+ if not training:
+ return inputs
+
+ keep_prob = 1 - drop_connect_rate
+ random_tensor = keep_prob + torch.rand(
+ (inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device)
+ random_tensor.floor_() # binarize
+ output = inputs.div(keep_prob) * random_tensor
+ return output
+
+
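+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): drop connect (stochastic depth) is a
+# no-op at inference, while in training it zeroes whole samples and rescales
+# the survivors by 1 / keep_prob.
+def _demo_drop_connect():
+    import torch
+    x = torch.ones(8, 4, 1, 1)
+    assert torch.equal(drop_connect(x, training=False, drop_connect_rate=0.2), x)
+    y = drop_connect(x, training=True, drop_connect_rate=0.2)
+    assert y.shape == x.shape
+    assert bool(((y == 0) | (y > 1)).all())
+
+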
+class SqueezeExcite(nn.Module):
+
+ def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1):
+ super(SqueezeExcite, self).__init__()
+ reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor)
+ self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
+ self.act1 = act_layer(inplace=True)
+ self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
+ self.gate_fn = gate_fn
+
+ def forward(self, x):
+ x_se = x.mean((2, 3), keepdim=True)
+ x_se = self.conv_reduce(x_se)
+ x_se = self.act1(x_se)
+ x_se = self.conv_expand(x_se)
+ x = x * self.gate_fn(x_se)
+ return x
+
+
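+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): the SE block squeezes to roughly
+# in_chs * se_ratio channels, then re-gates the input, so shapes are preserved.
+def _demo_squeeze_excite():
+    import torch
+    se = SqueezeExcite(32, se_ratio=0.25)
+    assert se.conv_reduce.out_channels == 8  # 32 * 0.25
+    x = torch.randn(2, 32, 7, 7)
+    assert se(x).shape == x.shape
+
+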
+class ConvBnAct(nn.Module):
+ def __init__(self, in_chs, out_chs, kernel_size,
+ stride=1, pad_type='', act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
+ super(ConvBnAct, self).__init__()
+ assert stride in [1, 2]
+ norm_kwargs = norm_kwargs or {}
+ self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, padding=pad_type)
+ self.bn1 = norm_layer(out_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ def forward(self, x):
+ x = self.conv(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+ return x
+
+
+class DepthwiseSeparableConv(nn.Module):
+ """ DepthwiseSeparable block
+ Used for DS convs in MobileNet-V1 and in place of IR blocks with an expansion
+ factor of 1.0. This is an alternative to having an IR block with an optional first pw conv.
+ """
+ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+ pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None,
+ norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+ super(DepthwiseSeparableConv, self).__init__()
+ assert stride in [1, 2]
+ norm_kwargs = norm_kwargs or {}
+ self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
+ self.drop_connect_rate = drop_connect_rate
+
+ self.conv_dw = select_conv2d(
+ in_chs, in_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True)
+ self.bn1 = norm_layer(in_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ # Squeeze-and-excitation
+ if se_ratio is not None and se_ratio > 0.:
+ se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+ self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs)
+ else:
+ self.se = nn.Identity()
+
+ self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
+ self.bn2 = norm_layer(out_chs, **norm_kwargs)
+ self.act2 = act_layer(inplace=True) if pw_act else nn.Identity()
+
+ def forward(self, x):
+ residual = x
+
+ x = self.conv_dw(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ x = self.se(x)
+
+ x = self.conv_pw(x)
+ x = self.bn2(x)
+ x = self.act2(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+ return x
+
+
+class InvertedResidual(nn.Module):
+ """ Inverted residual block w/ optional SE"""
+
+ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+ exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
+ se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+ conv_kwargs=None, drop_connect_rate=0.):
+ super(InvertedResidual, self).__init__()
+ norm_kwargs = norm_kwargs or {}
+ conv_kwargs = conv_kwargs or {}
+ mid_chs: int = make_divisible(in_chs * exp_ratio)
+ self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
+ self.drop_connect_rate = drop_connect_rate
+
+ # Point-wise expansion
+ self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
+ self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ # Depth-wise convolution
+ self.conv_dw = select_conv2d(
+ mid_chs, mid_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True, **conv_kwargs)
+ self.bn2 = norm_layer(mid_chs, **norm_kwargs)
+ self.act2 = act_layer(inplace=True)
+
+ # Squeeze-and-excitation
+ if se_ratio is not None and se_ratio > 0.:
+ se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+ self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+ else:
+ self.se = nn.Identity() # for jit.script compat
+
+ # Point-wise linear projection
+ self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
+ self.bn3 = norm_layer(out_chs, **norm_kwargs)
+
+ def forward(self, x):
+ residual = x
+
+ # Point-wise expansion
+ x = self.conv_pw(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ # Depth-wise convolution
+ x = self.conv_dw(x)
+ x = self.bn2(x)
+ x = self.act2(x)
+
+ # Squeeze-and-excitation
+ x = self.se(x)
+
+ # Point-wise linear projection
+ x = self.conv_pwl(x)
+ x = self.bn3(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+ return x
+
+
+class CondConvResidual(InvertedResidual):
+ """ Inverted residual block w/ CondConv routing"""
+
+ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+ exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
+ se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+ num_experts=0, drop_connect_rate=0.):
+
+ self.num_experts = num_experts
+ conv_kwargs = dict(num_experts=self.num_experts)
+
+ super(CondConvResidual, self).__init__(
+ in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, pad_type=pad_type,
+ act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
+ pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs,
+ norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs,
+ drop_connect_rate=drop_connect_rate)
+
+ self.routing_fn = nn.Linear(in_chs, self.num_experts)
+
+ def forward(self, x):
+ residual = x
+
+ # CondConv routing
+ pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)
+ routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs))
+
+ # Point-wise expansion
+ x = self.conv_pw(x, routing_weights)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ # Depth-wise convolution
+ x = self.conv_dw(x, routing_weights)
+ x = self.bn2(x)
+ x = self.act2(x)
+
+ # Squeeze-and-excitation
+ x = self.se(x)
+
+ # Point-wise linear projection
+ x = self.conv_pwl(x, routing_weights)
+ x = self.bn3(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+ return x
+
+
+class EdgeResidual(nn.Module):
+ """ EdgeTPU Residual block with expansion convolution followed by pointwise-linear w/ stride"""
+
+ def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
+ se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+ super(EdgeResidual, self).__init__()
+ norm_kwargs = norm_kwargs or {}
+ mid_chs = make_divisible(fake_in_chs * exp_ratio) if fake_in_chs > 0 else make_divisible(in_chs * exp_ratio)
+ self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
+ self.drop_connect_rate = drop_connect_rate
+
+ # Expansion convolution
+ self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
+ self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ # Squeeze-and-excitation
+ if se_ratio is not None and se_ratio > 0.:
+ se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+ self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+ else:
+ self.se = nn.Identity()
+
+ # Point-wise linear projection
+ self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, stride=stride, padding=pad_type)
+ self.bn2 = nn.BatchNorm2d(out_chs, **norm_kwargs)
+
+ def forward(self, x):
+ residual = x
+
+ # Expansion convolution
+ x = self.conv_exp(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ # Squeeze-and-excitation
+ x = self.se(x)
+
+ # Point-wise linear projection
+ x = self.conv_pwl(x)
+ x = self.bn2(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+
+ return x
+
+
+class EfficientNetBuilder:
+ """ Build Trunk Blocks for Efficient/Mobile Networks
+
+ This ended up being somewhat of a cross between
+ https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py
+ and
+ https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py
+
+ """
+
+ def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
+ pad_type='', act_layer=None, se_kwargs=None,
+ norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+ self.channel_multiplier = channel_multiplier
+ self.channel_divisor = channel_divisor
+ self.channel_min = channel_min
+ self.pad_type = pad_type
+ self.act_layer = act_layer
+ self.se_kwargs = se_kwargs
+ self.norm_layer = norm_layer
+ self.norm_kwargs = norm_kwargs
+ self.drop_connect_rate = drop_connect_rate
+
+ # updated during build
+ self.in_chs = None
+ self.block_idx = 0
+ self.block_count = 0
+
+ def _round_channels(self, chs):
+ return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min)
+
+ def _make_block(self, ba):
+ bt = ba.pop('block_type')
+ ba['in_chs'] = self.in_chs
+ ba['out_chs'] = self._round_channels(ba['out_chs'])
+ if 'fake_in_chs' in ba and ba['fake_in_chs']:
+ # FIXME this is a hack to work around mismatch in origin impl input filters for EdgeTPU
+ ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
+ ba['norm_layer'] = self.norm_layer
+ ba['norm_kwargs'] = self.norm_kwargs
+ ba['pad_type'] = self.pad_type
+ # block act fn overrides the model default
+ ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
+ assert ba['act_layer'] is not None
+ if bt == 'ir':
+ ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+ ba['se_kwargs'] = self.se_kwargs
+ if ba.get('num_experts', 0) > 0:
+ block = CondConvResidual(**ba)
+ else:
+ block = InvertedResidual(**ba)
+ elif bt == 'ds' or bt == 'dsa':
+ ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+ ba['se_kwargs'] = self.se_kwargs
+ block = DepthwiseSeparableConv(**ba)
+ elif bt == 'er':
+ ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+ ba['se_kwargs'] = self.se_kwargs
+ block = EdgeResidual(**ba)
+ elif bt == 'cn':
+ block = ConvBnAct(**ba)
+ else:
+ assert False, 'Unknown block type (%s) while building model.' % bt
+ self.in_chs = ba['out_chs'] # update in_chs for arg of next block
+ return block
+
+ def _make_stack(self, stack_args):
+ blocks = []
+ # each stack (stage) contains a list of block arguments
+ for i, ba in enumerate(stack_args):
+ if i >= 1:
+ # only the first block in any stack can have a stride > 1
+ ba['stride'] = 1
+ block = self._make_block(ba)
+ blocks.append(block)
+ self.block_idx += 1 # incr global idx (across all stacks)
+ return nn.Sequential(*blocks)
+
+ def __call__(self, in_chs, block_args):
+ """ Build the blocks
+ Args:
+ in_chs: Number of input-channels passed to first block
+ block_args: A list of lists, outer list defines stages, inner
+ list contains strings defining block configuration(s)
+ Return:
+ List of block stacks (each stack wrapped in nn.Sequential)
+ """
+ self.in_chs = in_chs
+ self.block_count = sum([len(x) for x in block_args])
+ self.block_idx = 0
+ blocks = []
+ # outer list of block_args defines the stacks ('stages' by some conventions)
+ for stack_idx, stack in enumerate(block_args):
+ assert isinstance(stack, list)
+ stack = self._make_stack(stack)
+ blocks.append(stack)
+ return blocks
+
+
+def _parse_ksize(ss):
+ if ss.isdigit():
+ return int(ss)
+ else:
+ return [int(k) for k in ss.split('.')]
+
+
+def _decode_block_str(block_str):
+ """ Decode block definition string
+
+ Gets a dict of block args and a repeat count from a string notation of arguments.
+ E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip
+
+ All args can exist in any order with the exception of the leading string which
+ is assumed to indicate the block type.
+
+ leading string - block type (
+ ir = InvertedResidual, ds = DepthwiseSep, dsa = DepthwiseSep with pw act, cn = ConvBnAct)
+ r - number of repeat blocks,
+ k - kernel size,
+ s - strides (1-9),
+ e - expansion ratio,
+ c - output channels,
+ se - squeeze/excitation ratio
+ n - activation fn ('re', 'r6', 'hs', or 'sw')
+ Args:
+ block_str: a string representation of block arguments.
+ Returns:
+ A tuple of the block args (dict) and the number of repeats for the block
+ Raises:
+ ValueError: if the string def is not properly specified (TODO)
+ """
+ assert isinstance(block_str, str)
+ ops = block_str.split('_')
+ block_type = ops[0] # take the block type off the front
+ ops = ops[1:]
+ options = {}
+ noskip = False
+ for op in ops:
+ # string options are checked on an individual basis; combine them if the list grows
+ if op == 'noskip':
+ noskip = True
+ elif op.startswith('n'):
+ # activation fn
+ key = op[0]
+ v = op[1:]
+ if v == 're':
+ value = get_act_layer('relu')
+ elif v == 'r6':
+ value = get_act_layer('relu6')
+ elif v == 'hs':
+ value = get_act_layer('hard_swish')
+ elif v == 'sw':
+ value = get_act_layer('swish')
+ else:
+ continue
+ options[key] = value
+ else:
+ # all numeric options
+ splits = re.split(r'(\d.*)', op)
+ if len(splits) >= 2:
+ key, value = splits[:2]
+ options[key] = value
+
+ # if act_layer is None, the model default (passed to model init) will be used
+ act_layer = options['n'] if 'n' in options else None
+ exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
+ pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
+ fake_in_chs = int(options['fc']) if 'fc' in options else 0 # FIXME hack to deal with in_chs issue in TPU def
+
+ num_repeat = int(options['r'])
+ # each type of block has different valid arguments, fill accordingly
+ if block_type == 'ir':
+ block_args = dict(
+ block_type=block_type,
+ dw_kernel_size=_parse_ksize(options['k']),
+ exp_kernel_size=exp_kernel_size,
+ pw_kernel_size=pw_kernel_size,
+ out_chs=int(options['c']),
+ exp_ratio=float(options['e']),
+ se_ratio=float(options['se']) if 'se' in options else None,
+ stride=int(options['s']),
+ act_layer=act_layer,
+ noskip=noskip,
+ )
+ if 'cc' in options:
+ block_args['num_experts'] = int(options['cc'])
+ elif block_type == 'ds' or block_type == 'dsa':
+ block_args = dict(
+ block_type=block_type,
+ dw_kernel_size=_parse_ksize(options['k']),
+ pw_kernel_size=pw_kernel_size,
+ out_chs=int(options['c']),
+ se_ratio=float(options['se']) if 'se' in options else None,
+ stride=int(options['s']),
+ act_layer=act_layer,
+ pw_act=block_type == 'dsa',
+ noskip=block_type == 'dsa' or noskip,
+ )
+ elif block_type == 'er':
+ block_args = dict(
+ block_type=block_type,
+ exp_kernel_size=_parse_ksize(options['k']),
+ pw_kernel_size=pw_kernel_size,
+ out_chs=int(options['c']),
+ exp_ratio=float(options['e']),
+ fake_in_chs=fake_in_chs,
+ se_ratio=float(options['se']) if 'se' in options else None,
+ stride=int(options['s']),
+ act_layer=act_layer,
+ noskip=noskip,
+ )
+ elif block_type == 'cn':
+ block_args = dict(
+ block_type=block_type,
+ kernel_size=int(options['k']),
+ out_chs=int(options['c']),
+ stride=int(options['s']),
+ act_layer=act_layer,
+ )
+ else:
+ assert False, 'Unknown block type (%s)' % block_type
+
+ return block_args, num_repeat
+
+
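+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): decoding one block string in the
+# notation documented above.
+def _demo_decode_block_str():
+    ba, rep = _decode_block_str('ir_r2_k3_s2_e6_c24_se0.25')
+    assert rep == 2
+    assert ba['block_type'] == 'ir' and ba['dw_kernel_size'] == 3
+    assert ba['stride'] == 2 and ba['exp_ratio'] == 6.0
+    assert ba['out_chs'] == 24 and ba['se_ratio'] == 0.25
+    assert ba['act_layer'] is None and not ba['noskip']
+
+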
+def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'):
+ """ Per-stage depth scaling
+ Scales the block repeats in each stage. This depth scaling impl maintains
+ compatibility with the EfficientNet scaling method, while allowing sensible
+ scaling for other models that may have multiple block arg definitions in each stage.
+ """
+
+ # We scale the total repeat count for each stage; there may be multiple
+ # block arg defs per stage, so we need to sum them.
+ num_repeat = sum(repeats)
+ if depth_trunc == 'round':
+ # Truncating to int by rounding allows stages with few repeats to remain
+ # proportionally smaller for longer. This is a good choice when stage definitions
+ # include single repeat stages that we'd prefer to keep that way as long as possible
+ num_repeat_scaled = max(1, round(num_repeat * depth_multiplier))
+ else:
+ # The default for EfficientNet truncates repeats to int via 'ceil'.
+ # Any multiplier > 1.0 will result in an increased depth for every stage.
+ num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier))
+
+ # Proportionally distribute repeat count scaling to each block definition in the stage.
+ # Allocation is done in reverse as it results in the first block being less likely to be scaled.
+ # The first block makes less sense to repeat in most of the arch definitions.
+ repeats_scaled = []
+ for r in repeats[::-1]:
+ rs = max(1, round((r / num_repeat * num_repeat_scaled)))
+ repeats_scaled.append(rs)
+ num_repeat -= r
+ num_repeat_scaled -= rs
+ repeats_scaled = repeats_scaled[::-1]
+
+ # Apply the calculated scaling to each block arg in the stage
+ sa_scaled = []
+ for ba, rep in zip(stack_args, repeats_scaled):
+ sa_scaled.extend([deepcopy(ba) for _ in range(rep)])
+ return sa_scaled
+
+
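+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): with repeats [2, 3] and a 1.2 depth
+# multiplier the stage total goes from 5 to ceil(6.0) = 6, distributed in
+# reverse so the later block def grows first.
+def _demo_scale_stage_depth():
+    sa = _scale_stage_depth([{'a': 0}, {'b': 1}], [2, 3], depth_multiplier=1.2)
+    assert sa == [{'a': 0}] * 2 + [{'b': 1}] * 4
+
+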
+def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1, fix_first_last=False):
+ arch_args = []
+ for stack_idx, block_strings in enumerate(arch_def):
+ assert isinstance(block_strings, list)
+ stack_args = []
+ repeats = []
+ for block_str in block_strings:
+ assert isinstance(block_str, str)
+ ba, rep = _decode_block_str(block_str)
+ if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
+ ba['num_experts'] *= experts_multiplier
+ stack_args.append(ba)
+ repeats.append(rep)
+ if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1):
+ arch_args.append(_scale_stage_depth(stack_args, repeats, 1.0, depth_trunc))
+ else:
+ arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc))
+ return arch_args
+
+
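+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): decode a two-stage toy arch definition
+# and feed it through the builder, which returns one nn.Sequential per stage.
+def _demo_decode_and_build():
+    arch_def = [['ds_r1_k3_s1_c16'], ['ir_r2_k3_s2_e6_c24']]
+    stages = decode_arch_def(arch_def, depth_multiplier=1.0)
+    assert [len(s) for s in stages] == [1, 2]
+    builder = EfficientNetBuilder(act_layer=nn.ReLU)
+    blocks = builder(16, stages)
+    assert len(blocks) == 2 and builder.in_chs == 24
+
+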
+def initialize_weight_goog(m, n='', fix_group_fanout=True):
+ # weight init as per Tensorflow Official impl
+ # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
+ if isinstance(m, CondConv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ if fix_group_fanout:
+ fan_out //= m.groups
+ init_weight_fn = get_condconv_initializer(
+ lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape)
+ init_weight_fn(m.weight)
+ if m.bias is not None:
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ if fix_group_fanout:
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+ elif isinstance(m, nn.BatchNorm2d):
+ m.weight.data.fill_(1.0)
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Linear):
+ fan_out = m.weight.size(0) # fan-out
+ fan_in = 0
+ if 'routing_fn' in n:
+ fan_in = m.weight.size(1)
+ init_range = 1.0 / math.sqrt(fan_in + fan_out)
+ m.weight.data.uniform_(-init_range, init_range)
+ m.bias.data.zero_()
+
+
+def initialize_weight_default(m, n=''):
+ if isinstance(m, CondConv2d):
+ init_fn = get_condconv_initializer(partial(
+ nn.init.kaiming_normal_, mode='fan_out', nonlinearity='relu'), m.num_experts, m.weight_shape)
+ init_fn(m.weight)
+ elif isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+ elif isinstance(m, nn.BatchNorm2d):
+ m.weight.data.fill_(1.0)
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Linear):
+ nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear')
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/gen_efficientnet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/gen_efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0c7c810e3064fc06d61fdd710f30058c216467f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/gen_efficientnet.py
@@ -0,0 +1,1450 @@
+""" Generic Efficient Networks
+
+A generic MobileNet class with building blocks to support a variety of models:
+
+* EfficientNet (B0-B8, L2 + Tensorflow pretrained AutoAug/RandAug/AdvProp/NoisyStudent ports)
+ - EfficientNet: Rethinking Model Scaling for CNNs - https://arxiv.org/abs/1905.11946
+ - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971
+ - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665
+ - Self-training with Noisy Student improves ImageNet classification - https://arxiv.org/abs/1911.04252
+
+* EfficientNet-Lite
+
+* MixNet (Small, Medium, and Large)
+ - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595
+
+* MNasNet B1, A1 (SE), Small
+ - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626
+
+* FBNet-C
+ - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443
+
+* Single-Path NAS Pixel1
+ - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877
+
+* And likely more...
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .config import layer_config_kwargs, is_scriptable
+from .conv2d_layers import select_conv2d
+from .helpers import load_pretrained
+from .efficientnet_builder import *
+
+__all__ = ['GenEfficientNet', 'mnasnet_050', 'mnasnet_075', 'mnasnet_100', 'mnasnet_b1', 'mnasnet_140',
+ 'semnasnet_050', 'semnasnet_075', 'semnasnet_100', 'mnasnet_a1', 'semnasnet_140', 'mnasnet_small',
+ 'mobilenetv2_100', 'mobilenetv2_140', 'mobilenetv2_110d', 'mobilenetv2_120d',
+ 'fbnetc_100', 'spnasnet_100', 'efficientnet_b0', 'efficientnet_b1', 'efficientnet_b2', 'efficientnet_b3',
+ 'efficientnet_b4', 'efficientnet_b5', 'efficientnet_b6', 'efficientnet_b7', 'efficientnet_b8',
+ 'efficientnet_l2', 'efficientnet_es', 'efficientnet_em', 'efficientnet_el',
+ 'efficientnet_cc_b0_4e', 'efficientnet_cc_b0_8e', 'efficientnet_cc_b1_8e',
+ 'efficientnet_lite0', 'efficientnet_lite1', 'efficientnet_lite2', 'efficientnet_lite3', 'efficientnet_lite4',
+ 'tf_efficientnet_b0', 'tf_efficientnet_b1', 'tf_efficientnet_b2', 'tf_efficientnet_b3',
+ 'tf_efficientnet_b4', 'tf_efficientnet_b5', 'tf_efficientnet_b6', 'tf_efficientnet_b7', 'tf_efficientnet_b8',
+ 'tf_efficientnet_b0_ap', 'tf_efficientnet_b1_ap', 'tf_efficientnet_b2_ap', 'tf_efficientnet_b3_ap',
+ 'tf_efficientnet_b4_ap', 'tf_efficientnet_b5_ap', 'tf_efficientnet_b6_ap', 'tf_efficientnet_b7_ap',
+ 'tf_efficientnet_b8_ap', 'tf_efficientnet_b0_ns', 'tf_efficientnet_b1_ns', 'tf_efficientnet_b2_ns',
+ 'tf_efficientnet_b3_ns', 'tf_efficientnet_b4_ns', 'tf_efficientnet_b5_ns', 'tf_efficientnet_b6_ns',
+ 'tf_efficientnet_b7_ns', 'tf_efficientnet_l2_ns', 'tf_efficientnet_l2_ns_475',
+ 'tf_efficientnet_es', 'tf_efficientnet_em', 'tf_efficientnet_el',
+ 'tf_efficientnet_cc_b0_4e', 'tf_efficientnet_cc_b0_8e', 'tf_efficientnet_cc_b1_8e',
+ 'tf_efficientnet_lite0', 'tf_efficientnet_lite1', 'tf_efficientnet_lite2', 'tf_efficientnet_lite3',
+ 'tf_efficientnet_lite4',
+ 'mixnet_s', 'mixnet_m', 'mixnet_l', 'mixnet_xl', 'tf_mixnet_s', 'tf_mixnet_m', 'tf_mixnet_l']
+
+
+model_urls = {
+ 'mnasnet_050': None,
+ 'mnasnet_075': None,
+ 'mnasnet_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_b1-74cb7081.pth',
+ 'mnasnet_140': None,
+ 'mnasnet_small': None,
+
+ 'semnasnet_050': None,
+ 'semnasnet_075': None,
+ 'semnasnet_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth',
+ 'semnasnet_140': None,
+
+ 'mobilenetv2_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth',
+ 'mobilenetv2_110d':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_110d_ra-77090ade.pth',
+ 'mobilenetv2_120d':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_120d_ra-5987e2ed.pth',
+ 'mobilenetv2_140':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_140_ra-21a4e913.pth',
+
+ 'fbnetc_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth',
+ 'spnasnet_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth',
+
+ 'efficientnet_b0':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth',
+ 'efficientnet_b1':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth',
+ 'efficientnet_b2':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth',
+ 'efficientnet_b3':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth',
+ 'efficientnet_b4': None,
+ 'efficientnet_b5': None,
+ 'efficientnet_b6': None,
+ 'efficientnet_b7': None,
+ 'efficientnet_b8': None,
+ 'efficientnet_l2': None,
+
+ 'efficientnet_es':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth',
+ 'efficientnet_em': None,
+ 'efficientnet_el': None,
+
+ 'efficientnet_cc_b0_4e': None,
+ 'efficientnet_cc_b0_8e': None,
+ 'efficientnet_cc_b1_8e': None,
+
+ 'efficientnet_lite0': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_lite0_ra-37913777.pth',
+ 'efficientnet_lite1': None,
+ 'efficientnet_lite2': None,
+ 'efficientnet_lite3': None,
+ 'efficientnet_lite4': None,
+
+ 'tf_efficientnet_b0':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth',
+ 'tf_efficientnet_b1':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_aa-ea7a6ee0.pth',
+ 'tf_efficientnet_b2':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_aa-60c94f97.pth',
+ 'tf_efficientnet_b3':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth',
+ 'tf_efficientnet_b4':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth',
+ 'tf_efficientnet_b5':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth',
+ 'tf_efficientnet_b6':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth',
+ 'tf_efficientnet_b7':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth',
+ 'tf_efficientnet_b8':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ra-572d5dd9.pth',
+
+ 'tf_efficientnet_b0_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth',
+ 'tf_efficientnet_b1_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth',
+ 'tf_efficientnet_b2_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth',
+ 'tf_efficientnet_b3_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth',
+ 'tf_efficientnet_b4_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth',
+ 'tf_efficientnet_b5_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth',
+ 'tf_efficientnet_b6_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth',
+ 'tf_efficientnet_b7_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth',
+ 'tf_efficientnet_b8_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth',
+
+ 'tf_efficientnet_b0_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth',
+ 'tf_efficientnet_b1_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth',
+ 'tf_efficientnet_b2_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ns-00306e48.pth',
+ 'tf_efficientnet_b3_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth',
+ 'tf_efficientnet_b4_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth',
+ 'tf_efficientnet_b5_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ns-6f26d0cf.pth',
+ 'tf_efficientnet_b6_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ns-51548356.pth',
+ 'tf_efficientnet_b7_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth',
+ 'tf_efficientnet_l2_ns_475':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns_475-bebbd00a.pth',
+ 'tf_efficientnet_l2_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth',
+
+ 'tf_efficientnet_es':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth',
+ 'tf_efficientnet_em':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_em-e78cfe58.pth',
+ 'tf_efficientnet_el':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth',
+
+ 'tf_efficientnet_cc_b0_4e':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth',
+ 'tf_efficientnet_cc_b0_8e':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth',
+ 'tf_efficientnet_cc_b1_8e':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth',
+
+ 'tf_efficientnet_lite0':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite0-0aa007d2.pth',
+ 'tf_efficientnet_lite1':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite1-bde8b488.pth',
+ 'tf_efficientnet_lite2':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite2-dcccb7df.pth',
+ 'tf_efficientnet_lite3':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth',
+ 'tf_efficientnet_lite4':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite4-741542c3.pth',
+
+ 'mixnet_s': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth',
+ 'mixnet_m': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_m-4647fc68.pth',
+ 'mixnet_l': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth',
+ 'mixnet_xl': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth',
+
+ 'tf_mixnet_s':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth',
+ 'tf_mixnet_m':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_m-0f4d8805.pth',
+ 'tf_mixnet_l':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth',
+}
+
+
+class GenEfficientNet(nn.Module):
+ """ Generic EfficientNets
+
+ An implementation of mobile optimized networks that covers:
+ * EfficientNet (B0-B8, L2, CondConv, EdgeTPU)
+ * MixNet (Small, Medium, and Large, XL)
+ * MNASNet A1, B1, and small
+ * FBNet C
+ * Single-Path NAS Pixel1
+ """
+
+ def __init__(self, block_args, num_classes=1000, in_chans=3, num_features=1280, stem_size=32, fix_stem=False,
+ channel_multiplier=1.0, channel_divisor=8, channel_min=None,
+ pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+ se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+ weight_init='goog'):
+ super(GenEfficientNet, self).__init__()
+ self.drop_rate = drop_rate
+ norm_kwargs = norm_kwargs or {}  # guard the None default, as the block modules do
+
+ if not fix_stem:
+ stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
+ self.conv_stem = select_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+ self.bn1 = norm_layer(stem_size, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+ in_chs = stem_size
+
+ builder = EfficientNetBuilder(
+ channel_multiplier, channel_divisor, channel_min,
+ pad_type, act_layer, se_kwargs, norm_layer, norm_kwargs, drop_connect_rate)
+ self.blocks = nn.Sequential(*builder(in_chs, block_args))
+ in_chs = builder.in_chs
+
+ self.conv_head = select_conv2d(in_chs, num_features, 1, padding=pad_type)
+ self.bn2 = norm_layer(num_features, **norm_kwargs)
+ self.act2 = act_layer(inplace=True)
+ self.global_pool = nn.AdaptiveAvgPool2d(1)
+ self.classifier = nn.Linear(num_features, num_classes)
+
+ for n, m in self.named_modules():
+ if weight_init == 'goog':
+ initialize_weight_goog(m, n)
+ else:
+ initialize_weight_default(m, n)
+
+ def features(self, x):
+ x = self.conv_stem(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+ x = self.blocks(x)
+ x = self.conv_head(x)
+ x = self.bn2(x)
+ x = self.act2(x)
+ return x
+
+ def as_sequential(self):
+ layers = [self.conv_stem, self.bn1, self.act1]
+ layers.extend(self.blocks)
+ layers.extend([
+ self.conv_head, self.bn2, self.act2,
+ self.global_pool, nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ x = self.features(x)
+ x = self.global_pool(x)
+ x = x.flatten(1)
+ if self.drop_rate > 0.:
+ x = F.dropout(x, p=self.drop_rate, training=self.training)
+ return self.classifier(x)
+
+
+def _create_model(model_kwargs, variant, pretrained=False):
+ as_sequential = model_kwargs.pop('as_sequential', False)
+ model = GenEfficientNet(**model_kwargs)
+ if pretrained:
+ load_pretrained(model, model_urls[variant])
+ if as_sequential:
+ model = model.as_sequential()
+ return model
+
+
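+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): the `as_sequential` kwarg accepted by
+# the factories below is consumed by _create_model above and flattens the
+# classification model into a plain nn.Sequential.
+def _demo_as_sequential():
+    import torch
+    model = efficientnet_lite0(pretrained=False, as_sequential=True)
+    assert isinstance(model, nn.Sequential)
+    with torch.no_grad():
+        out = model(torch.randn(1, 3, 224, 224))
+    assert out.shape == (1, 1000)
+
+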
+def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a mnasnet-a1 model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+ Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16_noskip'],
+ # stage 1, 112x112 in
+ ['ir_r2_k3_s2_e6_c24'],
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40_se0.25'],
+ # stage 3, 28x28 in
+ ['ir_r4_k3_s2_e6_c80'],
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112_se0.25'],
+ # stage 5, 14x14in
+ ['ir_r3_k5_s2_e6_c160_se0.25'],
+ # stage 6, 7x7 in
+ ['ir_r1_k3_s1_e6_c320'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a mnasnet-b1 model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+ Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_c16_noskip'],
+ # stage 1, 112x112 in
+ ['ir_r3_k3_s2_e3_c24'],
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40'],
+ # stage 3, 28x28 in
+ ['ir_r3_k5_s2_e6_c80'],
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c96'],
+ # stage 5, 14x14in
+ ['ir_r4_k5_s2_e6_c192'],
+ # stage 6, 7x7 in
+ ['ir_r1_k3_s1_e6_c320_noskip']
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a mnasnet-b1 model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+ Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_c8'],
+ ['ir_r1_k3_s2_e3_c16'],
+ ['ir_r2_k3_s2_e6_c16'],
+ ['ir_r4_k5_s2_e6_c32_se0.25'],
+ ['ir_r3_k3_s1_e6_c32_se0.25'],
+ ['ir_r3_k5_s2_e6_c88_se0.25'],
+ ['ir_r1_k3_s1_e6_c144']
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=8,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mobilenet_v2(
+ variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs):
+ """ Generate MobileNet-V2 network
+ Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py
+ Paper: https://arxiv.org/abs/1801.04381
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_c16'],
+ ['ir_r2_k3_s2_e6_c24'],
+ ['ir_r3_k3_s2_e6_c32'],
+ ['ir_r4_k3_s2_e6_c64'],
+ ['ir_r3_k3_s1_e6_c96'],
+ ['ir_r3_k3_s2_e6_c160'],
+ ['ir_r1_k3_s1_e6_c320'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head),
+ num_features=1280 if fix_stem_head else round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ fix_stem=fix_stem_head,
+ channel_multiplier=channel_multiplier,
+ norm_kwargs=resolve_bn_args(kwargs),
+ act_layer=nn.ReLU6,
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """ FBNet-C
+
+ Paper: https://arxiv.org/abs/1812.03443
+ Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py
+
+ NOTE: the ref impl above does not directly correspond to the 'C' variant here; this definition was
+ derived from the paper, and the ref impl was used to confirm some building block details
+ """
+ arch_def = [
+ ['ir_r1_k3_s1_e1_c16'],
+ ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'],
+ ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'],
+ ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'],
+ ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'],
+ ['ir_r4_k5_s2_e6_c184'],
+ ['ir_r1_k3_s1_e6_c352'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=16,
+ num_features=1984, # paper suggests this, but is not 100% clear
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates the Single-Path NAS model from search targeted for Pixel1 phone.
+
+ Paper: https://arxiv.org/abs/1904.02877
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_c16_noskip'],
+ # stage 1, 112x112 in
+ ['ir_r3_k3_s2_e3_c24'],
+ # stage 2, 56x56 in
+ ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'],
+ # stage 3, 28x28 in
+ ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'],
+ # stage 4, 14x14in
+ ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'],
+ # stage 5, 14x14in
+ ['ir_r4_k5_s2_e6_c192'],
+ # stage 6, 7x7 in
+ ['ir_r1_k3_s1_e6_c320_noskip']
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates an EfficientNet model.
+
+ Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
+ Paper: https://arxiv.org/abs/1905.11946
+
+ EfficientNet params
+ name: (channel_multiplier, depth_multiplier, resolution, dropout_rate)
+ 'efficientnet-b0': (1.0, 1.0, 224, 0.2),
+ 'efficientnet-b1': (1.0, 1.1, 240, 0.2),
+ 'efficientnet-b2': (1.1, 1.2, 260, 0.3),
+ 'efficientnet-b3': (1.2, 1.4, 300, 0.3),
+ 'efficientnet-b4': (1.4, 1.8, 380, 0.4),
+ 'efficientnet-b5': (1.6, 2.2, 456, 0.4),
+ 'efficientnet-b6': (1.8, 2.6, 528, 0.5),
+ 'efficientnet-b7': (2.0, 3.1, 600, 0.5),
+ 'efficientnet-b8': (2.2, 3.6, 672, 0.5),
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer
+ depth_multiplier: multiplier to number of repeats per stage
+
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_e1_c16_se0.25'],
+ ['ir_r2_k3_s2_e6_c24_se0.25'],
+ ['ir_r2_k5_s2_e6_c40_se0.25'],
+ ['ir_r3_k3_s2_e6_c80_se0.25'],
+ ['ir_r3_k5_s1_e6_c112_se0.25'],
+ ['ir_r4_k5_s2_e6_c192_se0.25'],
+ ['ir_r1_k3_s1_e6_c320_se0.25'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier),
+ num_features=round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'swish'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
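+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): how the compound-scaling table above
+# maps onto this builder. For the B2 settings (1.1, 1.2) the 1280-wide head is
+# widened to 1408 and a stage with r repeats gets ceil(r * 1.2) repeats.
+def _demo_efficientnet_scaling():
+    assert round_channels(1280, multiplier=1.1, divisor=8) == 1408
+    stages = decode_arch_def([['ir_r2_k3_s2_e6_c24_se0.25']], depth_multiplier=1.2)
+    assert len(stages[0]) == 3  # ceil(2 * 1.2)
+
+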
+def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ arch_def = [
+ # NOTE `fc` is present to override a mismatch between stem channels and in chs not
+ # present in other models
+ ['er_r1_k3_s1_e4_c24_fc24_noskip'],
+ ['er_r2_k3_s2_e8_c32'],
+ ['er_r4_k3_s2_e8_c48'],
+ ['ir_r5_k5_s2_e8_c96'],
+ ['ir_r4_k5_s1_e8_c144'],
+ ['ir_r2_k5_s2_e8_c192'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier),
+ num_features=round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_efficientnet_condconv(
+ variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs):
+ """Creates an efficientnet-condconv model."""
+ arch_def = [
+ ['ds_r1_k3_s1_e1_c16_se0.25'],
+ ['ir_r2_k3_s2_e6_c24_se0.25'],
+ ['ir_r2_k5_s2_e6_c40_se0.25'],
+ ['ir_r3_k3_s2_e6_c80_se0.25'],
+ ['ir_r3_k5_s1_e6_c112_se0.25_cc4'],
+ ['ir_r4_k5_s2_e6_c192_se0.25_cc4'],
+ ['ir_r1_k3_s1_e6_c320_se0.25_cc4'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier),
+ num_features=round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'swish'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates an EfficientNet-Lite model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite
+ Paper: https://arxiv.org/abs/1905.11946
+
+ EfficientNet params
+ name: (channel_multiplier, depth_multiplier, resolution, dropout_rate)
+ 'efficientnet-lite0': (1.0, 1.0, 224, 0.2),
+ 'efficientnet-lite1': (1.0, 1.1, 240, 0.2),
+ 'efficientnet-lite2': (1.1, 1.2, 260, 0.3),
+ 'efficientnet-lite3': (1.2, 1.4, 280, 0.3),
+ 'efficientnet-lite4': (1.4, 1.8, 300, 0.3),
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer
+ depth_multiplier: multiplier to number of repeats per stage
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_e1_c16'],
+ ['ir_r2_k3_s2_e6_c24'],
+ ['ir_r2_k5_s2_e6_c40'],
+ ['ir_r3_k3_s2_e6_c80'],
+ ['ir_r3_k5_s1_e6_c112'],
+ ['ir_r4_k5_s2_e6_c192'],
+ ['ir_r1_k3_s1_e6_c320'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True),
+ num_features=1280,
+ stem_size=32,
+ fix_stem=True,
+ channel_multiplier=channel_multiplier,
+ act_layer=nn.ReLU6,
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MixNet Small model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
+ Paper: https://arxiv.org/abs/1907.09595
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'], # relu
+ # stage 2, 56x56 in
+ ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish
+ # stage 3, 28x28 in
+ ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'], # swish
+ # stage 4, 14x14in
+ ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish
+ # stage 5, 14x14in
+ ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish
+ # 7x7
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ num_features=1536,
+ stem_size=16,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MixNet Medium-Large model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
+ Paper: https://arxiv.org/abs/1907.09595
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c24'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'], # relu
+ # stage 2, 56x56 in
+ ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish
+ # stage 3, 28x28 in
+ ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'], # swish
+ # stage 4, 14x14in
+ ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish
+ # stage 5, 14x14in
+ ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish
+ # 7x7
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'),
+ num_features=1536,
+ stem_size=24,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def mnasnet_050(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 0.5. """
+ model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_075(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 0.75. """
+ model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_100(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 1.0. """
+ model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_b1(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 1.0. """
+ return mnasnet_100(pretrained, **kwargs)
+
+
+def mnasnet_140(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 1.4 """
+ model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def semnasnet_050(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 0.5 """
+ model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs)
+ return model
+
+
+def semnasnet_075(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 0.75. """
+ model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def semnasnet_100(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """
+ model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_a1(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """
+ return semnasnet_100(pretrained, **kwargs)
+
+
+def semnasnet_140(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 1.4. """
+ model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_small(pretrained=False, **kwargs):
+ """ MNASNet Small, depth multiplier of 1.0. """
+ model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_100(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.0 channel multiplier """
+ model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_140(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.4 channel multiplier """
+ model = _gen_mobilenet_v2('mobilenetv2_140', 1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_110d(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.1 channel, 1.2 depth multipliers"""
+ model = _gen_mobilenet_v2(
+ 'mobilenetv2_110d', 1.1, depth_multiplier=1.2, fix_stem_head=True, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_120d(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.2 channel, 1.4 depth multipliers """
+ model = _gen_mobilenet_v2(
+ 'mobilenetv2_120d', 1.2, depth_multiplier=1.4, fix_stem_head=True, pretrained=pretrained, **kwargs)
+ return model
+
+
+def fbnetc_100(pretrained=False, **kwargs):
+ """ FBNet-C """
+ if pretrained:
+ # pretrained model trained with non-default BN epsilon
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def spnasnet_100(pretrained=False, **kwargs):
+ """ Single-Path NAS Pixel1"""
+ model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b0(pretrained=False, **kwargs):
+ """ EfficientNet-B0 """
+ # NOTE for train set drop_rate=0.2, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
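+# Illustrative sketch (editor's addition, not part of the upstream geffnet
+# source; never called by the library): constructing the randomly-initialised
+# B0 variant and running a single 224x224 image through it.
+def _demo_efficientnet_b0_forward():
+    import torch
+    model = efficientnet_b0(pretrained=False)
+    model.eval()
+    with torch.no_grad():
+        logits = model(torch.randn(1, 3, 224, 224))
+    assert logits.shape == (1, 1000)
+
+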
+def efficientnet_b1(pretrained=False, **kwargs):
+ """ EfficientNet-B1 """
+ # NOTE for train set drop_rate=0.2, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b2(pretrained=False, **kwargs):
+ """ EfficientNet-B2 """
+ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b3(pretrained=False, **kwargs):
+ """ EfficientNet-B3 """
+ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b4(pretrained=False, **kwargs):
+ """ EfficientNet-B4 """
+ # NOTE for train set drop_rate=0.4, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b5(pretrained=False, **kwargs):
+ """ EfficientNet-B5 """
+ # NOTE for train set drop_rate=0.4, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b6(pretrained=False, **kwargs):
+ """ EfficientNet-B6 """
+ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b7(pretrained=False, **kwargs):
+ """ EfficientNet-B7 """
+ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b8(pretrained=False, **kwargs):
+ """ EfficientNet-B8 """
+ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_l2(pretrained=False, **kwargs):
+ """ EfficientNet-L2. """
+ # NOTE for train, drop_rate should be 0.5
+ model = _gen_efficientnet(
+ 'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_es(pretrained=False, **kwargs):
+ """ EfficientNet-Edge Small. """
+ model = _gen_efficientnet_edge(
+ 'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_em(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Medium. """
+ model = _gen_efficientnet_edge(
+ 'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_el(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Large. """
+ model = _gen_efficientnet_edge(
+ 'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B0 w/ 8 Experts """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_efficientnet_condconv(
+ 'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B0 w/ 8 Experts """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_efficientnet_condconv(
+ 'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B1 w/ 8 Experts """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_efficientnet_condconv(
+ 'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite0(pretrained=False, **kwargs):
+ """ EfficientNet-Lite0 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite1(pretrained=False, **kwargs):
+ """ EfficientNet-Lite1 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite2(pretrained=False, **kwargs):
+ """ EfficientNet-Lite2 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite3(pretrained=False, **kwargs):
+ """ EfficientNet-Lite3 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite4(pretrained=False, **kwargs):
+ """ EfficientNet-Lite4 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b0(pretrained=False, **kwargs):
+ """ EfficientNet-B0 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b1(pretrained=False, **kwargs):
+ """ EfficientNet-B1 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b2(pretrained=False, **kwargs):
+ """ EfficientNet-B2 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b3(pretrained=False, **kwargs):
+ """ EfficientNet-B3 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b4(pretrained=False, **kwargs):
+ """ EfficientNet-B4 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b5(pretrained=False, **kwargs):
+ """ EfficientNet-B5 RandAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b6(pretrained=False, **kwargs):
+ """ EfficientNet-B6 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b7(pretrained=False, **kwargs):
+ """ EfficientNet-B7 RandAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b8(pretrained=False, **kwargs):
+ """ EfficientNet-B8 RandAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b0_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B0 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b1_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B1 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b2_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B2 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b3_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B3 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b4_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B4 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b5_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B5 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b6_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B6 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b7_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B7 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b8_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B8 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b0_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B0 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b0_ns', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b1_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B1 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b1_ns', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b2_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B2 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b2_ns', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b3_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B3 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b3_ns', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b4_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B4 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b4_ns', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b5_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B5 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b5_ns', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b6_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B6 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b6_ns', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b7_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B7 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b7_ns', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_l2_ns_475(pretrained=False, **kwargs):
+ """ EfficientNet-L2 NoisyStudent @ 475x475. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_l2_ns_475', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_l2_ns(pretrained=False, **kwargs):
+ """ EfficientNet-L2 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_l2_ns', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_es(pretrained=False, **kwargs):
+ """ EfficientNet-Edge Small. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_edge(
+ 'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_em(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Medium. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_edge(
+ 'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_el(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Large. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_edge(
+ 'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_condconv(
+ 'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_condconv(
+ 'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_condconv(
+ 'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite0(pretrained=False, **kwargs):
+ """ EfficientNet-Lite0. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite1(pretrained=False, **kwargs):
+ """ EfficientNet-Lite1. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite2(pretrained=False, **kwargs):
+ """ EfficientNet-Lite2. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite3(pretrained=False, **kwargs):
+ """ EfficientNet-Lite3. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite4(pretrained=False, **kwargs):
+ """ EfficientNet-Lite4. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_s(pretrained=False, **kwargs):
+ """Creates a MixNet Small model.
+ """
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mixnet_s(
+ 'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_m(pretrained=False, **kwargs):
+ """Creates a MixNet Medium model.
+ """
+ # NOTE for train set drop_rate=0.25
+ model = _gen_mixnet_m(
+ 'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_l(pretrained=False, **kwargs):
+ """Creates a MixNet Large model.
+ """
+ # NOTE for train set drop_rate=0.25
+ model = _gen_mixnet_m(
+ 'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_xl(pretrained=False, **kwargs):
+ """Creates a MixNet Extra-Large model.
+ Not a paper spec, experimental def by RW w/ depth scaling.
+ """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_mixnet_m(
+ 'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_xxl(pretrained=False, **kwargs):
+ """Creates a MixNet Double Extra Large model.
+ Not a paper spec, experimental def by RW w/ depth scaling.
+ """
+ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2
+ model = _gen_mixnet_m(
+ 'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mixnet_s(pretrained=False, **kwargs):
+ """Creates a MixNet Small model. Tensorflow compatible variant
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mixnet_s(
+ 'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mixnet_m(pretrained=False, **kwargs):
+ """Creates a MixNet Medium model. Tensorflow compatible variant
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mixnet_m(
+ 'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mixnet_l(pretrained=False, **kwargs):
+ """Creates a MixNet Large model. Tensorflow compatible variant
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mixnet_m(
+ 'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
+ return model
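+
+
+if __name__ == '__main__':
+    # Editor's sketch, not upstream code: a minimal smoke test of the constructors above.
+    # The "tf_" variants only differ in setting bn_eps / pad_type='same' before delegating to
+    # _gen_efficientnet, so weights ported from the TensorFlow reference line up numerically.
+    # Run as `python -m geffnet.gen_efficientnet` so the relative imports at the top resolve.
+    import torch
+    model = efficientnet_b0(pretrained=False)  # randomly initialized, no weight download
+    model.eval()
+    with torch.no_grad():
+        logits = model(torch.randn(1, 3, 224, 224))
+    print(logits.shape)  # expected: torch.Size([1, 1000])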
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/helpers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9981660517a8728bc1f3f931da74ef1f1edae750
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/helpers.py
@@ -0,0 +1,71 @@
+""" Checkpoint loading / state_dict helpers
+Copyright 2020 Ross Wightman
+"""
+import torch
+import os
+from collections import OrderedDict
+try:
+ from torch.hub import load_state_dict_from_url
+except ImportError:
+ from torch.utils.model_zoo import load_url as load_state_dict_from_url
+
+
+def load_checkpoint(model, checkpoint_path):
+ if checkpoint_path and os.path.isfile(checkpoint_path):
+ print("=> Loading checkpoint '{}'".format(checkpoint_path))
+ checkpoint = torch.load(checkpoint_path)
+ if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
+ new_state_dict = OrderedDict()
+ for k, v in checkpoint['state_dict'].items():
+ if k.startswith('module'):
+ name = k[7:] # remove `module.`
+ else:
+ name = k
+ new_state_dict[name] = v
+ model.load_state_dict(new_state_dict)
+ else:
+ model.load_state_dict(checkpoint)
+ print("=> Loaded checkpoint '{}'".format(checkpoint_path))
+ else:
+ print("=> Error: No checkpoint found at '{}'".format(checkpoint_path))
+ raise FileNotFoundError()
+
+
+def load_pretrained(model, url, filter_fn=None, strict=True):
+ if not url:
+ print("=> Warning: Pretrained model URL is empty, using random initialization.")
+ return
+
+ state_dict = load_state_dict_from_url(url, progress=False, map_location='cpu')
+
+ input_conv = 'conv_stem'
+ classifier = 'classifier'
+ in_chans = getattr(model, input_conv).weight.shape[1]
+ num_classes = getattr(model, classifier).weight.shape[0]
+
+ input_conv_weight = input_conv + '.weight'
+ pretrained_in_chans = state_dict[input_conv_weight].shape[1]
+ if in_chans != pretrained_in_chans:
+ if in_chans == 1:
+ print('=> Converting pretrained input conv {} from {} to 1 channel'.format(
+ input_conv_weight, pretrained_in_chans))
+ conv1_weight = state_dict[input_conv_weight]
+ state_dict[input_conv_weight] = conv1_weight.sum(dim=1, keepdim=True)
+ else:
+ print('=> Discarding pretrained input conv {} since input channel count != {}'.format(
+ input_conv_weight, pretrained_in_chans))
+ del state_dict[input_conv_weight]
+ strict = False
+
+ classifier_weight = classifier + '.weight'
+ pretrained_num_classes = state_dict[classifier_weight].shape[0]
+ if num_classes != pretrained_num_classes:
+ print('=> Discarding pretrained classifier since num_classes != {}'.format(pretrained_num_classes))
+ del state_dict[classifier_weight]
+ del state_dict[classifier + '.bias']
+ strict = False
+
+ if filter_fn is not None:
+ state_dict = filter_fn(state_dict)
+
+ model.load_state_dict(state_dict, strict=strict)
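+
+
+if __name__ == '__main__':
+    # Editor's sketch, not upstream code: round-trips a checkpoint through load_checkpoint()
+    # to show the DataParallel-style 'module.' prefix stripping above. Assumes the geffnet
+    # package is importable; the model name and temp path are only examples.
+    import tempfile
+    import geffnet
+    model = geffnet.create_model('mobilenetv3_large_100', pretrained=False)
+    wrapped = {'state_dict': {'module.' + k: v for k, v in model.state_dict().items()}}
+    ckpt_path = os.path.join(tempfile.mkdtemp(), 'demo_checkpoint.pth')
+    torch.save(wrapped, ckpt_path)
+    load_checkpoint(model, ckpt_path)  # prints "=> Loaded checkpoint '...'"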
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/mobilenetv3.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/mobilenetv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..4027822356ee96d9f27d7fc9156f13b5374a3a88
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/mobilenetv3.py
@@ -0,0 +1,364 @@
+""" MobileNet-V3
+
+A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl.
+
+Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .activations import get_act_fn, get_act_layer, HardSwish
+from .config import layer_config_kwargs
+from .conv2d_layers import select_conv2d
+from .helpers import load_pretrained
+from .efficientnet_builder import *
+
+__all__ = ['mobilenetv3_rw', 'mobilenetv3_large_075', 'mobilenetv3_large_100', 'mobilenetv3_large_minimal_100',
+ 'mobilenetv3_small_075', 'mobilenetv3_small_100', 'mobilenetv3_small_minimal_100',
+ 'tf_mobilenetv3_large_075', 'tf_mobilenetv3_large_100', 'tf_mobilenetv3_large_minimal_100',
+ 'tf_mobilenetv3_small_075', 'tf_mobilenetv3_small_100', 'tf_mobilenetv3_small_minimal_100']
+
+model_urls = {
+ 'mobilenetv3_rw':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth',
+ 'mobilenetv3_large_075': None,
+ 'mobilenetv3_large_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_large_100_ra-f55367f5.pth',
+ 'mobilenetv3_large_minimal_100': None,
+ 'mobilenetv3_small_075': None,
+ 'mobilenetv3_small_100': None,
+ 'mobilenetv3_small_minimal_100': None,
+ 'tf_mobilenetv3_large_075':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth',
+ 'tf_mobilenetv3_large_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth',
+ 'tf_mobilenetv3_large_minimal_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth',
+ 'tf_mobilenetv3_small_075':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth',
+ 'tf_mobilenetv3_small_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth',
+ 'tf_mobilenetv3_small_minimal_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth',
+}
+
+
+class MobileNetV3(nn.Module):
+ """ MobileNet-V3
+
+    This model utilizes the MobileNet-V3 specific 'efficient head', where global pooling is done before the
+    head convolution, without a final batch-norm layer before the classifier.
+
+ Paper: https://arxiv.org/abs/1905.02244
+ """
+
+ def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True,
+ channel_multiplier=1.0, pad_type='', act_layer=HardSwish, drop_rate=0., drop_connect_rate=0.,
+ se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'):
+        super(MobileNetV3, self).__init__()
+        norm_kwargs = norm_kwargs or {}  # guard: the default None would make **norm_kwargs below raise a TypeError
+        self.drop_rate = drop_rate
+
+ stem_size = round_channels(stem_size, channel_multiplier)
+ self.conv_stem = select_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+ self.bn1 = nn.BatchNorm2d(stem_size, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+ in_chs = stem_size
+
+ builder = EfficientNetBuilder(
+ channel_multiplier, pad_type=pad_type, act_layer=act_layer, se_kwargs=se_kwargs,
+ norm_layer=norm_layer, norm_kwargs=norm_kwargs, drop_connect_rate=drop_connect_rate)
+ self.blocks = nn.Sequential(*builder(in_chs, block_args))
+ in_chs = builder.in_chs
+
+ self.global_pool = nn.AdaptiveAvgPool2d(1)
+ self.conv_head = select_conv2d(in_chs, num_features, 1, padding=pad_type, bias=head_bias)
+ self.act2 = act_layer(inplace=True)
+ self.classifier = nn.Linear(num_features, num_classes)
+
+ for m in self.modules():
+ if weight_init == 'goog':
+ initialize_weight_goog(m)
+ else:
+ initialize_weight_default(m)
+
+ def as_sequential(self):
+ layers = [self.conv_stem, self.bn1, self.act1]
+ layers.extend(self.blocks)
+ layers.extend([
+ self.global_pool, self.conv_head, self.act2,
+ nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
+ return nn.Sequential(*layers)
+
+ def features(self, x):
+ x = self.conv_stem(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+ x = self.blocks(x)
+ x = self.global_pool(x)
+ x = self.conv_head(x)
+ x = self.act2(x)
+ return x
+
+ def forward(self, x):
+ x = self.features(x)
+ x = x.flatten(1)
+ if self.drop_rate > 0.:
+ x = F.dropout(x, p=self.drop_rate, training=self.training)
+ return self.classifier(x)
+
+
+def _create_model(model_kwargs, variant, pretrained=False):
+ as_sequential = model_kwargs.pop('as_sequential', False)
+ model = MobileNetV3(**model_kwargs)
+ if pretrained and model_urls[variant]:
+ load_pretrained(model, model_urls[variant])
+ if as_sequential:
+ model = model.as_sequential()
+ return model
+
+
+def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MobileNet-V3 model (RW variant).
+
+ Paper: https://arxiv.org/abs/1905.02244
+
+ This was my first attempt at reproducing the MobileNet-V3 from paper alone. It came close to the
+ eventual Tensorflow reference impl but has a few differences:
+ 1. This model has no bias on the head convolution
+ 2. This model forces no residual (noskip) on the first DWS block, this is different than MnasNet
+ 3. This model always uses ReLU for the SE activation layer, other models in the family inherit their act layer
+ from their parent block
+ 4. This model does not enforce divisible by 8 limitation on the SE reduction channel count
+
+    Overall the changes are fairly minor and result in a very small parameter count difference and no
+    meaningful top-1/5 accuracy difference.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16_nre_noskip'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu
+ # stage 3, 28x28 in
+ ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish
+ # stage 5, 14x14in
+ ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c960'], # hard-swish
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ head_bias=False, # one of my mistakes
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'hard_swish'),
+ se_kwargs=dict(gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MobileNet-V3 large/small/minimal models.
+
+ Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v3.py
+ Paper: https://arxiv.org/abs/1905.02244
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ if 'small' in variant:
+ num_features = 1024
+ if 'minimal' in variant:
+ act_layer = 'relu'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s2_e1_c16'],
+ # stage 1, 56x56 in
+ ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'],
+ # stage 2, 28x28 in
+ ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'],
+ # stage 3, 14x14 in
+ ['ir_r2_k3_s1_e3_c48'],
+ # stage 4, 14x14in
+ ['ir_r3_k3_s2_e6_c96'],
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c576'],
+ ]
+ else:
+ act_layer = 'hard_swish'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s2_e1_c16_se0.25_nre'], # relu
+ # stage 1, 56x56 in
+ ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'], # relu
+ # stage 2, 28x28 in
+ ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'], # hard-swish
+ # stage 3, 14x14 in
+ ['ir_r2_k5_s1_e3_c48_se0.25'], # hard-swish
+ # stage 4, 14x14in
+ ['ir_r3_k5_s2_e6_c96_se0.25'], # hard-swish
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c576'], # hard-swish
+ ]
+ else:
+ num_features = 1280
+ if 'minimal' in variant:
+ act_layer = 'relu'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16'],
+ # stage 1, 112x112 in
+ ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'],
+ # stage 2, 56x56 in
+ ['ir_r3_k3_s2_e3_c40'],
+ # stage 3, 28x28 in
+ ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112'],
+ # stage 5, 14x14in
+ ['ir_r3_k3_s2_e6_c160'],
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c960'],
+ ]
+ else:
+ act_layer = 'hard_swish'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16_nre'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu
+ # stage 3, 28x28 in
+ ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish
+ # stage 5, 14x14in
+ ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c960'], # hard-swish
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ num_features=num_features,
+ stem_size=16,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, act_layer),
+ se_kwargs=dict(
+ act_layer=get_act_layer('relu'), gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True, divisor=8),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def mobilenetv3_rw(pretrained=False, **kwargs):
+ """ MobileNet-V3 RW
+ Attn: See note in gen function for this variant.
+ """
+ # NOTE for train set drop_rate=0.2
+ if pretrained:
+ # pretrained model trained with non-default BN epsilon
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_large_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 0.75"""
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_large_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 1.0 """
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_large_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large (Minimalistic) 1.0 """
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mobilenet_v3('mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_small_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 0.75 """
+ model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_small_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 1.0 """
+ model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_small_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small (Minimalistic) 1.0 """
+ model = _gen_mobilenet_v3('mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_large_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 0.75. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_large_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 1.0. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large Minimalistic 1.0. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_small_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 0.75. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_small_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 1.0. Tensorflow compat variant."""
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small Minimalistic 1.0. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
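+
+
+if __name__ == '__main__':
+    # Editor's sketch, not upstream code: illustrates the 'efficient head' described in the
+    # MobileNetV3 class docstring -- features() already applies global pooling and conv_head,
+    # so it returns (N, num_features, 1, 1), while forward() returns class logits.
+    # Run as `python -m geffnet.mobilenetv3` so the relative imports resolve.
+    import torch
+    model = mobilenetv3_large_100(pretrained=False)
+    model.eval()
+    with torch.no_grad():
+        x = torch.randn(1, 3, 224, 224)
+        print(model.features(x).shape)  # expected: torch.Size([1, 1280, 1, 1])
+        print(model(x).shape)           # expected: torch.Size([1, 1000])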
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/model_factory.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/model_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd2208dc97e9d705fda7cc497b21d630ca798ecb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/model_factory.py
@@ -0,0 +1,27 @@
+from .config import set_layer_config
+from .helpers import load_checkpoint
+
+from .gen_efficientnet import *
+from .mobilenetv3 import *
+
+
+def create_model(
+ model_name='mnasnet_100',
+ pretrained=None,
+ num_classes=1000,
+ in_chans=3,
+ checkpoint_path='',
+ **kwargs):
+
+ model_kwargs = dict(num_classes=num_classes, in_chans=in_chans, pretrained=pretrained, **kwargs)
+
+ if model_name in globals():
+ create_fn = globals()[model_name]
+ model = create_fn(**model_kwargs)
+ else:
+ raise RuntimeError('Unknown model (%s)' % model_name)
+
+ if checkpoint_path and not pretrained:
+ load_checkpoint(model, checkpoint_path)
+
+ return model
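+
+
+if __name__ == '__main__':
+    # Editor's sketch, not upstream code: typical use of the factory above. Any entrypoint
+    # exported by gen_efficientnet.py or mobilenetv3.py is a valid model_name; the name and
+    # num_classes below are only examples. Run as `python -m geffnet.model_factory` so the
+    # relative imports resolve (consumers normally call geffnet.create_model instead).
+    import torch
+    model = create_model('tf_efficientnet_b0', pretrained=False, num_classes=10)
+    model.eval()
+    with torch.no_grad():
+        print(model(torch.randn(1, 3, 224, 224)).shape)  # expected: torch.Size([1, 10])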
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/version.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..20fc277bae821bed80a29af8538a2a9273b20c41
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/geffnet/version.py
@@ -0,0 +1 @@
+__version__ = '1.0.2'
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/hubconf.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/hubconf.py
new file mode 100644
index 0000000000000000000000000000000000000000..89feac0f9699a8a35cf69b2fcd3628d9d110239b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/hubconf.py
@@ -0,0 +1,84 @@
+dependencies = ['torch', 'math']
+
+from geffnet import efficientnet_b0
+from geffnet import efficientnet_b1
+from geffnet import efficientnet_b2
+from geffnet import efficientnet_b3
+
+from geffnet import efficientnet_es
+
+from geffnet import efficientnet_lite0
+
+from geffnet import mixnet_s
+from geffnet import mixnet_m
+from geffnet import mixnet_l
+from geffnet import mixnet_xl
+
+from geffnet import mobilenetv2_100
+from geffnet import mobilenetv2_110d
+from geffnet import mobilenetv2_120d
+from geffnet import mobilenetv2_140
+
+from geffnet import mobilenetv3_large_100
+from geffnet import mobilenetv3_rw
+from geffnet import mnasnet_a1
+from geffnet import mnasnet_b1
+from geffnet import fbnetc_100
+from geffnet import spnasnet_100
+
+from geffnet import tf_efficientnet_b0
+from geffnet import tf_efficientnet_b1
+from geffnet import tf_efficientnet_b2
+from geffnet import tf_efficientnet_b3
+from geffnet import tf_efficientnet_b4
+from geffnet import tf_efficientnet_b5
+from geffnet import tf_efficientnet_b6
+from geffnet import tf_efficientnet_b7
+from geffnet import tf_efficientnet_b8
+
+from geffnet import tf_efficientnet_b0_ap
+from geffnet import tf_efficientnet_b1_ap
+from geffnet import tf_efficientnet_b2_ap
+from geffnet import tf_efficientnet_b3_ap
+from geffnet import tf_efficientnet_b4_ap
+from geffnet import tf_efficientnet_b5_ap
+from geffnet import tf_efficientnet_b6_ap
+from geffnet import tf_efficientnet_b7_ap
+from geffnet import tf_efficientnet_b8_ap
+
+from geffnet import tf_efficientnet_b0_ns
+from geffnet import tf_efficientnet_b1_ns
+from geffnet import tf_efficientnet_b2_ns
+from geffnet import tf_efficientnet_b3_ns
+from geffnet import tf_efficientnet_b4_ns
+from geffnet import tf_efficientnet_b5_ns
+from geffnet import tf_efficientnet_b6_ns
+from geffnet import tf_efficientnet_b7_ns
+from geffnet import tf_efficientnet_l2_ns_475
+from geffnet import tf_efficientnet_l2_ns
+
+from geffnet import tf_efficientnet_es
+from geffnet import tf_efficientnet_em
+from geffnet import tf_efficientnet_el
+
+from geffnet import tf_efficientnet_cc_b0_4e
+from geffnet import tf_efficientnet_cc_b0_8e
+from geffnet import tf_efficientnet_cc_b1_8e
+
+from geffnet import tf_efficientnet_lite0
+from geffnet import tf_efficientnet_lite1
+from geffnet import tf_efficientnet_lite2
+from geffnet import tf_efficientnet_lite3
+from geffnet import tf_efficientnet_lite4
+
+from geffnet import tf_mixnet_s
+from geffnet import tf_mixnet_m
+from geffnet import tf_mixnet_l
+
+from geffnet import tf_mobilenetv3_large_075
+from geffnet import tf_mobilenetv3_large_100
+from geffnet import tf_mobilenetv3_large_minimal_100
+from geffnet import tf_mobilenetv3_small_075
+from geffnet import tf_mobilenetv3_small_100
+from geffnet import tf_mobilenetv3_small_minimal_100
+
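+
+if __name__ == '__main__':
+    # Editor's sketch, not upstream code: the imports above are the entrypoints torch.hub
+    # exposes. A consumer-side call looks like the following (requires network access to
+    # fetch the upstream repo; pretrained=True would also download weights).
+    import torch
+    model = torch.hub.load('rwightman/gen-efficientnet-pytorch', 'efficientnet_b0', pretrained=False)
+    model.eval()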
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_export.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..af3ed8993bfa41190b5066a044efbe53b45f4c04
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_export.py
@@ -0,0 +1,120 @@
+""" ONNX export script
+
+Export PyTorch models as ONNX graphs.
+
+This export script originally started as an adaptation of code snippets found at
+https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html
+
+The default parameters work with PyTorch 1.6 and ONNX 1.7 and produce an optimal ONNX graph
+for hosting in the ONNX runtime (see onnx_validate.py). To export an ONNX model compatible
+with caffe2 (see caffe2_benchmark.py and caffe2_validate.py), the --keep-init and --aten-fallback
+flags are currently required.
+
+Older versions of PyTorch/ONNX (tested PyTorch 1.4, ONNX 1.5) do not need extra flags for
+caffe2 compatibility, but they produce a model that isn't as fast running on ONNX runtime.
+
+Most new releases of PyTorch and ONNX cause some sort of breakage in the export / usage of ONNX models.
+Please do your research and search the ONNX and PyTorch issue trackers before asking me. Thanks.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import torch
+import numpy as np
+
+import onnx
+import geffnet
+
+parser = argparse.ArgumentParser(description='PyTorch to ONNX model export')
+parser.add_argument('output', metavar='ONNX_FILE',
+ help='output model filename')
+parser.add_argument('--model', '-m', metavar='MODEL', default='mobilenetv3_large_100',
+ help='model architecture (default: mobilenetv3_large_100)')
+parser.add_argument('--opset', type=int, default=10,
+ help='ONNX opset to use (default: 10)')
+parser.add_argument('--keep-init', action='store_true', default=False,
+ help='Keep initializers as input. Needed for Caffe2 compatible export in newer PyTorch/ONNX.')
+parser.add_argument('--aten-fallback', action='store_true', default=False,
+ help='Fallback to ATEN ops. Helps fix AdaptiveAvgPool issue with Caffe2 in newer PyTorch/ONNX.')
+parser.add_argument('--dynamic-size', action='store_true', default=False,
+                    help='Export model with dynamic width/height. Not recommended for "tf" models with SAME padding.')
+parser.add_argument('-b', '--batch-size', default=1, type=int,
+ metavar='N', help='mini-batch size (default: 1)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--num-classes', type=int, default=1000,
+ help='Number classes in dataset')
+parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
+ help='path to checkpoint (default: none)')
+
+
+def main():
+ args = parser.parse_args()
+
+ args.pretrained = True
+ if args.checkpoint:
+ args.pretrained = False
+
+ print("==> Creating PyTorch {} model".format(args.model))
+ # NOTE exportable=True flag disables autofn/jit scripted activations and uses Conv2dSameExport layers
+ # for models using SAME padding
+ model = geffnet.create_model(
+ args.model,
+ num_classes=args.num_classes,
+ in_chans=3,
+ pretrained=args.pretrained,
+ checkpoint_path=args.checkpoint,
+ exportable=True)
+
+ model.eval()
+
+ example_input = torch.randn((args.batch_size, 3, args.img_size or 224, args.img_size or 224), requires_grad=True)
+
+ # Run model once before export trace, sets padding for models with Conv2dSameExport. This means
+ # that the padding for models with Conv2dSameExport (most models with tf_ prefix) is fixed for
+ # the input img_size specified in this script.
+ # Opset >= 11 should allow for dynamic padding, however I cannot get it to work due to
+ # issues in the tracing of the dynamic padding or errors attempting to export the model after jit
+ # scripting it (an approach that should work). Perhaps in a future PyTorch or ONNX versions...
+ model(example_input)
+
+ print("==> Exporting model to ONNX format at '{}'".format(args.output))
+ input_names = ["input0"]
+ output_names = ["output0"]
+ dynamic_axes = {'input0': {0: 'batch'}, 'output0': {0: 'batch'}}
+ if args.dynamic_size:
+ dynamic_axes['input0'][2] = 'height'
+ dynamic_axes['input0'][3] = 'width'
+ if args.aten_fallback:
+ export_type = torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK
+ else:
+ export_type = torch.onnx.OperatorExportTypes.ONNX
+
+ torch_out = torch.onnx._export(
+ model, example_input, args.output, export_params=True, verbose=True, input_names=input_names,
+ output_names=output_names, keep_initializers_as_inputs=args.keep_init, dynamic_axes=dynamic_axes,
+ opset_version=args.opset, operator_export_type=export_type)
+
+ print("==> Loading and checking exported model from '{}'".format(args.output))
+ onnx_model = onnx.load(args.output)
+ onnx.checker.check_model(onnx_model) # assuming throw on error
+ print("==> Passed")
+
+ if args.keep_init and args.aten_fallback:
+ import caffe2.python.onnx.backend as onnx_caffe2
+ # Caffe2 loading only works properly in newer PyTorch/ONNX combos when
+ # keep_initializers_as_inputs and aten_fallback are set to True.
+        print("==> Loading model into Caffe2 backend and comparing forward pass.")
+        caffe2_backend = onnx_caffe2.prepare(onnx_model)
+        B = {onnx_model.graph.input[0].name: example_input.data.numpy()}
+ c2_out = caffe2_backend.run(B)[0]
+ np.testing.assert_almost_equal(torch_out.data.numpy(), c2_out, decimal=5)
+ print("==> Passed")
+
+
+if __name__ == '__main__':
+ main()
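+
+
+# Editor's note, not upstream documentation: a typical invocation, assuming torch, onnx and
+# this package are installed (the output file name is only an example):
+#
+#   python onnx_export.py --model tf_efficientnet_b0 --img-size 224 --opset 10 ./tf_efficientnet_b0.onnx
+#
+# For "tf_" models exported this way, SAME padding is fixed at trace time for the given
+# --img-size, so --dynamic-size is best avoided with those variants (see the note in main()).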
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_optimize.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_optimize.py
new file mode 100644
index 0000000000000000000000000000000000000000..85abc534efd08d5fb51881954ef43b8480561824
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_optimize.py
@@ -0,0 +1,84 @@
+""" ONNX optimization script
+
+Run ONNX models through the optimizer to prune unneeded nodes, fuse batchnorm layers into conv, etc.
+
+NOTE: This isn't working consistently in recent PyTorch/ONNX combos (ie PyTorch 1.6 and ONNX 1.7),
+it seems time to switch to using the onnxruntime online optimizer (can also be saved for offline).
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import warnings
+
+import onnx
+from onnx import optimizer
+
+
+parser = argparse.ArgumentParser(description="Optimize ONNX model")
+
+parser.add_argument("model", help="The ONNX model")
+parser.add_argument("--output", required=True, help="The optimized model output filename")
+
+
+def traverse_graph(graph, prefix=''):
+ content = []
+ indent = prefix + ' '
+ graphs = []
+ num_nodes = 0
+ for node in graph.node:
+ pn, gs = onnx.helper.printable_node(node, indent, subgraphs=True)
+ assert isinstance(gs, list)
+ content.append(pn)
+ graphs.extend(gs)
+ num_nodes += 1
+ for g in graphs:
+ g_count, g_str = traverse_graph(g)
+ content.append('\n' + g_str)
+ num_nodes += g_count
+ return num_nodes, '\n'.join(content)
+
+
+def main():
+ args = parser.parse_args()
+ onnx_model = onnx.load(args.model)
+ num_original_nodes, original_graph_str = traverse_graph(onnx_model.graph)
+
+ # Optimizer passes to perform
+ passes = [
+ #'eliminate_deadend',
+ 'eliminate_identity',
+ 'eliminate_nop_dropout',
+ 'eliminate_nop_pad',
+ 'eliminate_nop_transpose',
+ 'eliminate_unused_initializer',
+ 'extract_constant_to_initializer',
+ 'fuse_add_bias_into_conv',
+ 'fuse_bn_into_conv',
+ 'fuse_consecutive_concats',
+ 'fuse_consecutive_reduce_unsqueeze',
+ 'fuse_consecutive_squeezes',
+ 'fuse_consecutive_transposes',
+ #'fuse_matmul_add_bias_into_gemm',
+ 'fuse_pad_into_conv',
+ #'fuse_transpose_into_gemm',
+ #'lift_lexical_references',
+ ]
+
+ # Apply the optimization on the original serialized model
+ # WARNING I've had issues with optimizer in recent versions of PyTorch / ONNX causing
+ # 'duplicate definition of name' errors, see: https://github.com/onnx/onnx/issues/2401
+ # It may be better to rely on onnxruntime optimizations, see onnx_validate.py script.
+    warnings.warn("I've had issues with optimizer in recent versions of PyTorch / ONNX. "
+                  "Try onnxruntime optimization if this doesn't work.")
+ optimized_model = optimizer.optimize(onnx_model, passes)
+
+    num_optimized_nodes, optimized_graph_str = traverse_graph(optimized_model.graph)
+    print('==> The model after optimization:\n{}\n'.format(optimized_graph_str))
+ print('==> The optimized model has {} nodes, the original had {}.'.format(num_optimized_nodes, num_original_nodes))
+
+ # Save the ONNX model
+ onnx.save(optimized_model, args.output)
+
+
+if __name__ == "__main__":
+ main()
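+
+
+# Editor's note, not upstream documentation: example invocation (file names are examples):
+#
+#   python onnx_optimize.py ./model.onnx --output ./model_opt.onnx
+#
+# Note that onnx.optimizer was later split out of the onnx package into the separate
+# 'onnxoptimizer' project, so this script needs an older onnx release (or a small import
+# change); the onnxruntime path mentioned in onnx_validate.py is the safer option today.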
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_to_caffe.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_to_caffe.py
new file mode 100644
index 0000000000000000000000000000000000000000..72fe0b0d7624ef871be586024429d058107a6f1d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_to_caffe.py
@@ -0,0 +1,27 @@
+import argparse
+
+import onnx
+from caffe2.python.onnx.backend import Caffe2Backend
+
+
+parser = argparse.ArgumentParser(description="Convert ONNX to Caffe2")
+
+parser.add_argument("model", help="The ONNX model")
+parser.add_argument("--c2-prefix", required=True,
+ help="The output file prefix for the caffe2 model init and predict file. ")
+
+
+def main():
+ args = parser.parse_args()
+ onnx_model = onnx.load(args.model)
+ caffe2_init, caffe2_predict = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
+ caffe2_init_str = caffe2_init.SerializeToString()
+ with open(args.c2_prefix + '.init.pb', "wb") as f:
+ f.write(caffe2_init_str)
+ caffe2_predict_str = caffe2_predict.SerializeToString()
+ with open(args.c2_prefix + '.predict.pb', "wb") as f:
+ f.write(caffe2_predict_str)
+
+
+if __name__ == "__main__":
+ main()
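+
+
+# Editor's note, not upstream documentation: example invocation (names are examples):
+#
+#   python onnx_to_caffe.py ./model.onnx --c2-prefix model
+#
+# This writes model.init.pb and model.predict.pb for the Caffe2 backend bundled with older
+# PyTorch releases; Caffe2 has since been deprecated, so treat this export path as legacy.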
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_validate.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..da3736cea66b29d20e00a114d5d82d899a7dbe6f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/onnx_validate.py
@@ -0,0 +1,112 @@
+""" ONNX-runtime validation script
+
+This script was created to verify accuracy and performance of exported ONNX
+models running with the onnxruntime. It utilizes the PyTorch dataloader/processing
+pipeline for a fair comparison against the originals.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import numpy as np
+import onnxruntime
+from data import create_loader, resolve_data_config, Dataset
+from utils import AverageMeter
+import time
+
+parser = argparse.ArgumentParser(description='ONNX Runtime ImageNet Validation')
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--onnx-input', default='', type=str, metavar='PATH',
+ help='path to onnx model/weights file')
+parser.add_argument('--onnx-output-opt', default='', type=str, metavar='PATH',
+ help='path to output optimized onnx graph')
+parser.add_argument('--profile', action='store_true', default=False,
+ help='Enable profiler output.')
+parser.add_argument('-j', '--workers', default=2, type=int, metavar='N',
+ help='number of data loading workers (default: 2)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+ metavar='N', help='mini-batch size (default: 256)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT',
+ help='Override default crop pct of 0.875')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true',
+                    help='use tensorflow mnasnet preprocessing')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+ metavar='N', help='print frequency (default: 10)')
+
+
+def main():
+ args = parser.parse_args()
+ args.gpu_id = 0
+
+ # Set graph optimization level
+ sess_options = onnxruntime.SessionOptions()
+ sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+ if args.profile:
+ sess_options.enable_profiling = True
+ if args.onnx_output_opt:
+ sess_options.optimized_model_filepath = args.onnx_output_opt
+
+ session = onnxruntime.InferenceSession(args.onnx_input, sess_options)
+
+ data_config = resolve_data_config(None, args)
+ loader = create_loader(
+ Dataset(args.data, load_bytes=args.tf_preprocessing),
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ use_prefetcher=False,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ crop_pct=data_config['crop_pct'],
+ tensorflow_preprocessing=args.tf_preprocessing)
+
+ input_name = session.get_inputs()[0].name
+
+ batch_time = AverageMeter()
+ top1 = AverageMeter()
+ top5 = AverageMeter()
+ end = time.time()
+ for i, (input, target) in enumerate(loader):
+ # run the net and return prediction
+ output = session.run([], {input_name: input.data.numpy()})
+ output = output[0]
+
+ # measure accuracy and record loss
+ prec1, prec5 = accuracy_np(output, target.numpy())
+ top1.update(prec1.item(), input.size(0))
+ top5.update(prec5.item(), input.size(0))
+
+ # measure elapsed time
+ batch_time.update(time.time() - end)
+ end = time.time()
+
+ if i % args.print_freq == 0:
+ print('Test: [{0}/{1}]\t'
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s, {ms_avg:.3f} ms/sample) \t'
+ 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+ 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+ i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg,
+                    ms_avg=1000 * batch_time.avg / input.size(0), top1=top1, top5=top5))
+
+ print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format(
+ top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg))
+
+
+def accuracy_np(output, target):
+ max_indices = np.argsort(output, axis=1)[:, ::-1]
+ top5 = 100 * np.equal(max_indices[:, :5], target[:, np.newaxis]).sum(axis=1).mean()
+ top1 = 100 * np.equal(max_indices[:, 0], target).mean()
+ return top1, top5
+
+
+if __name__ == '__main__':
+ main()
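The accuracy_np helper above is the only scoring logic this ONNX Runtime validation script relies on; a minimal, self-contained sketch of it on synthetic data (the random logits and labels are made up purely for illustration):

```python
import numpy as np

def accuracy_np(output, target):
    # Same top-1/top-5 computation as in the validation script above.
    max_indices = np.argsort(output, axis=1)[:, ::-1]
    top5 = 100 * np.equal(max_indices[:, :5], target[:, np.newaxis]).sum(axis=1).mean()
    top1 = 100 * np.equal(max_indices[:, 0], target).mean()
    return top1, top5

rng = np.random.default_rng(0)
logits = rng.standard_normal((8, 1000))   # fake model outputs: 8 samples, 1000 classes
labels = rng.integers(0, 1000, size=8)    # fake ground-truth labels
print(accuracy_np(logits, labels))        # random predictions score near 0 on both metrics
```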
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/requirements.txt b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a59ac4eded037190ed20c2cb66c6b8aa802b3c65
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/requirements.txt
@@ -0,0 +1,2 @@
+torch>=1.2.0
+torchvision>=0.4.0
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/setup.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d7c6276e4073b6de7f3ec43ffa01e614e14bd97
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/setup.py
@@ -0,0 +1,47 @@
+""" Setup
+"""
+from setuptools import setup, find_packages
+from codecs import open
+from os import path
+
+here = path.abspath(path.dirname(__file__))
+
+# Get the long description from the README file
+with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+ long_description = f.read()
+
+exec(open('geffnet/version.py').read())
+setup(
+ name='geffnet',
+ version=__version__,
+ description='(Generic) EfficientNets for PyTorch',
+ long_description=long_description,
+ long_description_content_type='text/markdown',
+ url='https://github.com/rwightman/gen-efficientnet-pytorch',
+ author='Ross Wightman',
+ author_email='hello@rwightman.com',
+ classifiers=[
+ # How mature is this project? Common values are
+ # 3 - Alpha
+ # 4 - Beta
+ # 5 - Production/Stable
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: Apache Software License',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Topic :: Scientific/Engineering',
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
+ 'Topic :: Software Development',
+ 'Topic :: Software Development :: Libraries',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
+ ],
+
+ # Note that this is a string of words separated by whitespace, not a list.
+ keywords='pytorch pretrained models efficientnet mixnet mobilenetv3 mnasnet',
+ packages=find_packages(exclude=['data']),
+ install_requires=['torch >= 1.4', 'torchvision'],
+ python_requires='>=3.6',
+)
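After installing this package, its models are built through geffnet.create_model, the same entry point validate.py below relies on; a minimal sketch (efficientnet_b0 is assumed to be one of the registered architectures, and pretrained=False avoids any weight download):

```python
import torch
import geffnet

# Build a registered architecture; the kwargs mirror the create_model call in validate.py.
model = geffnet.create_model('efficientnet_b0', num_classes=1000, in_chans=3, pretrained=False)
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))  # dummy batch at the usual 224x224 input size
print(out.shape)  # torch.Size([1, 1000])
```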
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eba7616037b08488c795563d0aa37e73a67a878
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/utils.py
@@ -0,0 +1,52 @@
+import os
+
+
+class AverageMeter:
+ """Computes and stores the average and current value"""
+ def __init__(self):
+ self.reset()
+
+ def reset(self):
+ self.val = 0
+ self.avg = 0
+ self.sum = 0
+ self.count = 0
+
+ def update(self, val, n=1):
+ self.val = val
+ self.sum += val * n
+ self.count += n
+ self.avg = self.sum / self.count
+
+
+def accuracy(output, target, topk=(1,)):
+ """Computes the precision@k for the specified values of k"""
+ maxk = max(topk)
+ batch_size = target.size(0)
+
+ _, pred = output.topk(maxk, 1, True, True)
+ pred = pred.t()
+ correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+ res = []
+ for k in topk:
+ correct_k = correct[:k].reshape(-1).float().sum(0)
+ res.append(correct_k.mul_(100.0 / batch_size))
+ return res
+
+
+def get_outdir(path, *paths, inc=False):
+ outdir = os.path.join(path, *paths)
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+ elif inc:
+ count = 1
+ outdir_inc = outdir + '-' + str(count)
+ while os.path.exists(outdir_inc):
+ count = count + 1
+ outdir_inc = outdir + '-' + str(count)
+ assert count < 100
+ outdir = outdir_inc
+ os.makedirs(outdir)
+ return outdir
+
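A quick sketch of how AverageMeter and accuracy above combine in a validation loop (the tiny hand-built logits and targets are illustrative only):

```python
import torch
from utils import accuracy, AverageMeter  # the same imports validate.py uses below

top1 = AverageMeter()
logits = torch.tensor([[0.1, 0.9, 0.0],
                       [0.8, 0.15, 0.05]])  # 2 samples, 3 classes
target = torch.tensor([1, 2])               # sample 0 is classified correctly, sample 1 is not
prec1, prec2 = accuracy(logits, target, topk=(1, 2))
top1.update(prec1.item(), n=logits.size(0))
print(prec1.item(), prec2.item(), top1.avg)  # 50.0 50.0 50.0
```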
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/validate.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced583562887e458790a57c70d4e57ffa36c4955
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/models/submodules/efficientnet_repo/validate.py
@@ -0,0 +1,166 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+from contextlib import suppress
+
+import geffnet
+from data import Dataset, create_loader, resolve_data_config
+from utils import accuracy, AverageMeter
+
+has_native_amp = False
+try:
+ if getattr(torch.cuda.amp, 'autocast') is not None:
+ has_native_amp = True
+except AttributeError:
+ pass
+
+torch.backends.cudnn.benchmark = True
+
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation')
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--model', '-m', metavar='MODEL', default='spnasnet1_00',
+ help='model architecture (default: spnasnet1_00)')
+parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+ help='number of data loading workers (default: 4)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+ metavar='N', help='mini-batch size (default: 256)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+ help='Override std deviation of dataset')
+parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT',
+ help='Override default crop pct of 0.875')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('--num-classes', type=int, default=1000,
+ help='Number classes in dataset')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+ metavar='N', help='print frequency (default: 10)')
+parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
+ help='path to latest checkpoint (default: none)')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+ help='use pre-trained model')
+parser.add_argument('--torchscript', dest='torchscript', action='store_true',
+ help='convert model to torchscript for inference')
+parser.add_argument('--num-gpu', type=int, default=1,
+ help='Number of GPUS to use')
+parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true',
+ help='use tensorflow mnasnet preprocessing')
+parser.add_argument('--no-cuda', dest='no_cuda', action='store_true',
+ help='run validation on CPU (disable CUDA)')
+parser.add_argument('--channels-last', action='store_true', default=False,
+ help='Use channels_last memory layout')
+parser.add_argument('--amp', action='store_true', default=False,
+ help='Use native Torch AMP mixed precision.')
+
+
+def main():
+ args = parser.parse_args()
+
+ if not args.checkpoint and not args.pretrained:
+ args.pretrained = True
+
+ amp_autocast = suppress # do nothing
+ if args.amp:
+ if not has_native_amp:
+ print("Native Torch AMP is not available (requires torch >= 1.6), using FP32.")
+ else:
+ amp_autocast = torch.cuda.amp.autocast
+
+ # create model
+ model = geffnet.create_model(
+ args.model,
+ num_classes=args.num_classes,
+ in_chans=3,
+ pretrained=args.pretrained,
+ checkpoint_path=args.checkpoint,
+ scriptable=args.torchscript)
+
+ if args.channels_last:
+ model = model.to(memory_format=torch.channels_last)
+
+ if args.torchscript:
+ torch.jit.optimized_execution(True)
+ model = torch.jit.script(model)
+
+ print('Model %s created, param count: %d' %
+ (args.model, sum([m.numel() for m in model.parameters()])))
+
+ data_config = resolve_data_config(model, args)
+
+ criterion = nn.CrossEntropyLoss()
+
+ if not args.no_cuda:
+ if args.num_gpu > 1:
+ model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
+ else:
+ model = model.cuda()
+ criterion = criterion.cuda()
+
+ loader = create_loader(
+ Dataset(args.data, load_bytes=args.tf_preprocessing),
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ use_prefetcher=not args.no_cuda,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ crop_pct=data_config['crop_pct'],
+ tensorflow_preprocessing=args.tf_preprocessing)
+
+ batch_time = AverageMeter()
+ losses = AverageMeter()
+ top1 = AverageMeter()
+ top5 = AverageMeter()
+
+ model.eval()
+ end = time.time()
+ with torch.no_grad():
+ for i, (input, target) in enumerate(loader):
+ if not args.no_cuda:
+ target = target.cuda()
+ input = input.cuda()
+ if args.channels_last:
+ input = input.contiguous(memory_format=torch.channels_last)
+
+ # compute output
+ with amp_autocast():
+ output = model(input)
+ loss = criterion(output, target)
+
+ # measure accuracy and record loss
+ prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
+ losses.update(loss.item(), input.size(0))
+ top1.update(prec1.item(), input.size(0))
+ top5.update(prec5.item(), input.size(0))
+
+ # measure elapsed time
+ batch_time.update(time.time() - end)
+ end = time.time()
+
+ if i % args.print_freq == 0:
+ print('Test: [{0}/{1}]\t'
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t'
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
+ 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+ 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+ i, len(loader), batch_time=batch_time,
+ rate_avg=input.size(0) / batch_time.avg,
+ loss=losses, top1=top1, top5=top5))
+
+ print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format(
+ top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg))
+
+
+if __name__ == '__main__':
+ main()
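One detail worth calling out in main() above: amp_autocast defaults to contextlib.suppress, which with no arguments is a do-nothing context manager, so the inference loop can always be written as `with amp_autocast():` regardless of whether AMP is enabled. A small self-contained sketch of the same pattern (the toy Linear model is illustrative only):

```python
import torch
from contextlib import suppress

device = "cuda" if torch.cuda.is_available() else "cpu"
# suppress() is a no-op context manager, so the same `with` line works with or without AMP.
amp_autocast = torch.cuda.amp.autocast if device == "cuda" else suppress

model = torch.nn.Linear(8, 4).to(device)
x = torch.randn(2, 8, device=device)
with torch.no_grad(), amp_autocast():
    y = model(x)
print(y.dtype)  # torch.float16 under CUDA autocast, torch.float32 on CPU
```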
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/utils/rotation.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/utils/rotation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecbc7fd93aae75b3388f0d37132a3ee679a25a05
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/utils/rotation.py
@@ -0,0 +1,85 @@
+import torch
+import numpy as np
+
+
+# NOTE: from PyTorch3D
+def axis_angle_to_quaternion(axis_angle: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as axis/angle to quaternions.
+
+ Args:
+ axis_angle: Rotations given as a vector in axis angle form,
+ as a tensor of shape (..., 3), where the magnitude is
+ the angle turned anticlockwise in radians around the
+ vector's direction.
+
+ Returns:
+ quaternions with real part first, as tensor of shape (..., 4).
+ """
+ angles = torch.norm(axis_angle, p=2, dim=-1, keepdim=True)
+ half_angles = angles * 0.5
+ eps = 1e-6
+ small_angles = angles.abs() < eps
+ sin_half_angles_over_angles = torch.empty_like(angles)
+ sin_half_angles_over_angles[~small_angles] = (
+ torch.sin(half_angles[~small_angles]) / angles[~small_angles]
+ )
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
+ sin_half_angles_over_angles[small_angles] = (
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48
+ )
+ quaternions = torch.cat(
+ [torch.cos(half_angles), axis_angle * sin_half_angles_over_angles], dim=-1
+ )
+ return quaternions
+
+
+# NOTE: from PyTorch3D
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as quaternions to rotation matrices.
+
+ Args:
+ quaternions: quaternions with real part first,
+ as tensor of shape (..., 4).
+
+ Returns:
+ Rotation matrices as tensor of shape (..., 3, 3).
+ """
+ r, i, j, k = torch.unbind(quaternions, -1)
+ # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
+
+ o = torch.stack(
+ (
+ 1 - two_s * (j * j + k * k),
+ two_s * (i * j - k * r),
+ two_s * (i * k + j * r),
+ two_s * (i * j + k * r),
+ 1 - two_s * (i * i + k * k),
+ two_s * (j * k - i * r),
+ two_s * (i * k - j * r),
+ two_s * (j * k + i * r),
+ 1 - two_s * (i * i + j * j),
+ ),
+ -1,
+ )
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
+
+
+# NOTE: from PyTorch3D
+def axis_angle_to_matrix(axis_angle: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as axis/angle to rotation matrices.
+
+ Args:
+ axis_angle: Rotations given as a vector in axis angle form,
+ as a tensor of shape (..., 3), where the magnitude is
+ the angle turned anticlockwise in radians around the
+ vector's direction.
+
+ Returns:
+ Rotation matrices as tensor of shape (..., 3, 3).
+ """
+ return quaternion_to_matrix(axis_angle_to_quaternion(axis_angle))
\ No newline at end of file
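As a sanity check on the conversions above, a 90-degree rotation about the z-axis should map to the familiar rotation matrix; a minimal sketch (the custom_controlnet_aux.dsine.utils.rotation import path is assumed from this repository layout):

```python
import math
import torch
from custom_controlnet_aux.dsine.utils.rotation import axis_angle_to_matrix  # path assumed

# pi/2 radians about the z-axis, encoded as axis * angle.
axis_angle = torch.tensor([0.0, 0.0, math.pi / 2])
R = axis_angle_to_matrix(axis_angle)
print(R.round(decimals=3))
# tensor([[ 0., -1.,  0.],
#         [ 1.,  0.,  0.],
#         [ 0.,  0.,  1.]])
```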
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/utils/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..70193d015ca800c28d089defc22d9ef04e07b785
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dsine/utils/utils.py
@@ -0,0 +1,105 @@
+""" utils
+"""
+import os
+import torch
+import numpy as np
+
+
+def load_checkpoint(fpath, model):
+ print('loading checkpoint... {}'.format(fpath))
+
+ ckpt = torch.load(fpath, map_location='cpu')['model']
+
+ load_dict = {}
+ for k, v in ckpt.items():
+ if k.startswith('module.'):
+ k_ = k.replace('module.', '')
+ load_dict[k_] = v
+ else:
+ load_dict[k] = v
+
+ model.load_state_dict(load_dict)
+ print('loading checkpoint... / done')
+ return model
+
+
+def compute_normal_error(pred_norm, gt_norm):
+ pred_error = torch.cosine_similarity(pred_norm, gt_norm, dim=1)
+ pred_error = torch.clamp(pred_error, min=-1.0, max=1.0)
+ pred_error = torch.acos(pred_error) * 180.0 / np.pi
+ pred_error = pred_error.unsqueeze(1) # (B, 1, H, W)
+ return pred_error
+
+
+def compute_normal_metrics(total_normal_errors):
+ total_normal_errors = total_normal_errors.detach().cpu().numpy()
+ num_pixels = total_normal_errors.shape[0]
+
+ metrics = {
+ 'mean': np.average(total_normal_errors),
+ 'median': np.median(total_normal_errors),
+ 'rmse': np.sqrt(np.sum(total_normal_errors * total_normal_errors) / num_pixels),
+ 'a1': 100.0 * (np.sum(total_normal_errors < 5) / num_pixels),
+ 'a2': 100.0 * (np.sum(total_normal_errors < 7.5) / num_pixels),
+ 'a3': 100.0 * (np.sum(total_normal_errors < 11.25) / num_pixels),
+ 'a4': 100.0 * (np.sum(total_normal_errors < 22.5) / num_pixels),
+ 'a5': 100.0 * (np.sum(total_normal_errors < 30) / num_pixels)
+ }
+
+ return metrics
+
+
+def pad_input(orig_H, orig_W):
+ if orig_W % 32 == 0:
+ l = 0
+ r = 0
+ else:
+ new_W = 32 * ((orig_W // 32) + 1)
+ l = (new_W - orig_W) // 2
+ r = (new_W - orig_W) - l
+
+ if orig_H % 32 == 0:
+ t = 0
+ b = 0
+ else:
+ new_H = 32 * ((orig_H // 32) + 1)
+ t = (new_H - orig_H) // 2
+ b = (new_H - orig_H) - t
+ return l, r, t, b
+
+
+def get_intrins_from_fov(new_fov, H, W, device):
+ # NOTE: top-left pixel should be (0,0)
+ if W >= H:
+ new_fu = (W / 2.0) / np.tan(np.deg2rad(new_fov / 2.0))
+ new_fv = (W / 2.0) / np.tan(np.deg2rad(new_fov / 2.0))
+ else:
+ new_fu = (H / 2.0) / np.tan(np.deg2rad(new_fov / 2.0))
+ new_fv = (H / 2.0) / np.tan(np.deg2rad(new_fov / 2.0))
+
+ new_cu = (W / 2.0) - 0.5
+ new_cv = (H / 2.0) - 0.5
+
+ new_intrins = torch.tensor([
+ [new_fu, 0, new_cu ],
+ [0, new_fv, new_cv ],
+ [0, 0, 1 ]
+ ], dtype=torch.float32, device=device)
+
+ return new_intrins
+
+
+def get_intrins_from_txt(intrins_path, device):
+ # NOTE: top-left pixel should be (0,0)
+ with open(intrins_path, 'r') as f:
+ intrins_ = f.readlines()[0].split()[0].split(',')
+ intrins_ = [float(i) for i in intrins_]
+ fx, fy, cx, cy = intrins_
+
+ intrins = torch.tensor([
+ [fx, 0,cx],
+ [ 0,fy,cy],
+ [ 0, 0, 1]
+ ], dtype=torch.float32, device=device)
+
+ return intrins
\ No newline at end of file
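Note that pad_input above only computes the left/right/top/bottom amounts needed to reach a multiple of 32; the caller still has to apply them, e.g. with torch.nn.functional.pad. A hedged sketch (import path assumed from this repository layout, sizes illustrative):

```python
import torch
import torch.nn.functional as F
from custom_controlnet_aux.dsine.utils.utils import pad_input, get_intrins_from_fov  # path assumed

orig_H, orig_W = 480, 500                  # width is not a multiple of 32
l, r, t, b = pad_input(orig_H, orig_W)     # -> (6, 6, 0, 0): pad width from 500 up to 512
img = torch.zeros(1, 3, orig_H, orig_W)
img = F.pad(img, (l, r, t, b), mode="constant", value=0.0)
print(img.shape)                           # torch.Size([1, 3, 480, 512])

K = get_intrins_from_fov(new_fov=60.0, H=orig_H, W=orig_W, device="cpu")
print(K)                                   # 3x3 pinhole intrinsics, principal point at the image centre
```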
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..9df4f7f1b9e35ee40d387e765cebdc7d2af06a5e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/LICENSE
@@ -0,0 +1,108 @@
+OPENPOSE: MULTIPERSON KEYPOINT DETECTION
+SOFTWARE LICENSE AGREEMENT
+ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
+
+BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
+
+This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
+
+RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
+Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
+non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
+
+CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
+
+COPYRIGHT: The Software is owned by Licensor and is protected by United
+States copyright laws and applicable international treaties and/or conventions.
+
+PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
+
+DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
+
+BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
+
+USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.
+
+You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
+
+ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
+
+TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
+
+The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
+
+FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
+
+DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
+
+SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
+
+EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
+
+EXPORT REGULATION: Licensee agrees to comply with any and all applicable
+U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
+
+SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
+
+NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
+
+GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.
+
+ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
+
+
+
+************************************************************************
+
+THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
+
+This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
+
+1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
+
+COPYRIGHT
+
+All contributions by the University of California:
+Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014-2017, the respective contributors
+All rights reserved.
+
+Caffe uses a shared copyright model: each contributor holds copyright over
+their contributions to Caffe. The project versioning records all such
+contribution and copyright details. If a contributor wants to further mark
+their specific copyright on a particular contribution, they should indicate
+their copyright solely in the commit message of the change when it is
+committed.
+
+LICENSE
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+CONTRIBUTION AGREEMENT
+
+By contributing to the BVLC/caffe repository through pull-request, comment,
+or otherwise, the contributor releases their content to the
+license and copyright terms herein.
+
+************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a56359a660d236668b954db8c32a225b52bd5f8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__init__.py
@@ -0,0 +1,329 @@
+# Openpose
+# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
+# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
+# 3rd Edited by ControlNet
+# 4th Edited by ControlNet (added face and correct hands)
+# 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixes)
+# This preprocessor is licensed by CMU for non-commercial use only.
+
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+import json
+import torch
+import numpy as np
+from . import util
+from .body import Body, BodyResult, Keypoint
+from .hand import Hand
+from .face import Face
+from .types import PoseResult, HandResult, FaceResult, AnimalPoseResult
+from huggingface_hub import hf_hub_download
+from .wholebody import Wholebody
+import warnings
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate, custom_hf_download
+import cv2
+from PIL import Image
+from .animalpose import AnimalPoseImage
+
+from typing import Tuple, List, Callable, Union, Optional
+
+
+def draw_animalposes(animals: list[list[Keypoint]], H: int, W: int) -> np.ndarray:
+ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
+ for animal_pose in animals:
+ canvas = draw_animalpose(canvas, animal_pose)
+ return canvas
+
+
+def draw_animalpose(canvas: np.ndarray, keypoints: list[Keypoint]) -> np.ndarray:
+ # order of the keypoints for AP10k and a standardized list of colors for limbs
+ keypointPairsList = [
+ (1, 2),
+ (2, 3),
+ (1, 3),
+ (3, 4),
+ (4, 9),
+ (9, 10),
+ (10, 11),
+ (4, 6),
+ (6, 7),
+ (7, 8),
+ (4, 5),
+ (5, 15),
+ (15, 16),
+ (16, 17),
+ (5, 12),
+ (12, 13),
+ (13, 14),
+ ]
+ colorsList = [
+ (255, 255, 255),
+ (100, 255, 100),
+ (150, 255, 255),
+ (100, 50, 255),
+ (50, 150, 200),
+ (0, 255, 255),
+ (0, 150, 0),
+ (0, 0, 255),
+ (0, 0, 150),
+ (255, 50, 255),
+ (255, 0, 255),
+ (255, 0, 0),
+ (150, 0, 0),
+ (255, 255, 100),
+ (0, 150, 0),
+ (255, 255, 0),
+ (150, 150, 150),
+ ] # 16 colors needed
+
+ for ind, (i, j) in enumerate(keypointPairsList):
+ p1 = keypoints[i - 1]
+ p2 = keypoints[j - 1]
+
+ if p1 is not None and p2 is not None:
+ cv2.line(
+ canvas,
+ (int(p1.x), int(p1.y)),
+ (int(p2.x), int(p2.y)),
+ colorsList[ind],
+ 5,
+ )
+ return canvas
+
+
+def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True, xinsr_stick_scaling=False):
+ """
+ Draw the detected poses on an empty canvas.
+
+ Args:
+ poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
+ H (int): The height of the canvas.
+ W (int): The width of the canvas.
+ draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
+ draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
+ draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.
+
+ Returns:
+ numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
+ """
+ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
+
+ for pose in poses:
+ if draw_body:
+ canvas = util.draw_bodypose(canvas, pose.body.keypoints, xinsr_stick_scaling)
+
+ if draw_hand:
+ canvas = util.draw_handpose(canvas, pose.left_hand)
+ canvas = util.draw_handpose(canvas, pose.right_hand)
+
+ if draw_face:
+ canvas = util.draw_facepose(canvas, pose.face)
+
+ return canvas
+
+
+def decode_json_as_poses(
+ pose_json: dict,
+) -> Tuple[List[PoseResult], List[AnimalPoseResult], int, int]:
+ """Decode a pose dict complying with the openpose JSON output format
+ to poses that controlnet recognizes.
+ https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md
+
+ Args:
+ pose_json: The parsed openpose JSON dict to decode.
+
+ Returns:
+ human_poses
+ animal_poses
+ canvas_height
+ canvas_width
+ """
+ height = pose_json["canvas_height"]
+ width = pose_json["canvas_width"]
+
+ def chunks(lst, n):
+ """Yield successive n-sized chunks from lst."""
+ for i in range(0, len(lst), n):
+ yield lst[i : i + n]
+
+ def decompress_keypoints(
+ numbers: Optional[List[float]],
+ ) -> Optional[List[Optional[Keypoint]]]:
+ if not numbers:
+ return None
+
+ assert len(numbers) % 3 == 0
+
+ def create_keypoint(x, y, c):
+ if c < 1.0:
+ return None
+ keypoint = Keypoint(x, y)
+ return keypoint
+
+ return [create_keypoint(x, y, c) for x, y, c in chunks(numbers, n=3)]
+
+ return (
+ [
+ PoseResult(
+ body=BodyResult(
+ keypoints=decompress_keypoints(pose.get("pose_keypoints_2d"))
+ ),
+ left_hand=decompress_keypoints(pose.get("hand_left_keypoints_2d")),
+ right_hand=decompress_keypoints(pose.get("hand_right_keypoints_2d")),
+ face=decompress_keypoints(pose.get("face_keypoints_2d")),
+ )
+ for pose in pose_json.get("people", [])
+ ],
+ [decompress_keypoints(pose) for pose in pose_json.get("animals", [])],
+ height,
+ width,
+ )
+
+
+def encode_poses_as_dict(poses: List[PoseResult], canvas_height: int, canvas_width: int) -> dict:
+ """ Encode the pose as a dict following openpose JSON output format:
+ https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md
+ """
+ def compress_keypoints(keypoints: Union[List[Keypoint], None]) -> Union[List[float], None]:
+ if not keypoints:
+ return None
+
+ return [
+ value
+ for keypoint in keypoints
+ for value in (
+ [float(keypoint.x), float(keypoint.y), 1.0]
+ if keypoint is not None
+ else [0.0, 0.0, 0.0]
+ )
+ ]
+
+ return {
+ 'people': [
+ {
+ 'pose_keypoints_2d': compress_keypoints(pose.body.keypoints),
+ "face_keypoints_2d": compress_keypoints(pose.face),
+ "hand_left_keypoints_2d": compress_keypoints(pose.left_hand),
+ "hand_right_keypoints_2d":compress_keypoints(pose.right_hand),
+ }
+ for pose in poses
+ ],
+ 'canvas_height': canvas_height,
+ 'canvas_width': canvas_width,
+ }
+
+global_cached_dwpose = Wholebody()
+
+class DwposeDetector:
+ """
+ A class for detecting human poses in images using the Dwpose model.
+
+ Attributes:
+ model_dir (str): Path to the directory where the pose models are stored.
+ """
+ def __init__(self, dw_pose_estimation):
+ self.dw_pose_estimation = dw_pose_estimation
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path, pretrained_det_model_or_path=None, det_filename=None, pose_filename=None, torchscript_device="cuda"):
+ global global_cached_dwpose
+ pretrained_det_model_or_path = pretrained_det_model_or_path or pretrained_model_or_path
+ det_filename = det_filename or "yolox_l.onnx"
+ pose_filename = pose_filename or "dw-ll_ucoco_384.onnx"
+ det_model_path = custom_hf_download(pretrained_det_model_or_path, det_filename)
+ pose_model_path = custom_hf_download(pretrained_model_or_path, pose_filename)
+
+ print(f"\nDWPose: Using {det_filename} for bbox detection and {pose_filename} for pose estimation")
+ if global_cached_dwpose.det is None or global_cached_dwpose.det_filename != det_filename:
+ t = Wholebody(det_model_path, None, torchscript_device=torchscript_device)
+ t.pose = global_cached_dwpose.pose
+ t.pose_filename = global_cached_dwpose.pose_filename
+ global_cached_dwpose = t
+
+ if global_cached_dwpose.pose is None or global_cached_dwpose.pose_filename != pose_filename:
+ t = Wholebody(None, pose_model_path, torchscript_device=torchscript_device)
+ t.det = global_cached_dwpose.det
+ t.det_filename = global_cached_dwpose.det_filename
+ global_cached_dwpose = t
+ return cls(global_cached_dwpose)
+
+ def detect_poses(self, oriImg) -> List[PoseResult]:
+ with torch.no_grad():
+ keypoints_info = self.dw_pose_estimation(oriImg.copy())
+ return Wholebody.format_result(keypoints_info)
+
+ def __call__(self, input_image, detect_resolution=512, include_body=True, include_hand=False, include_face=False, hand_and_face=None, output_type="pil", image_and_json=False, upscale_method="INTER_CUBIC", xinsr_stick_scaling=False, **kwargs):
+ if hand_and_face is not None:
+ warnings.warn("hand_and_face is deprecated. Use include_hand and include_face instead.", DeprecationWarning)
+ include_hand = hand_and_face
+ include_face = hand_and_face
+
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, _ = resize_image_with_pad(input_image, 0, upscale_method)
+ poses = self.detect_poses(input_image)
+
+ canvas = draw_poses(poses, input_image.shape[0], input_image.shape[1], draw_body=include_body, draw_hand=include_hand, draw_face=include_face, xinsr_stick_scaling=xinsr_stick_scaling)
+ canvas, remove_pad = resize_image_with_pad(canvas, detect_resolution, upscale_method)
+ detected_map = HWC3(remove_pad(canvas))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ if image_and_json:
+ return (detected_map, encode_poses_as_dict(poses, input_image.shape[0], input_image.shape[1]))
+
+ return detected_map
+
+global_cached_animalpose = AnimalPoseImage()
+class AnimalposeDetector:
+ """
+ A class for detecting animal poses in images using the RTMPose AP10k model.
+
+ Attributes:
+ model_dir (str): Path to the directory where the pose models are stored.
+ """
+ def __init__(self, animal_pose_estimation):
+ self.animal_pose_estimation = animal_pose_estimation
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path, pretrained_det_model_or_path=None, det_filename="yolox_l.onnx", pose_filename="dw-ll_ucoco_384.onnx", torchscript_device="cuda"):
+ global global_cached_animalpose
+ det_model_path = custom_hf_download(pretrained_det_model_or_path, det_filename)
+ pose_model_path = custom_hf_download(pretrained_model_or_path, pose_filename)
+
+ print(f"\nAnimalPose: Using {det_filename} for bbox detection and {pose_filename} for pose estimation")
+ if global_cached_animalpose.det is None or global_cached_animalpose.det_filename != det_filename:
+ t = AnimalPoseImage(det_model_path, None, torchscript_device=torchscript_device)
+ t.pose = global_cached_animalpose.pose
+ t.pose_filename = global_cached_animalpose.pose_filename
+ global_cached_animalpose = t
+
+ if global_cached_animalpose.pose is None or global_cached_animalpose.pose_filename != pose_filename:
+ t = AnimalPoseImage(None, pose_model_path, torchscript_device=torchscript_device)
+ t.det = global_cached_animalpose.det
+ t.det_filename = global_cached_animalpose.det_filename
+ global_cached_animalpose = t
+ return cls(global_cached_animalpose)
+
+ def __call__(self, input_image, detect_resolution=512, output_type="pil", image_and_json=False, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ result = self.animal_pose_estimation(input_image)
+ if result is None:
+ detected_map = np.zeros_like(input_image)
+ openpose_dict = {
+ 'version': 'ap10k',
+ 'animals': [],
+ 'canvas_height': input_image.shape[0],
+ 'canvas_width': input_image.shape[1]
+ }
+ else:
+ detected_map, openpose_dict = result
+ detected_map = remove_pad(detected_map)
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ if image_and_json:
+ return (detected_map, openpose_dict)
+
+ return detected_map
\ No newline at end of file
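encode_poses_as_dict and decode_json_as_poses above are intended to round-trip through the openpose JSON layout; a hedged sketch with one hand-built pose (import paths and the keyword construction of BodyResult/PoseResult are assumed to work exactly as decode_json_as_poses itself uses them):

```python
from custom_controlnet_aux.dwpose import encode_poses_as_dict, decode_json_as_poses  # paths assumed
from custom_controlnet_aux.dwpose.types import PoseResult, BodyResult, Keypoint

# One person with three body keypoints (one missing) and no hands/face.
body = BodyResult(keypoints=[Keypoint(120.0, 80.0), None, Keypoint(130.0, 160.0)])
pose = PoseResult(body=body, left_hand=None, right_hand=None, face=None)

as_dict = encode_poses_as_dict([pose], canvas_height=512, canvas_width=512)
people, animals, h, w = decode_json_as_poses(as_dict)
print(len(people), h, w)  # 1 512 512
```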
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/__init__.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36a3145fa61dff61b0ef48bef3741af1c6c7e840
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/animalpose.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/animalpose.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..354dac294c0162d33d3aa218461f3eed44e589c1
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/animalpose.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/body.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/body.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3390c686525276b4fbdfb0fdfe6bfeb7ae59bb2
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/body.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/face.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/face.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bfe5578330676d98cc74e2a9e637a64d241df86e
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/face.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/hand.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/hand.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..734c3400117fe402ee718c15fc6d829765ffa308
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/hand.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/model.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/model.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02a7f88ec84081cc821d70a4564e0c5d21f4038f
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/model.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/types.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/types.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b40aea2174b9129c41a6d6a718e4a46b5d31904
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/types.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/util.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/util.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b6f9cd4b8f713e6ed0da120f88bc053ec18e78b
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/util.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/wholebody.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/wholebody.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04cfc0393a331d3ce03bceca56a32f8c5006e78b
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/__pycache__/wholebody.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/animalpose.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/animalpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..f41d4342dd024e7aabca1e32d40ff529adf2e6b3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/animalpose.py
@@ -0,0 +1,271 @@
+import numpy as np
+import cv2
+import os
+from .dw_onnx.cv_ox_det import inference_detector as inference_onnx_yolox
+from .dw_onnx.cv_ox_yolo_nas import inference_detector as inference_onnx_yolo_nas
+from .dw_onnx.cv_ox_pose import inference_pose as inference_onnx_pose
+
+from .dw_torchscript.jit_det import inference_detector as inference_jit_yolox
+from .dw_torchscript.jit_pose import inference_pose as inference_jit_pose
+from typing import List, Optional
+from .types import PoseResult, BodyResult, Keypoint
+from custom_controlnet_aux.dwpose.util import guess_onnx_input_shape_dtype, get_ort_providers, get_model_type, is_model_torchscript
+from timeit import default_timer
+import torch
+
+def drawBetweenKeypoints(pose_img, keypoints, indexes, color, scaleFactor):
+ ind0 = indexes[0] - 1
+ ind1 = indexes[1] - 1
+
+ point1 = (keypoints[ind0][0], keypoints[ind0][1])
+ point2 = (keypoints[ind1][0], keypoints[ind1][1])
+
+ thickness = int(5 // scaleFactor)
+
+
+ cv2.line(pose_img, (int(point1[0]), int(point1[1])), (int(point2[0]), int(point2[1])), color, thickness)
+
+
+def drawBetweenKeypointsList(pose_img, keypoints, keypointPairsList, colorsList, scaleFactor):
+ for ind, keypointPair in enumerate(keypointPairsList):
+ drawBetweenKeypoints(pose_img, keypoints, keypointPair, colorsList[ind], scaleFactor)
+
+def drawBetweenSetofKeypointLists(pose_img, keypoints_set, keypointPairsList, colorsList, scaleFactor):
+ for keypoints in keypoints_set:
+ drawBetweenKeypointsList(pose_img, keypoints, keypointPairsList, colorsList, scaleFactor)
+
+
+def padImg(img, size, blackBorder=True):
+ left, right, top, bottom = 0, 0, 0, 0
+
+ # pad x
+ if img.shape[1] < size[1]:
+ sidePadding = int((size[1] - img.shape[1]) // 2)
+ left = sidePadding
+ right = sidePadding
+
+ # pad extra on right if padding needed is an odd number
+ if img.shape[1] % 2 == 1:
+ right += 1
+
+ # pad y
+ if img.shape[0] < size[0]:
+ topBottomPadding = int((size[0] - img.shape[0]) // 2)
+ top = topBottomPadding
+ bottom = topBottomPadding
+
+ # pad extra on bottom if padding needed is an odd number
+ if img.shape[0] % 2 == 1:
+ bottom += 1
+
+ if blackBorder:
+ paddedImg = cv2.copyMakeBorder(src=img, top=top, bottom=bottom, left=left, right=right, borderType=cv2.BORDER_CONSTANT, value=(0,0,0))
+ else:
+ paddedImg = cv2.copyMakeBorder(src=img, top=top, bottom=bottom, left=left, right=right, borderType=cv2.BORDER_REPLICATE)
+
+ return paddedImg
+
+def smartCrop(img, size, center):
+
+ width = img.shape[1]
+ height = img.shape[0]
+ xSize = size[1]
+ ySize = size[0]
+ xCenter = center[0]
+ yCenter = center[1]
+
+ if img.shape[0] > size[0] or img.shape[1] > size[1]:
+
+
+ leftMargin = xCenter - xSize//2
+ rightMargin = xCenter + xSize//2
+ upMargin = yCenter - ySize//2
+ downMargin = yCenter + ySize//2
+
+
+ if(leftMargin < 0):
+ xCenter += (-leftMargin)
+ if(rightMargin > width):
+ xCenter -= (rightMargin - width)
+
+ if(upMargin < 0):
+ yCenter -= -upMargin
+ if(downMargin > height):
+ yCenter -= (downMargin - height)
+
+
+ img = cv2.getRectSubPix(img, size, (xCenter, yCenter))
+
+
+
+ return img
+
+
+
+def calculateScaleFactor(img, size, poseSpanX, poseSpanY):
+
+ poseSpanX = max(poseSpanX, size[0])
+
+ scaleFactorX = 1
+
+
+ if poseSpanX > size[0]:
+ scaleFactorX = size[0] / poseSpanX
+
+ scaleFactorY = 1
+ if poseSpanY > size[1]:
+ scaleFactorY = size[1] / poseSpanY
+
+ scaleFactor = min(scaleFactorX, scaleFactorY)
+
+
+ return scaleFactor
+
+
+
+def scaleImg(img, size, poseSpanX, poseSpanY, scaleFactor):
+ scaledImg = img
+
+ scaledImg = cv2.resize(img, (0, 0), fx=scaleFactor, fy=scaleFactor)
+
+ return scaledImg, scaleFactor
+
+class AnimalPoseImage:
+ def __init__(self, det_model_path: Optional[str] = None, pose_model_path: Optional[str] = None, torchscript_device="cuda"):
+ self.det_filename = det_model_path and os.path.basename(det_model_path)
+ self.pose_filename = pose_model_path and os.path.basename(pose_model_path)
+ self.det, self.pose = None, None
+ # return type: None ort cv2 torchscript
+ self.det_model_type = get_model_type("AnimalPose",self.det_filename)
+ self.pose_model_type = get_model_type("AnimalPose",self.pose_filename)
+ # Always loads to CPU to avoid building OpenCV.
+ cv2_device = 'cpu'
+ cv2_backend = cv2.dnn.DNN_BACKEND_OPENCV if cv2_device == 'cpu' else cv2.dnn.DNN_BACKEND_CUDA
+ # You need to manually build OpenCV through cmake to work with your GPU.
+ cv2_providers = cv2.dnn.DNN_TARGET_CPU if cv2_device == 'cpu' else cv2.dnn.DNN_TARGET_CUDA
+ ort_providers = get_ort_providers()
+
+ if self.det_model_type is None:
+ pass
+ elif self.det_model_type == "ort":
+ try:
+ import onnxruntime as ort
+ self.det = ort.InferenceSession(det_model_path, providers=ort_providers)
+ except:
+ print(f"Failed to load onnxruntime with {ort_providers}.\nPlease change EP_list in the config.yaml and restart ComfyUI")
+ self.det = ort.InferenceSession(det_model_path, providers=["CPUExecutionProvider"])
+ elif self.det_model_type == "cv2":
+ try:
+ self.det = cv2.dnn.readNetFromONNX(det_model_path)
+ self.det.setPreferableBackend(cv2_backend)
+ self.det.setPreferableTarget(cv2_providers)
+ except:
+ print("TopK operators may not work on your OpenCV, try use onnxruntime with CPUExecutionProvider")
+ try:
+ import onnxruntime as ort
+ self.det = ort.InferenceSession(det_model_path, providers=["CPUExecutionProvider"])
+ except:
+ print(f"Failed to load {det_model_path}, you can use other models instead")
+ else:
+ self.det = torch.jit.load(det_model_path)
+ self.det.to(torchscript_device)
+
+ if self.pose_model_type is None:
+ pass
+ elif self.pose_model_type == "ort":
+ try:
+ import onnxruntime as ort
+ self.pose = ort.InferenceSession(pose_model_path, providers=ort_providers)
+ except:
+ print(f"Failed to load onnxruntime with {ort_providers}.\nPlease change EP_list in the config.yaml and restart ComfyUI")
+ self.pose = ort.InferenceSession(pose_model_path, providers=["CPUExecutionProvider"])
+ elif self.pose_model_type == "cv2":
+ self.pose = cv2.dnn.readNetFromONNX(pose_model_path)
+ self.pose.setPreferableBackend(cv2_backend)
+ self.pose.setPreferableTarget(cv2_providers)
+ else:
+ self.pose = torch.jit.load(pose_model_path)
+ self.pose.to(torchscript_device)
+
+ if self.pose_filename is not None:
+ self.pose_input_size, _ = guess_onnx_input_shape_dtype(self.pose_filename)
+
+ def __call__(self, oriImg):
+ detect_classes = list(range(14, 23 + 1)) #https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/datasets/coco.yaml
+
+ #Sacrifice accurate time measurement for compatibility
+ det_start = default_timer()
+ if is_model_torchscript(self.det):
+ det_result = inference_jit_yolox(self.det, oriImg, detect_classes=detect_classes)
+ else:
+ det_start = default_timer()
+ det_onnx_dtype = np.float32 if "yolox" in self.det_filename else np.uint8
+ if "yolox" in self.det_filename:
+ det_result = inference_onnx_yolox(self.det, oriImg, detect_classes=detect_classes, dtype=det_onnx_dtype)
+ else:
+ #FP16 and INT8 YOLO NAS accept uint8 input
+ det_result = inference_onnx_yolo_nas(self.det, oriImg, detect_classes=detect_classes, dtype=det_onnx_dtype)
+ print(f"AnimalPose: Bbox {((default_timer() - det_start) * 1000):.2f}ms")
+
+ if (det_result is None) or (det_result.shape[0] == 0):
+ openpose_dict = {
+ 'version': 'ap10k',
+ 'animals': [],
+ 'canvas_height': oriImg.shape[0],
+ 'canvas_width': oriImg.shape[1]
+ }
+ return np.zeros_like(oriImg), openpose_dict
+
+ pose_start = default_timer()
+ if is_model_torchscript(self.pose):
+ keypoint_sets, scores = inference_jit_pose(self.pose, det_result, oriImg, self.pose_input_size)
+ else:
+ pose_start = default_timer()
+ _, pose_onnx_dtype = guess_onnx_input_shape_dtype(self.pose_filename)
+ keypoint_sets, scores = inference_onnx_pose(self.pose, det_result, oriImg, self.pose_input_size, dtype=pose_onnx_dtype)
+ print(f"AnimalPose: Pose {((default_timer() - pose_start) * 1000):.2f}ms on {det_result.shape[0]} animals\n")
+
+ animal_kps_scores = []
+ pose_img = np.zeros((oriImg.shape[0], oriImg.shape[1], 3), dtype = np.uint8)
+ for (idx, keypoints) in enumerate(keypoint_sets):
+ # don't use keypoints that go outside the frame in calculations for the center
+ interiorKeypoints = keypoints[((keypoints[:,0] > 0) & (keypoints[:,0] < oriImg.shape[1])) & ((keypoints[:,1] > 0) & (keypoints[:,1] < oriImg.shape[0]))]
+
+ xVals = interiorKeypoints[:,0]
+ yVals = interiorKeypoints[:,1]
+
+ minX = np.amin(xVals)
+ minY = np.amin(yVals)
+ maxX = np.amax(xVals)
+ maxY = np.amax(yVals)
+
+ poseSpanX = maxX - minX
+ poseSpanY = maxY - minY
+
+ # find mean center
+
+ xSum = np.sum(xVals)
+ ySum = np.sum(yVals)
+
+ xCenter = xSum // xVals.shape[0]
+ yCenter = ySum // yVals.shape[0]
+ center_of_keypoints = (xCenter,yCenter)
+
+ # order of the keypoints for AP10k and a standardized list of colors for limbs
+ keypointPairsList = [(1,2), (2,3), (1,3), (3,4), (4,9), (9,10), (10,11), (4,6), (6,7), (7,8), (4,5), (5,15), (15,16), (16,17), (5,12), (12,13), (13,14)]
+ colorsList = [(255,255,255), (100,255,100), (150,255,255), (100,50,255), (50,150,200), (0,255,255), (0,150,0), (0,0,255), (0,0,150), (255,50,255), (255,0,255), (255,0,0), (150,0,0), (255,255,100), (0,150,0), (255,255,0), (150,150,150)] # 16 colors needed
+
+ drawBetweenKeypointsList(pose_img, keypoints, keypointPairsList, colorsList, scaleFactor=1.0)
+ score = scores[idx, ..., None]
+ score[score > 1.0] = 1.0
+ score[score < 0.0] = 0.0
+ animal_kps_scores.append(np.concatenate((keypoints, score), axis=-1))
+
+ openpose_dict = {
+ 'version': 'ap10k',
+ 'animals': [keypoints.tolist() for keypoints in animal_kps_scores],
+ 'canvas_height': oriImg.shape[0],
+ 'canvas_width': oriImg.shape[1]
+ }
+ return pose_img, openpose_dict
\ No newline at end of file
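The padImg helper above centre-pads an image with cv2.copyMakeBorder; a small sketch on a dummy array (import path assumed from this repository layout):

```python
import numpy as np
from custom_controlnet_aux.dwpose.animalpose import padImg  # path assumed

img = np.zeros((300, 400, 3), dtype=np.uint8)             # H=300, W=400
padded = padImg(img, size=(512, 512), blackBorder=True)   # centre-pad up to 512x512 with black borders
print(padded.shape)                                       # (512, 512, 3)
```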
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/body.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/body.py
new file mode 100644
index 0000000000000000000000000000000000000000..398d190312086013928e9e15f72bbcb2e958a6c7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/body.py
@@ -0,0 +1,261 @@
+import cv2
+import numpy as np
+import math
+import time
+from scipy.ndimage.filters import gaussian_filter
+import matplotlib.pyplot as plt
+import matplotlib
+import torch
+from torchvision import transforms
+from typing import NamedTuple, List, Union
+
+from . import util
+from .model import bodypose_model
+from .types import Keypoint, BodyResult
+
+class Body(object):
+ def __init__(self, model_path):
+ self.model = bodypose_model()
+ # if torch.cuda.is_available():
+ # self.model = self.model.cuda()
+ # print('cuda')
+ model_dict = util.transfer(self.model, torch.load(model_path))
+ self.model.load_state_dict(model_dict)
+ self.model.eval()
+
+ def __call__(self, oriImg):
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
+ scale_search = [0.5]
+ boxsize = 368
+ stride = 8
+ padValue = 128
+ thre1 = 0.1
+ thre2 = 0.05
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
+
+ for m in range(len(multiplier)):
+ scale = multiplier[m]
+ imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+ im = np.ascontiguousarray(im)
+
+ data = torch.from_numpy(im).float()
+ if torch.cuda.is_available():
+ data = data.cuda()
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
+ with torch.no_grad():
+ data = data.to(self.cn_device)
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
+
+ # extract outputs, resize, and remove padding
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))
+
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
+ paf = util.smart_resize_k(paf, fx=stride, fy=stride)
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))
+
+ heatmap_avg = heatmap_avg + heatmap / len(multiplier)
+ paf_avg = paf_avg + paf / len(multiplier)
+
+ all_peaks = []
+ peak_counter = 0
+
+ for part in range(18):
+ map_ori = heatmap_avg[:, :, part]
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
+
+ map_left = np.zeros(one_heatmap.shape)
+ map_left[1:, :] = one_heatmap[:-1, :]
+ map_right = np.zeros(one_heatmap.shape)
+ map_right[:-1, :] = one_heatmap[1:, :]
+ map_up = np.zeros(one_heatmap.shape)
+ map_up[:, 1:] = one_heatmap[:, :-1]
+ map_down = np.zeros(one_heatmap.shape)
+ map_down[:, :-1] = one_heatmap[:, 1:]
+
+ peaks_binary = np.logical_and.reduce(
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
+ peak_id = range(peak_counter, peak_counter + len(peaks))
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
+
+ all_peaks.append(peaks_with_score_and_id)
+ peak_counter += len(peaks)
+
+ # find connection in the specified sequence, center 29 is in the position 15
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
+ [1, 16], [16, 18], [3, 17], [6, 18]]
+ # the middle joints heatmap correpondence
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
+ [55, 56], [37, 38], [45, 46]]
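+ # mapIdx lists, for each limb in limbSeq, the pair of PAF channels (x and y components).
+ # The indices follow the combined 57-channel layout of the original OpenPose output
+ # (channels 0-18 are keypoint heatmaps, 19-56 are PAFs), hence the `x - 19` offset
+ # below when indexing the separate paf_avg array.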
+
+ connection_all = []
+ special_k = []
+ mid_num = 10
+
+ for k in range(len(mapIdx)):
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
+ candA = all_peaks[limbSeq[k][0] - 1]
+ candB = all_peaks[limbSeq[k][1] - 1]
+ nA = len(candA)
+ nB = len(candB)
+ indexA, indexB = limbSeq[k]
+ if (nA != 0 and nB != 0):
+ connection_candidate = []
+ for i in range(nA):
+ for j in range(nB):
+ vec = np.subtract(candB[j][:2], candA[i][:2])
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
+ norm = max(0.001, norm)
+ vec = np.divide(vec, norm)
+
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
+
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
+ for I in range(len(startend))])
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
+ for I in range(len(startend))])
+
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
+ criterion2 = score_with_dist_prior > 0
+ if criterion1 and criterion2:
+ connection_candidate.append(
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
+
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
+ connection = np.zeros((0, 5))
+ for c in range(len(connection_candidate)):
+ i, j, s = connection_candidate[c][0:3]
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
+ if (len(connection) >= min(nA, nB)):
+ break
+
+ connection_all.append(connection)
+ else:
+ special_k.append(k)
+ connection_all.append([])
+
+ # last number in each row is the total parts number of that person
+ # the second last number in each row is the score of the overall configuration
+ subset = -1 * np.ones((0, 20))
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
+
+ for k in range(len(mapIdx)):
+ if k not in special_k:
+ partAs = connection_all[k][:, 0]
+ partBs = connection_all[k][:, 1]
+ indexA, indexB = np.array(limbSeq[k]) - 1
+
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
+ found = 0
+ subset_idx = [-1, -1]
+ for j in range(len(subset)): # 1:size(subset,1):
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
+ subset_idx[found] = j
+ found += 1
+
+ if found == 1:
+ j = subset_idx[0]
+ if subset[j][indexB] != partBs[i]:
+ subset[j][indexB] = partBs[i]
+ subset[j][-1] += 1
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+ elif found == 2: # if found 2 and disjoint, merge them
+ j1, j2 = subset_idx
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
+ subset[j1][-2:] += subset[j2][-2:]
+ subset[j1][-2] += connection_all[k][i][2]
+ subset = np.delete(subset, j2, 0)
+ else: # same handling as the found == 1 case
+ subset[j1][indexB] = partBs[i]
+ subset[j1][-1] += 1
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+
+ # if partA is not found in any existing subset, create a new one
+ elif not found and k < 17:
+ row = -1 * np.ones(20)
+ row[indexA] = partAs[i]
+ row[indexB] = partBs[i]
+ row[-1] = 2
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
+ subset = np.vstack([subset, row])
+ # delete rows of subset that have too few detected parts or a low average score
+ deleteIdx = []
+ for i in range(len(subset)):
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
+ deleteIdx.append(i)
+ subset = np.delete(subset, deleteIdx, axis=0)
+
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
+ # candidate: x, y, score, id
+ return candidate, subset
+
+ @staticmethod
+ def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
+ """
+ Format the body results from the candidate and subset arrays into a list of BodyResult objects.
+
+ Args:
+ candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
+ for each body part.
+ subset (np.ndarray): An array of subsets containing indices to the candidate array for each
+ person detected. The last two columns of each row hold the total score and total parts
+ of the person.
+
+ Returns:
+ List[BodyResult]: A list of BodyResult objects, where each object represents a person with
+ detected keypoints, total score, and total parts.
+ """
+ return [
+ BodyResult(
+ keypoints=[
+ Keypoint(
+ x=candidate[candidate_index][0],
+ y=candidate[candidate_index][1],
+ score=candidate[candidate_index][2],
+ id=candidate[candidate_index][3]
+ ) if candidate_index != -1 else None
+ for candidate_index in person[:18].astype(int)
+ ],
+ total_score=person[18],
+ total_parts=person[19]
+ )
+ for person in subset
+ ]
+
+
+if __name__ == "__main__":
+ body_estimation = Body('../model/body_pose_model.pth')
+
+ test_image = '../images/ski.jpg'
+ oriImg = cv2.imread(test_image) # B,G,R order
+ candidate, subset = body_estimation(oriImg)
+ bodies = body_estimation.format_body_result(candidate, subset)
+
+ canvas = oriImg
+ for body in bodies:
+ canvas = util.draw_bodypose(canvas, body)
+
+ plt.imshow(canvas[:, :, [2, 1, 0]])
+ plt.show()
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e7a7f594ef441479257c788e4c0d6e08657fc8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__init__.py
@@ -0,0 +1 @@
+#Dummy file ensuring this package will be recognized
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/__init__.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b294fd1043ca3a67146b8df4872b5d1948e249e5
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_det.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_det.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dcd504974442eadffae6a523535a85f974bf0ab0
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_det.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_pose.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_pose.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4e8b06952eeeef6fe18797e941aae1f351122df
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_pose.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_yolo_nas.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_yolo_nas.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49562a368af2aa9a9ce2fe9569a4515d4f21acbf
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/__pycache__/cv_ox_yolo_nas.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_det.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_det.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ee4ad9e4f6d8a8111683f488a68e0ff453dd3ad
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_det.py
@@ -0,0 +1,129 @@
+import cv2
+import numpy as np
+
+def nms(boxes, scores, nms_thr):
+ """Single class NMS implemented in Numpy."""
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ order = scores.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ i = order[0]
+ keep.append(i)
+ xx1 = np.maximum(x1[i], x1[order[1:]])
+ yy1 = np.maximum(y1[i], y1[order[1:]])
+ xx2 = np.minimum(x2[i], x2[order[1:]])
+ yy2 = np.minimum(y2[i], y2[order[1:]])
+
+ w = np.maximum(0.0, xx2 - xx1 + 1)
+ h = np.maximum(0.0, yy2 - yy1 + 1)
+ inter = w * h
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+ inds = np.where(ovr <= nms_thr)[0]
+ order = order[inds + 1]
+
+ return keep
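+
+# Illustrative example (values made up for this note, not part of the model code):
+#   boxes  = np.array([[10., 10., 50., 60.], [12., 11., 48., 58.]])
+#   scores = np.array([0.9, 0.8])
+#   nms(boxes, scores, nms_thr=0.45)  # -> [0]; the heavily overlapping lower-score box is dropped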
+
+def multiclass_nms(boxes, scores, nms_thr, score_thr):
+ """Multiclass NMS implemented in Numpy. Class-aware version."""
+ final_dets = []
+ num_classes = scores.shape[1]
+ for cls_ind in range(num_classes):
+ cls_scores = scores[:, cls_ind]
+ valid_score_mask = cls_scores > score_thr
+ if valid_score_mask.sum() == 0:
+ continue
+ else:
+ valid_scores = cls_scores[valid_score_mask]
+ valid_boxes = boxes[valid_score_mask]
+ keep = nms(valid_boxes, valid_scores, nms_thr)
+ if len(keep) > 0:
+ cls_inds = np.ones((len(keep), 1)) * cls_ind
+ dets = np.concatenate(
+ [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
+ )
+ final_dets.append(dets)
+ if len(final_dets) == 0:
+ return None
+ return np.concatenate(final_dets, 0)
+
+def demo_postprocess(outputs, img_size, p6=False):
+ grids = []
+ expanded_strides = []
+ strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
+
+ hsizes = [img_size[0] // stride for stride in strides]
+ wsizes = [img_size[1] // stride for stride in strides]
+
+ for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+ xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+ grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+ grids.append(grid)
+ shape = grid.shape[:2]
+ expanded_strides.append(np.full((*shape, 1), stride))
+
+ grids = np.concatenate(grids, 1)
+ expanded_strides = np.concatenate(expanded_strides, 1)
+ outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+ outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+ return outputs
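+
+# demo_postprocess decodes grid-relative predictions in the usual YOLOX fashion:
+#   cx, cy = (tx + grid_x) * stride, (ty + grid_y) * stride
+#   w,  h  = exp(tw) * stride,       exp(th) * stride
+# e.g. (illustrative numbers) a prediction at grid cell x=3 on the stride-8 level with
+# tx = 0.25 decodes to cx = (0.25 + 3) * 8 = 26.0 px in the 640x640 letterboxed image.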
+
+def preprocess(img, input_size, swap=(2, 0, 1)):
+ if len(img.shape) == 3:
+ padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+ else:
+ padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+ r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+ resized_img = cv2.resize(
+ img,
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
+ interpolation=cv2.INTER_LINEAR,
+ ).astype(np.uint8)
+ padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+ padded_img = padded_img.transpose(swap)
+ padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+ return padded_img, r
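+
+# Illustrative example: a 1080x1920 frame gives r = min(640/1080, 640/1920) = 1/3, is
+# resized to 360x640, and padded with the value 114 up to 640x640; the returned `ratio`
+# of 1/3 is what inference_detector later divides the boxes by to map them back to
+# original-image coordinates.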
+
+def inference_detector(session, oriImg, detect_classes=[0], dtype=np.float32):
+ input_shape = (640,640)
+ img, ratio = preprocess(oriImg, input_shape)
+
+ input = img[None, :, :, :]
+ input = input.astype(dtype)
+ if "InferenceSession" in type(session).__name__:
+ input_name = session.get_inputs()[0].name
+ output = session.run(None, {input_name: input})
+ else:
+ outNames = session.getUnconnectedOutLayersNames()
+ session.setInput(input)
+ output = session.forward(outNames)
+
+ predictions = demo_postprocess(output[0], input_shape)[0]
+
+ boxes = predictions[:, :4]
+ scores = predictions[:, 4:5] * predictions[:, 5:]
+
+ boxes_xyxy = np.ones_like(boxes)
+ boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
+ boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
+ boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
+ boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
+ boxes_xyxy /= ratio
+ dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
+ if dets is None:
+ return None
+ final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
+ isscore = final_scores > 0.3
+ iscat = np.isin(final_cls_inds, detect_classes)
+ isbbox = [i and j for (i, j) in zip(isscore, iscat)]
+ final_boxes = final_boxes[isbbox]
+ return final_boxes
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_pose.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_pose.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b4bb44c691bd624fa68030d1b07d9f9e737c378
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_pose.py
@@ -0,0 +1,363 @@
+from typing import List, Tuple
+
+import cv2
+import numpy as np
+
+def preprocess(
+ img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """Do preprocessing for DWPose model inference.
+
+ Args:
+ img (np.ndarray): Input image in shape (H, W, C).
+ out_bbox (list): Person bounding boxes in (x0, y0, x1, y1) format.
+ input_size (tuple): Model input size in shape (w, h).
+
+ Returns:
+ tuple:
+ - out_img (list[np.ndarray]): Preprocessed crop for each bbox.
+ - out_center (list[np.ndarray]): Center of each bbox.
+ - out_scale (list[np.ndarray]): Scale of each bbox.
+ """
+ # get shape of image
+ img_shape = img.shape[:2]
+ out_img, out_center, out_scale = [], [], []
+ if len(out_bbox) == 0:
+ out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
+ for i in range(len(out_bbox)):
+ x0 = out_bbox[i][0]
+ y0 = out_bbox[i][1]
+ x1 = out_bbox[i][2]
+ y1 = out_bbox[i][3]
+ bbox = np.array([x0, y0, x1, y1])
+
+ # get center and scale
+ center, scale = bbox_xyxy2cs(bbox, padding=1.25)
+
+ # do affine transformation
+ resized_img, scale = top_down_affine(input_size, scale, center, img)
+
+ # normalize image
+ mean = np.array([123.675, 116.28, 103.53])
+ std = np.array([58.395, 57.12, 57.375])
+ resized_img = (resized_img - mean) / std
+
+ out_img.append(resized_img)
+ out_center.append(center)
+ out_scale.append(scale)
+
+ return out_img, out_center, out_scale
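+
+# Note: the per-channel mean/std used in preprocess above are the standard ImageNet
+# statistics expressed in 0-255 pixel units (0.485/0.456/0.406 and 0.229/0.224/0.225
+# scaled by 255), presumably matching the normalization the DWPose/RTMPose checkpoints
+# were trained with.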
+
+
+def inference(sess, img, dtype=np.float32):
+ """Inference DWPose model. Processing all image segments at once to take advantage of GPU's parallelism ability if onnxruntime is installed
+
+ Args:
+ sess : ONNXRuntime session.
+ img : Input image in shape.
+
+ Returns:
+ outputs : Output of DWPose model.
+ """
+ all_out = []
+ # build input
+ input = np.stack(img, axis=0).transpose(0, 3, 1, 2)
+ input = input.astype(dtype)
+ if "InferenceSession" in type(sess).__name__:
+ input_name = sess.get_inputs()[0].name
+ all_outputs = sess.run(None, {input_name: input})
+ for batch_idx in range(len(all_outputs[0])):
+ outputs = [all_outputs[i][batch_idx:batch_idx+1,...] for i in range(len(all_outputs))]
+ all_out.append(outputs)
+ return all_out
+
+ # OpenCV's dnn backend does not support batched inference, so process crops one at a time
+ for i in range(len(img)):
+ input = img[i].transpose(2, 0, 1)
+ input = input[None, :, :, :]
+
+ outNames = sess.getUnconnectedOutLayersNames()
+ sess.setInput(input)
+ outputs = sess.forward(outNames)
+ all_out.append(outputs)
+
+ return all_out
+
+def postprocess(outputs: List[np.ndarray],
+ model_input_size: Tuple[int, int],
+ center: Tuple[int, int],
+ scale: Tuple[int, int],
+ simcc_split_ratio: float = 2.0
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ """Postprocess for DWPose model output.
+
+ Args:
+ outputs (np.ndarray): Output of RTMPose model.
+ model_input_size (tuple): RTMPose model Input image size.
+ center (tuple): Center of bbox in shape (x, y).
+ scale (tuple): Scale of bbox in shape (w, h).
+ simcc_split_ratio (float): Split ratio of simcc.
+
+ Returns:
+ tuple:
+ - keypoints (np.ndarray): Rescaled keypoints.
+ - scores (np.ndarray): Model predict scores.
+ """
+ all_key = []
+ all_score = []
+ for i in range(len(outputs)):
+ # use simcc to decode
+ simcc_x, simcc_y = outputs[i]
+ keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
+
+ # rescale keypoints
+ keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
+ all_key.append(keypoints[0])
+ all_score.append(scores[0])
+
+ return np.array(all_key), np.array(all_score)
+
+
+def bbox_xyxy2cs(bbox: np.ndarray,
+ padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
+ """Transform the bbox format from (x,y,w,h) into (center, scale)
+
+ Args:
+ bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
+ as (left, top, right, bottom)
+ padding (float): BBox padding factor that the scale is multiplied by.
+ Default: 1.0
+
+ Returns:
+ tuple: A tuple containing center and scale.
+ - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
+ (n, 2)
+ - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
+ (n, 2)
+ """
+ # convert single bbox from (4, ) to (1, 4)
+ dim = bbox.ndim
+ if dim == 1:
+ bbox = bbox[None, :]
+
+ # get bbox center and scale
+ x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
+ center = np.hstack([x1 + x2, y1 + y2]) * 0.5
+ scale = np.hstack([x2 - x1, y2 - y1]) * padding
+
+ if dim == 1:
+ center = center[0]
+ scale = scale[0]
+
+ return center, scale
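+
+# Illustrative example: bbox_xyxy2cs(np.array([100., 200., 300., 500.]), padding=1.25)
+# returns center = [200., 350.] and scale = [250., 375.] (w=200 and h=300, each
+# multiplied by the 1.25 padding factor).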
+
+
+def _fix_aspect_ratio(bbox_scale: np.ndarray,
+ aspect_ratio: float) -> np.ndarray:
+ """Extend the scale to match the given aspect ratio.
+
+ Args:
+ scale (np.ndarray): The image scale (w, h) in shape (2, )
+ aspect_ratio (float): The ratio of ``w/h``
+
+ Returns:
+ np.ndarray: The reshaped image scale in (2, )
+ """
+ w, h = np.hsplit(bbox_scale, [1])
+ bbox_scale = np.where(w > h * aspect_ratio,
+ np.hstack([w, w / aspect_ratio]),
+ np.hstack([h * aspect_ratio, h]))
+ return bbox_scale
+
+
+def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
+ """Rotate a point by an angle.
+
+ Args:
+ pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
+ angle_rad (float): rotation angle in radian
+
+ Returns:
+ np.ndarray: Rotated point in shape (2, )
+ """
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
+ rot_mat = np.array([[cs, -sn], [sn, cs]])
+ return rot_mat @ pt
+
+
+def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+ """To calculate the affine matrix, three pairs of points are required. This
+ function is used to get the 3rd point, given 2D points a & b.
+
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
+ anticlockwise, using b as the rotation center.
+
+ Args:
+ a (np.ndarray): The 1st point (x,y) in shape (2, )
+ b (np.ndarray): The 2nd point (x,y) in shape (2, )
+
+ Returns:
+ np.ndarray: The 3rd point.
+ """
+ direction = a - b
+ c = b + np.r_[-direction[1], direction[0]]
+ return c
+
+
+def get_warp_matrix(center: np.ndarray,
+ scale: np.ndarray,
+ rot: float,
+ output_size: Tuple[int, int],
+ shift: Tuple[float, float] = (0., 0.),
+ inv: bool = False) -> np.ndarray:
+ """Calculate the affine transformation matrix that can warp the bbox area
+ in the input image to the output size.
+
+ Args:
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
+ scale (np.ndarray[2, ]): Scale of the bounding box
+ wrt [width, height].
+ rot (float): Rotation angle (degree).
+ output_size (np.ndarray[2, ] | list(2,)): Size of the
+ destination heatmaps.
+ shift (0-100%): Shift translation ratio wrt the width/height.
+ Default (0., 0.).
+ inv (bool): Option to inverse the affine transform direction.
+ (inv=False: src->dst or inv=True: dst->src)
+
+ Returns:
+ np.ndarray: A 2x3 transformation matrix
+ """
+ shift = np.array(shift)
+ src_w = scale[0]
+ dst_w = output_size[0]
+ dst_h = output_size[1]
+
+ # compute transformation matrix
+ rot_rad = np.deg2rad(rot)
+ src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
+ dst_dir = np.array([0., dst_w * -0.5])
+
+ # get three reference points of the src rectangle in the original image
+ src = np.zeros((3, 2), dtype=np.float32)
+ src[0, :] = center + scale * shift
+ src[1, :] = center + src_dir + scale * shift
+ src[2, :] = _get_3rd_point(src[0, :], src[1, :])
+
+ # get three reference points of the dst rectangle in the model-input image
+ dst = np.zeros((3, 2), dtype=np.float32)
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+ dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
+
+ if inv:
+ warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+ else:
+ warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+ return warp_mat
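+
+# cv2.getAffineTransform needs exactly three non-collinear point correspondences, which
+# is why get_warp_matrix builds only three src/dst points: the box center, a second point
+# displaced from the center by half the box width (rotated by `rot`), and the 90-degree
+# completion produced by _get_3rd_point.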
+
+
+def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
+ img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """Get the bbox image as the model input by affine transform.
+
+ Args:
+ input_size (dict): The input size of the model.
+ bbox_scale (dict): The bbox scale of the img.
+ bbox_center (dict): The bbox center of the img.
+ img (np.ndarray): The original image.
+
+ Returns:
+ tuple: A tuple containing the transformed image and bbox scale.
+ - np.ndarray[float32]: img after affine transform.
+ - np.ndarray[float32]: bbox scale after affine transform.
+ """
+ w, h = input_size
+ warp_size = (int(w), int(h))
+
+ # reshape bbox to fixed aspect ratio
+ bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
+
+ # get the affine matrix
+ center = bbox_center
+ scale = bbox_scale
+ rot = 0
+ warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
+
+ # do affine transform
+ img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
+
+ return img, bbox_scale
+
+
+def get_simcc_maximum(simcc_x: np.ndarray,
+ simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """Get maximum response location and value from simcc representations.
+
+ Note:
+ instance number: N
+ num_keypoints: K
+ heatmap height: H
+ heatmap width: W
+
+ Args:
+ simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
+ simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
+
+ Returns:
+ tuple:
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
+ (K, 2) or (N, K, 2)
+ - vals (np.ndarray): values of maximum heatmap responses in shape
+ (K,) or (N, K)
+ """
+ N, K, Wx = simcc_x.shape
+ simcc_x = simcc_x.reshape(N * K, -1)
+ simcc_y = simcc_y.reshape(N * K, -1)
+
+ # get maximum value locations
+ x_locs = np.argmax(simcc_x, axis=1)
+ y_locs = np.argmax(simcc_y, axis=1)
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
+ max_val_x = np.amax(simcc_x, axis=1)
+ max_val_y = np.amax(simcc_y, axis=1)
+
+ # get maximum value across x and y axis
+ mask = max_val_x > max_val_y
+ max_val_x[mask] = max_val_y[mask]
+ vals = max_val_x
+ locs[vals <= 0.] = -1
+
+ # reshape
+ locs = locs.reshape(N, K, 2)
+ vals = vals.reshape(N, K)
+
+ return locs, vals
+
+
+def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
+ simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
+ """Modulate simcc distribution with Gaussian.
+
+ Args:
+ simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
+ simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
+ simcc_split_ratio (int): The split ratio of simcc.
+
+ Returns:
+ tuple: A tuple containing keypoints and scores.
+ - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
+ - np.ndarray[float32]: scores in shape (K,) or (n, K)
+ """
+ keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
+ keypoints /= simcc_split_ratio
+
+ return keypoints, scores
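+
+# With the default simcc_split_ratio of 2.0, each SimCC vector has (input dimension *
+# split ratio) bins: 576 x-bins and 768 y-bins for the default 288x384 (w, h) model input.
+# An argmax at x-bin 400 therefore corresponds to x = 200 px in model-input coordinates
+# before postprocess rescales it back to the original image.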
+
+
+def inference_pose(session, out_bbox, oriImg, model_input_size=(288, 384), dtype=np.float32):
+ resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
+ outputs = inference(session, resized_img, dtype)
+ keypoints, scores = postprocess(outputs, model_input_size, center, scale)
+
+ return keypoints, scores
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_yolo_nas.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_yolo_nas.py
new file mode 100644
index 0000000000000000000000000000000000000000..119e01bea70734b6777fef4f6a179fe85da18a70
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_onnx/cv_ox_yolo_nas.py
@@ -0,0 +1,60 @@
+# Source: https://github.com/Hyuto/yolo-nas-onnx/tree/master/yolo-nas-py
+# Inspired from: https://github.com/Deci-AI/super-gradients/blob/3.1.1/src/super_gradients/training/processing/processing.py
+
+import numpy as np
+import cv2
+
+def preprocess(img, input_size, swap=(2, 0, 1)):
+ if len(img.shape) == 3:
+ padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+ else:
+ padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+ r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+ resized_img = cv2.resize(
+ img,
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
+ interpolation=cv2.INTER_LINEAR,
+ ).astype(np.uint8)
+ padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+ padded_img = padded_img.transpose(swap)
+ padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+ return padded_img, r
+
+def inference_detector(session, oriImg, detect_classes=[0], dtype=np.uint8):
+ """
+ This function is only compatible with onnx models exported from the new API with built-in NMS
+ ```py
+ from super_gradients.conversion.conversion_enums import ExportQuantizationMode
+ from super_gradients.common.object_names import Models
+ from super_gradients.training import models
+
+ model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")
+
+ export_result = model.export(
+ "yolo_nas/yolo_nas_l_fp16.onnx",
+ quantization_mode=ExportQuantizationMode.FP16,
+ device="cuda"
+ )
+ ```
+ """
+ input_shape = (640,640)
+ img, ratio = preprocess(oriImg, input_shape)
+ input = img[None, :, :, :]
+ input = input.astype(dtype)
+ if "InferenceSession" in type(session).__name__:
+ input_name = session.get_inputs()[0].name
+ output = session.run(None, {input_name: input})
+ else:
+ outNames = session.getUnconnectedOutLayersNames()
+ session.setInput(input)
+ output = session.forward(outNames)
+ num_preds, pred_boxes, pred_scores, pred_classes = output
+ num_preds = num_preds[0,0]
+ if num_preds == 0:
+ return None
+ idxs = np.where((np.isin(pred_classes[0, :num_preds], detect_classes)) & (pred_scores[0, :num_preds] > 0.3))
+ if (len(idxs) == 0) or (idxs[0].size == 0):
+ return None
+ return pred_boxes[0, idxs].squeeze(axis=0) / ratio
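+
+# Layout assumed from the flat-format export with built-in NMS: `output` unpacks into
+# (num_predictions, pred_boxes, pred_scores, pred_classes) with boxes already in xyxy
+# order in letterboxed coordinates, which is why a single division by `ratio` is enough
+# to map them back to the original image.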
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e7a7f594ef441479257c788e4c0d6e08657fc8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__init__.py
@@ -0,0 +1 @@
+#Dummy file ensuring this package will be recognized
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/__init__.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..354d38b1752c88cd0b1c359d3dcbca0ca16bd25b
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/jit_det.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/jit_det.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4141623b36025792eada4e1c589355012c283714
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/jit_det.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/jit_pose.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/jit_pose.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d156f19079cc700d14d20fdc87c5e5596134ea1
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/__pycache__/jit_pose.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/jit_det.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/jit_det.py
new file mode 100644
index 0000000000000000000000000000000000000000..c95d1a9620c5b223706ed2bc7c4df14fce0455c2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/jit_det.py
@@ -0,0 +1,125 @@
+import cv2
+import numpy as np
+import torch
+
+def nms(boxes, scores, nms_thr):
+ """Single class NMS implemented in Numpy."""
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ order = scores.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ i = order[0]
+ keep.append(i)
+ xx1 = np.maximum(x1[i], x1[order[1:]])
+ yy1 = np.maximum(y1[i], y1[order[1:]])
+ xx2 = np.minimum(x2[i], x2[order[1:]])
+ yy2 = np.minimum(y2[i], y2[order[1:]])
+
+ w = np.maximum(0.0, xx2 - xx1 + 1)
+ h = np.maximum(0.0, yy2 - yy1 + 1)
+ inter = w * h
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+ inds = np.where(ovr <= nms_thr)[0]
+ order = order[inds + 1]
+
+ return keep
+
+def multiclass_nms(boxes, scores, nms_thr, score_thr):
+ """Multiclass NMS implemented in Numpy. Class-aware version."""
+ final_dets = []
+ num_classes = scores.shape[1]
+ for cls_ind in range(num_classes):
+ cls_scores = scores[:, cls_ind]
+ valid_score_mask = cls_scores > score_thr
+ if valid_score_mask.sum() == 0:
+ continue
+ else:
+ valid_scores = cls_scores[valid_score_mask]
+ valid_boxes = boxes[valid_score_mask]
+ keep = nms(valid_boxes, valid_scores, nms_thr)
+ if len(keep) > 0:
+ cls_inds = np.ones((len(keep), 1)) * cls_ind
+ dets = np.concatenate(
+ [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
+ )
+ final_dets.append(dets)
+ if len(final_dets) == 0:
+ return None
+ return np.concatenate(final_dets, 0)
+
+def demo_postprocess(outputs, img_size, p6=False):
+ grids = []
+ expanded_strides = []
+ strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
+
+ hsizes = [img_size[0] // stride for stride in strides]
+ wsizes = [img_size[1] // stride for stride in strides]
+
+ for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+ xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+ grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+ grids.append(grid)
+ shape = grid.shape[:2]
+ expanded_strides.append(np.full((*shape, 1), stride))
+
+ grids = np.concatenate(grids, 1)
+ expanded_strides = np.concatenate(expanded_strides, 1)
+ outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+ outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+ return outputs
+
+def preprocess(img, input_size, swap=(2, 0, 1)):
+ if len(img.shape) == 3:
+ padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+ else:
+ padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+ r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+ resized_img = cv2.resize(
+ img,
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
+ interpolation=cv2.INTER_LINEAR,
+ ).astype(np.uint8)
+ padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+ padded_img = padded_img.transpose(swap)
+ padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+ return padded_img, r
+
+def inference_detector(model, oriImg, detect_classes=[0]):
+ input_shape = (640,640)
+ img, ratio = preprocess(oriImg, input_shape)
+
+ device, dtype = next(model.parameters()).device, next(model.parameters()).dtype
+ input = img[None, :, :, :]
+ input = torch.from_numpy(input).to(device, dtype)
+
+ output = model(input).float().cpu().detach().numpy()
+ predictions = demo_postprocess(output[0], input_shape)
+
+ boxes = predictions[:, :4]
+ scores = predictions[:, 4:5] * predictions[:, 5:]
+
+ boxes_xyxy = np.ones_like(boxes)
+ boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
+ boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
+ boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
+ boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
+ boxes_xyxy /= ratio
+ dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
+ if dets is None:
+ return None
+ final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
+ isscore = final_scores > 0.3
+ iscat = np.isin(final_cls_inds, detect_classes)
+ isbbox = [i and j for (i, j) in zip(isscore, iscat)]
+ final_boxes = final_boxes[isbbox]
+ return final_boxes
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/jit_pose.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/jit_pose.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d297a8bbaef69acdd259b45a071f31e98019f8a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/dw_torchscript/jit_pose.py
@@ -0,0 +1,363 @@
+from typing import List, Tuple
+
+import cv2
+import numpy as np
+import torch
+
+def preprocess(
+ img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """Do preprocessing for DWPose model inference.
+
+ Args:
+ img (np.ndarray): Input image in shape (H, W, C).
+ out_bbox (list): Person bounding boxes in (x0, y0, x1, y1) format.
+ input_size (tuple): Model input size in shape (w, h).
+
+ Returns:
+ tuple:
+ - out_img (list[np.ndarray]): Preprocessed crop for each bbox.
+ - out_center (list[np.ndarray]): Center of each bbox.
+ - out_scale (list[np.ndarray]): Scale of each bbox.
+ """
+ # get shape of image
+ img_shape = img.shape[:2]
+ out_img, out_center, out_scale = [], [], []
+ if len(out_bbox) == 0:
+ out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
+ for i in range(len(out_bbox)):
+ x0 = out_bbox[i][0]
+ y0 = out_bbox[i][1]
+ x1 = out_bbox[i][2]
+ y1 = out_bbox[i][3]
+ bbox = np.array([x0, y0, x1, y1])
+
+ # get center and scale
+ center, scale = bbox_xyxy2cs(bbox, padding=1.25)
+
+ # do affine transformation
+ resized_img, scale = top_down_affine(input_size, scale, center, img)
+
+ # normalize image
+ mean = np.array([123.675, 116.28, 103.53])
+ std = np.array([58.395, 57.12, 57.375])
+ resized_img = (resized_img - mean) / std
+
+ out_img.append(resized_img)
+ out_center.append(center)
+ out_scale.append(scale)
+
+ return out_img, out_center, out_scale
+
+def inference(model, img, bs=5):
+ """Inference DWPose model implemented in TorchScript.
+
+ Args:
+ model : TorchScript Model.
+ img : Input image in shape.
+
+ Returns:
+ outputs : Output of DWPose model.
+ """
+ all_out = []
+ # build input
+ orig_img_count = len(img)
+ # Pad with zero images so the number of crops is a multiple of the batch size
+ for _ in range(bs - (orig_img_count % bs)):
+ img.append(np.zeros_like(img[0]))
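+ # (the TorchScript model is assumed to be traced with a fixed batch dimension `bs`;
+ # the outputs produced for this zero padding are discarded again by the
+ # [:orig_img_count] slice further down)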
+ input = np.stack(img, axis=0).transpose(0, 3, 1, 2)
+ device, dtype = next(model.parameters()).device, next(model.parameters()).dtype
+ input = torch.from_numpy(input).to(device, dtype)
+
+ out1, out2 = [], []
+ for i in range(input.shape[0] // bs):
+ curr_batch_output = model(input[i*bs:(i+1)*bs])
+ out1.append(curr_batch_output[0].float())
+ out2.append(curr_batch_output[1].float())
+ out1, out2 = torch.cat(out1, dim=0)[:orig_img_count], torch.cat(out2, dim=0)[:orig_img_count]
+ out1, out2 = out1.float().cpu().detach().numpy(), out2.float().cpu().detach().numpy()
+ all_outputs = out1, out2
+
+ for batch_idx in range(len(all_outputs[0])):
+ outputs = [all_outputs[i][batch_idx:batch_idx+1,...] for i in range(len(all_outputs))]
+ all_out.append(outputs)
+ return all_out
+
+
+def postprocess(outputs: List[np.ndarray],
+ model_input_size: Tuple[int, int],
+ center: Tuple[int, int],
+ scale: Tuple[int, int],
+ simcc_split_ratio: float = 2.0
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ """Postprocess for DWPose model output.
+
+ Args:
+ outputs (np.ndarray): Output of RTMPose model.
+ model_input_size (tuple): RTMPose model Input image size.
+ center (tuple): Center of bbox in shape (x, y).
+ scale (tuple): Scale of bbox in shape (w, h).
+ simcc_split_ratio (float): Split ratio of simcc.
+
+ Returns:
+ tuple:
+ - keypoints (np.ndarray): Rescaled keypoints.
+ - scores (np.ndarray): Model predict scores.
+ """
+ all_key = []
+ all_score = []
+ for i in range(len(outputs)):
+ # use simcc to decode
+ simcc_x, simcc_y = outputs[i]
+ keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
+
+ # rescale keypoints
+ keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
+ all_key.append(keypoints[0])
+ all_score.append(scores[0])
+
+ return np.array(all_key), np.array(all_score)
+
+
+def bbox_xyxy2cs(bbox: np.ndarray,
+ padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
+ """Transform the bbox format from (x,y,w,h) into (center, scale)
+
+ Args:
+ bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
+ as (left, top, right, bottom)
+ padding (float): BBox padding factor that the scale is multiplied by.
+ Default: 1.0
+
+ Returns:
+ tuple: A tuple containing center and scale.
+ - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
+ (n, 2)
+ - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
+ (n, 2)
+ """
+ # convert single bbox from (4, ) to (1, 4)
+ dim = bbox.ndim
+ if dim == 1:
+ bbox = bbox[None, :]
+
+ # get bbox center and scale
+ x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
+ center = np.hstack([x1 + x2, y1 + y2]) * 0.5
+ scale = np.hstack([x2 - x1, y2 - y1]) * padding
+
+ if dim == 1:
+ center = center[0]
+ scale = scale[0]
+
+ return center, scale
+
+
+def _fix_aspect_ratio(bbox_scale: np.ndarray,
+ aspect_ratio: float) -> np.ndarray:
+ """Extend the scale to match the given aspect ratio.
+
+ Args:
+ scale (np.ndarray): The image scale (w, h) in shape (2, )
+ aspect_ratio (float): The ratio of ``w/h``
+
+ Returns:
+ np.ndarray: The reshaped image scale in (2, )
+ """
+ w, h = np.hsplit(bbox_scale, [1])
+ bbox_scale = np.where(w > h * aspect_ratio,
+ np.hstack([w, w / aspect_ratio]),
+ np.hstack([h * aspect_ratio, h]))
+ return bbox_scale
+
+
+def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
+ """Rotate a point by an angle.
+
+ Args:
+ pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
+ angle_rad (float): rotation angle in radian
+
+ Returns:
+ np.ndarray: Rotated point in shape (2, )
+ """
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
+ rot_mat = np.array([[cs, -sn], [sn, cs]])
+ return rot_mat @ pt
+
+
+def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+ """To calculate the affine matrix, three pairs of points are required. This
+ function is used to get the 3rd point, given 2D points a & b.
+
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
+ anticlockwise, using b as the rotation center.
+
+ Args:
+ a (np.ndarray): The 1st point (x,y) in shape (2, )
+ b (np.ndarray): The 2nd point (x,y) in shape (2, )
+
+ Returns:
+ np.ndarray: The 3rd point.
+ """
+ direction = a - b
+ c = b + np.r_[-direction[1], direction[0]]
+ return c
+
+
+def get_warp_matrix(center: np.ndarray,
+ scale: np.ndarray,
+ rot: float,
+ output_size: Tuple[int, int],
+ shift: Tuple[float, float] = (0., 0.),
+ inv: bool = False) -> np.ndarray:
+ """Calculate the affine transformation matrix that can warp the bbox area
+ in the input image to the output size.
+
+ Args:
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
+ scale (np.ndarray[2, ]): Scale of the bounding box
+ wrt [width, height].
+ rot (float): Rotation angle (degree).
+ output_size (np.ndarray[2, ] | list(2,)): Size of the
+ destination heatmaps.
+ shift (0-100%): Shift translation ratio wrt the width/height.
+ Default (0., 0.).
+ inv (bool): Option to inverse the affine transform direction.
+ (inv=False: src->dst or inv=True: dst->src)
+
+ Returns:
+ np.ndarray: A 2x3 transformation matrix
+ """
+ shift = np.array(shift)
+ src_w = scale[0]
+ dst_w = output_size[0]
+ dst_h = output_size[1]
+
+ # compute transformation matrix
+ rot_rad = np.deg2rad(rot)
+ src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
+ dst_dir = np.array([0., dst_w * -0.5])
+
+ # get three reference points of the src rectangle in the original image
+ src = np.zeros((3, 2), dtype=np.float32)
+ src[0, :] = center + scale * shift
+ src[1, :] = center + src_dir + scale * shift
+ src[2, :] = _get_3rd_point(src[0, :], src[1, :])
+
+ # get three reference points of the dst rectangle in the model-input image
+ dst = np.zeros((3, 2), dtype=np.float32)
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+ dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
+
+ if inv:
+ warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+ else:
+ warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+ return warp_mat
+
+
+def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
+ img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """Get the bbox image as the model input by affine transform.
+
+ Args:
+ input_size (dict): The input size of the model.
+ bbox_scale (dict): The bbox scale of the img.
+ bbox_center (dict): The bbox center of the img.
+ img (np.ndarray): The original image.
+
+ Returns:
+ tuple: A tuple containing the transformed image and bbox scale.
+ - np.ndarray[float32]: img after affine transform.
+ - np.ndarray[float32]: bbox scale after affine transform.
+ """
+ w, h = input_size
+ warp_size = (int(w), int(h))
+
+ # reshape bbox to fixed aspect ratio
+ bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
+
+ # get the affine matrix
+ center = bbox_center
+ scale = bbox_scale
+ rot = 0
+ warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
+
+ # do affine transform
+ img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
+
+ return img, bbox_scale
+
+
+def get_simcc_maximum(simcc_x: np.ndarray,
+ simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """Get maximum response location and value from simcc representations.
+
+ Note:
+ instance number: N
+ num_keypoints: K
+ heatmap height: H
+ heatmap width: W
+
+ Args:
+ simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
+ simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
+
+ Returns:
+ tuple:
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
+ (K, 2) or (N, K, 2)
+ - vals (np.ndarray): values of maximum heatmap responses in shape
+ (K,) or (N, K)
+ """
+ N, K, Wx = simcc_x.shape
+ simcc_x = simcc_x.reshape(N * K, -1)
+ simcc_y = simcc_y.reshape(N * K, -1)
+
+ # get maximum value locations
+ x_locs = np.argmax(simcc_x, axis=1)
+ y_locs = np.argmax(simcc_y, axis=1)
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
+ max_val_x = np.amax(simcc_x, axis=1)
+ max_val_y = np.amax(simcc_y, axis=1)
+
+ # get maximum value across x and y axis
+ mask = max_val_x > max_val_y
+ max_val_x[mask] = max_val_y[mask]
+ vals = max_val_x
+ locs[vals <= 0.] = -1
+
+ # reshape
+ locs = locs.reshape(N, K, 2)
+ vals = vals.reshape(N, K)
+
+ return locs, vals
+
+
+def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
+ simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
+ """Modulate simcc distribution with Gaussian.
+
+ Args:
+ simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
+ simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
+ simcc_split_ratio (int): The split ratio of simcc.
+
+ Returns:
+ tuple: A tuple containing keypoints and scores.
+ - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
+ - np.ndarray[float32]: scores in shape (K,) or (n, K)
+ """
+ keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
+ keypoints /= simcc_split_ratio
+
+ return keypoints, scores
+
+def inference_pose(model, out_bbox, oriImg, model_input_size=(288, 384)):
+ resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
+ #outputs = inference(session, resized_img, dtype)
+ outputs = inference(model, resized_img)
+
+ keypoints, scores = postprocess(outputs, model_input_size, center, scale)
+
+ return keypoints, scores
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/face.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/face.py
new file mode 100644
index 0000000000000000000000000000000000000000..c294fbd5b112e9ca51bcaaa1a97be0ba6ccfb024
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/face.py
@@ -0,0 +1,362 @@
+import logging
+import numpy as np
+from torchvision.transforms import ToTensor, ToPILImage
+import torch
+import torch.nn.functional as F
+import cv2
+
+from . import util
+from torch.nn import Conv2d, Module, ReLU, MaxPool2d, init
+
+
+class FaceNet(Module):
+ """Model the cascading heatmaps. """
+ def __init__(self):
+ super(FaceNet, self).__init__()
+ # cnn to make feature map
+ self.relu = ReLU()
+ self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
+ self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
+ kernel_size=3, stride=1, padding=1)
+ self.conv1_2 = Conv2d(
+ in_channels=64, out_channels=64, kernel_size=3, stride=1,
+ padding=1)
+ self.conv2_1 = Conv2d(
+ in_channels=64, out_channels=128, kernel_size=3, stride=1,
+ padding=1)
+ self.conv2_2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_1 = Conv2d(
+ in_channels=128, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_2 = Conv2d(
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_3 = Conv2d(
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_4 = Conv2d(
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_1 = Conv2d(
+ in_channels=256, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_2 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_3 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_4 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv5_1 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv5_2 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv5_3_CPM = Conv2d(
+ in_channels=512, out_channels=128, kernel_size=3, stride=1,
+ padding=1)
+
+ # stage1
+ self.conv6_1_CPM = Conv2d(
+ in_channels=128, out_channels=512, kernel_size=1, stride=1,
+ padding=0)
+ self.conv6_2_CPM = Conv2d(
+ in_channels=512, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage2
+ self.Mconv1_stage2 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage2 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage3
+ self.Mconv1_stage3 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage3 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage4
+ self.Mconv1_stage4 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage4 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage5
+ self.Mconv1_stage5 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage5 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage6
+ self.Mconv1_stage6 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage6 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ for m in self.modules():
+ if isinstance(m, Conv2d):
+ init.constant_(m.bias, 0)
+
+ def forward(self, x):
+ """Return a list of heatmaps."""
+ heatmaps = []
+
+ h = self.relu(self.conv1_1(x))
+ h = self.relu(self.conv1_2(h))
+ h = self.max_pooling_2d(h)
+ h = self.relu(self.conv2_1(h))
+ h = self.relu(self.conv2_2(h))
+ h = self.max_pooling_2d(h)
+ h = self.relu(self.conv3_1(h))
+ h = self.relu(self.conv3_2(h))
+ h = self.relu(self.conv3_3(h))
+ h = self.relu(self.conv3_4(h))
+ h = self.max_pooling_2d(h)
+ h = self.relu(self.conv4_1(h))
+ h = self.relu(self.conv4_2(h))
+ h = self.relu(self.conv4_3(h))
+ h = self.relu(self.conv4_4(h))
+ h = self.relu(self.conv5_1(h))
+ h = self.relu(self.conv5_2(h))
+ h = self.relu(self.conv5_3_CPM(h))
+ feature_map = h
+
+ # stage1
+ h = self.relu(self.conv6_1_CPM(h))
+ h = self.conv6_2_CPM(h)
+ heatmaps.append(h)
+
+ # stage2
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage2(h))
+ h = self.relu(self.Mconv2_stage2(h))
+ h = self.relu(self.Mconv3_stage2(h))
+ h = self.relu(self.Mconv4_stage2(h))
+ h = self.relu(self.Mconv5_stage2(h))
+ h = self.relu(self.Mconv6_stage2(h))
+ h = self.Mconv7_stage2(h)
+ heatmaps.append(h)
+
+ # stage3
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage3(h))
+ h = self.relu(self.Mconv2_stage3(h))
+ h = self.relu(self.Mconv3_stage3(h))
+ h = self.relu(self.Mconv4_stage3(h))
+ h = self.relu(self.Mconv5_stage3(h))
+ h = self.relu(self.Mconv6_stage3(h))
+ h = self.Mconv7_stage3(h)
+ heatmaps.append(h)
+
+ # stage4
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage4(h))
+ h = self.relu(self.Mconv2_stage4(h))
+ h = self.relu(self.Mconv3_stage4(h))
+ h = self.relu(self.Mconv4_stage4(h))
+ h = self.relu(self.Mconv5_stage4(h))
+ h = self.relu(self.Mconv6_stage4(h))
+ h = self.Mconv7_stage4(h)
+ heatmaps.append(h)
+
+ # stage5
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage5(h))
+ h = self.relu(self.Mconv2_stage5(h))
+ h = self.relu(self.Mconv3_stage5(h))
+ h = self.relu(self.Mconv4_stage5(h))
+ h = self.relu(self.Mconv5_stage5(h))
+ h = self.relu(self.Mconv6_stage5(h))
+ h = self.Mconv7_stage5(h)
+ heatmaps.append(h)
+
+ # stage6
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage6(h))
+ h = self.relu(self.Mconv2_stage6(h))
+ h = self.relu(self.Mconv3_stage6(h))
+ h = self.relu(self.Mconv4_stage6(h))
+ h = self.relu(self.Mconv5_stage6(h))
+ h = self.relu(self.Mconv6_stage6(h))
+ h = self.Mconv7_stage6(h)
+ heatmaps.append(h)
+
+ return heatmaps
+
+
+LOG = logging.getLogger(__name__)
+TOTEN = ToTensor()
+TOPIL = ToPILImage()
+
+
+params = {
+ 'gaussian_sigma': 2.5,
+ 'inference_img_size': 736, # 368, 736, 1312
+ 'heatmap_peak_thresh': 0.1,
+ 'crop_scale': 1.5,
+ 'line_indices': [
+ [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
+ [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
+ [13, 14], [14, 15], [15, 16],
+ [17, 18], [18, 19], [19, 20], [20, 21],
+ [22, 23], [23, 24], [24, 25], [25, 26],
+ [27, 28], [28, 29], [29, 30],
+ [31, 32], [32, 33], [33, 34], [34, 35],
+ [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
+ [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
+ [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
+ [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
+ [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
+ [66, 67], [67, 60]
+ ],
+}
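+
+# `line_indices` encodes the standard 68-point facial landmark connectivity (jaw line,
+# brows, nose, eyes, mouth). The FaceNet above outputs 71 heatmap channels, presumably
+# the 70 OpenPose face keypoints (68 landmarks plus two pupil points) plus a background
+# channel.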
+
+
+class Face(object):
+ """
+ The OpenPose face landmark detector model.
+
+ Args:
+ inference_size: size of the inference image, suggested:
+ 368, 736, 1312, default 736
+ gaussian_sigma: blur the heatmaps, default 2.5
+ heatmap_peak_thresh: return landmark if over threshold, default 0.1
+
+ """
+ def __init__(self, face_model_path,
+ inference_size=None,
+ gaussian_sigma=None,
+ heatmap_peak_thresh=None):
+ self.inference_size = inference_size or params["inference_img_size"]
+ self.sigma = gaussian_sigma or params['gaussian_sigma']
+ self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
+ self.model = FaceNet()
+ self.model.load_state_dict(torch.load(face_model_path))
+ # if torch.cuda.is_available():
+ # self.model = self.model.cuda()
+ # print('cuda')
+ self.model.eval()
+
+ def __call__(self, face_img):
+ H, W, C = face_img.shape
+
+ w_size = 384
+ x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
+
+ x_data = x_data.to(self.cn_device)
+
+ with torch.no_grad():
+ hs = self.model(x_data[None, ...])
+ heatmaps = F.interpolate(
+ hs[-1],
+ (H, W),
+ mode='bilinear', align_corners=True).cpu().numpy()[0]
+ return heatmaps
+
+ def compute_peaks_from_heatmaps(self, heatmaps):
+ all_peaks = []
+ for part in range(heatmaps.shape[0]):
+ map_ori = heatmaps[part].copy()
+ binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
+
+ if np.sum(binary) == 0:
+ continue
+
+ positions = np.where(binary > 0.5)
+ intensities = map_ori[positions]
+ mi = np.argmax(intensities)
+ y, x = positions[0][mi], positions[1][mi]
+ all_peaks.append([x, y])
+
+ return np.array(all_peaks)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/hand.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/hand.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ea28f3cb1baf845086aad51d970fb89c00fc6c5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/hand.py
@@ -0,0 +1,94 @@
+import cv2
+import json
+import numpy as np
+import math
+import time
+from scipy.ndimage import gaussian_filter
+import matplotlib.pyplot as plt
+import matplotlib
+import torch
+from skimage.measure import label
+
+from .model import handpose_model
+from . import util
+
+class Hand(object):
+ def __init__(self, model_path):
+ self.model = handpose_model()
+ # if torch.cuda.is_available():
+ # self.model = self.model.cuda()
+ # print('cuda')
+ model_dict = util.transfer(self.model, torch.load(model_path))
+ self.model.load_state_dict(model_dict)
+ self.model.eval()
+
+ def __call__(self, oriImgRaw):
+ scale_search = [0.5, 1.0, 1.5, 2.0]
+ # scale_search = [0.5]
+ boxsize = 368
+ stride = 8
+ padValue = 128
+ thre = 0.05
+ multiplier = [x * boxsize for x in scale_search]
+
+ wsize = 128
+ heatmap_avg = np.zeros((wsize, wsize, 22))
+
+ Hr, Wr, Cr = oriImgRaw.shape
+
+ oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)
+
+ for m in range(len(multiplier)):
+ scale = multiplier[m]
+ imageToTest = util.smart_resize(oriImg, (scale, scale))
+
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+ im = np.ascontiguousarray(im)
+
+ data = torch.from_numpy(im).float()
+ if torch.cuda.is_available():
+ data = data.cuda()
+
+ with torch.no_grad():
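+ # `cn_device` is assumed to be set on this object by the caller before
+ # inference; this transfer supersedes the .cuda() call above.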
+ data = data.to(self.cn_device)
+ output = self.model(data).cpu().numpy()
+
+ # extract outputs, resize, and remove padding
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ heatmap = util.smart_resize(heatmap, (wsize, wsize))
+
+ heatmap_avg += heatmap / len(multiplier)
+
+ all_peaks = []
+ for part in range(21):
+ map_ori = heatmap_avg[:, :, part]
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
+
+ if np.sum(binary) == 0:
+ all_peaks.append([0, 0])
+ continue
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
+ label_img[label_img != max_index] = 0
+ map_ori[label_img == 0] = 0
+
+ y, x = util.npmax(map_ori)
+ y = int(float(y) * float(Hr) / float(wsize))
+ x = int(float(x) * float(Wr) / float(wsize))
+ all_peaks.append([x, y])
+ return np.array(all_peaks)
+
+if __name__ == "__main__":
+ hand_estimation = Hand('../model/hand_pose_model.pth')
+
+ # test_image = '../images/hand.jpg'
+ test_image = '../images/hand.jpg'
+ oriImg = cv2.imread(test_image) # B,G,R order
+ peaks = hand_estimation(oriImg)
+ canvas = util.draw_handpose(oriImg, peaks, True)
+ cv2.imshow('', canvas)
+ cv2.waitKey(0)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f066d820ac7df36e39722da57932acd2ba01925
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/model.py
@@ -0,0 +1,218 @@
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+
+def make_layers(block, no_relu_layers):
+ layers = []
+ for layer_name, v in block.items():
+ if 'pool' in layer_name:
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
+ padding=v[2])
+ layers.append((layer_name, layer))
+ else:
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
+ kernel_size=v[2], stride=v[3],
+ padding=v[4])
+ layers.append((layer_name, conv2d))
+ if layer_name not in no_relu_layers:
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
+
+ return nn.Sequential(OrderedDict(layers))
+
+class bodypose_model(nn.Module):
+ def __init__(self):
+ super(bodypose_model, self).__init__()
+
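+ # Two-branch multi-stage CPM: branch L1 predicts 38 Part Affinity Field
+ # channels and branch L2 predicts 19 keypoint heatmaps; stages 2-6 refine
+ # both branches on features concatenated with the block0 backbone output.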
+ # these layers have no relu layer
+ no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2']
+ blocks = {}
+ block0 = OrderedDict([
+ ('conv1_1', [3, 64, 3, 1, 1]),
+ ('conv1_2', [64, 64, 3, 1, 1]),
+ ('pool1_stage1', [2, 2, 0]),
+ ('conv2_1', [64, 128, 3, 1, 1]),
+ ('conv2_2', [128, 128, 3, 1, 1]),
+ ('pool2_stage1', [2, 2, 0]),
+ ('conv3_1', [128, 256, 3, 1, 1]),
+ ('conv3_2', [256, 256, 3, 1, 1]),
+ ('conv3_3', [256, 256, 3, 1, 1]),
+ ('conv3_4', [256, 256, 3, 1, 1]),
+ ('pool3_stage1', [2, 2, 0]),
+ ('conv4_1', [256, 512, 3, 1, 1]),
+ ('conv4_2', [512, 512, 3, 1, 1]),
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
+ ])
+
+
+ # Stage 1
+ block1_1 = OrderedDict([
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
+ ])
+
+ block1_2 = OrderedDict([
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
+ ])
+ blocks['block1_1'] = block1_1
+ blocks['block1_2'] = block1_2
+
+ self.model0 = make_layers(block0, no_relu_layers)
+
+ # Stages 2 - 6
+ for i in range(2, 7):
+ blocks['block%d_1' % i] = OrderedDict([
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
+ ])
+
+ blocks['block%d_2' % i] = OrderedDict([
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
+ ])
+
+ for k in blocks.keys():
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+ self.model1_1 = blocks['block1_1']
+ self.model2_1 = blocks['block2_1']
+ self.model3_1 = blocks['block3_1']
+ self.model4_1 = blocks['block4_1']
+ self.model5_1 = blocks['block5_1']
+ self.model6_1 = blocks['block6_1']
+
+ self.model1_2 = blocks['block1_2']
+ self.model2_2 = blocks['block2_2']
+ self.model3_2 = blocks['block3_2']
+ self.model4_2 = blocks['block4_2']
+ self.model5_2 = blocks['block5_2']
+ self.model6_2 = blocks['block6_2']
+
+
+ def forward(self, x):
+
+ out1 = self.model0(x)
+
+ out1_1 = self.model1_1(out1)
+ out1_2 = self.model1_2(out1)
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
+
+ out2_1 = self.model2_1(out2)
+ out2_2 = self.model2_2(out2)
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
+
+ out3_1 = self.model3_1(out3)
+ out3_2 = self.model3_2(out3)
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
+
+ out4_1 = self.model4_1(out4)
+ out4_2 = self.model4_2(out4)
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
+
+ out5_1 = self.model5_1(out5)
+ out5_2 = self.model5_2(out5)
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
+
+ out6_1 = self.model6_1(out6)
+ out6_2 = self.model6_2(out6)
+
+ return out6_1, out6_2
+
+class handpose_model(nn.Module):
+ def __init__(self):
+ super(handpose_model, self).__init__()
+
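+ # Single-branch CPM for hands: every stage outputs 22 heatmaps
+ # (21 hand keypoints plus background), refined over stages 2-6.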
+ # these layers have no relu layer
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
+ # stage 1
+ block1_0 = OrderedDict([
+ ('conv1_1', [3, 64, 3, 1, 1]),
+ ('conv1_2', [64, 64, 3, 1, 1]),
+ ('pool1_stage1', [2, 2, 0]),
+ ('conv2_1', [64, 128, 3, 1, 1]),
+ ('conv2_2', [128, 128, 3, 1, 1]),
+ ('pool2_stage1', [2, 2, 0]),
+ ('conv3_1', [128, 256, 3, 1, 1]),
+ ('conv3_2', [256, 256, 3, 1, 1]),
+ ('conv3_3', [256, 256, 3, 1, 1]),
+ ('conv3_4', [256, 256, 3, 1, 1]),
+ ('pool3_stage1', [2, 2, 0]),
+ ('conv4_1', [256, 512, 3, 1, 1]),
+ ('conv4_2', [512, 512, 3, 1, 1]),
+ ('conv4_3', [512, 512, 3, 1, 1]),
+ ('conv4_4', [512, 512, 3, 1, 1]),
+ ('conv5_1', [512, 512, 3, 1, 1]),
+ ('conv5_2', [512, 512, 3, 1, 1]),
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
+ ])
+
+ block1_1 = OrderedDict([
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
+ ])
+
+ blocks = {}
+ blocks['block1_0'] = block1_0
+ blocks['block1_1'] = block1_1
+
+ # stage 2-6
+ for i in range(2, 7):
+ blocks['block%d' % i] = OrderedDict([
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
+ ])
+
+ for k in blocks.keys():
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+ self.model1_0 = blocks['block1_0']
+ self.model1_1 = blocks['block1_1']
+ self.model2 = blocks['block2']
+ self.model3 = blocks['block3']
+ self.model4 = blocks['block4']
+ self.model5 = blocks['block5']
+ self.model6 = blocks['block6']
+
+ def forward(self, x):
+ out1_0 = self.model1_0(x)
+ out1_1 = self.model1_1(out1_0)
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
+ out_stage2 = self.model2(concat_stage2)
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
+ out_stage3 = self.model3(concat_stage3)
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
+ out_stage4 = self.model4(concat_stage4)
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
+ out_stage5 = self.model5(concat_stage5)
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
+ out_stage6 = self.model6(concat_stage6)
+ return out_stage6
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/types.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8641216609cc2d5da436835158282a24cd5b11a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/types.py
@@ -0,0 +1,30 @@
+from typing import NamedTuple, List, Optional
+
+class Keypoint(NamedTuple):
+ x: float
+ y: float
+ score: float = 1.0
+ id: int = -1
+
+
+class BodyResult(NamedTuple):
+ # Note: `Optional` is used instead of the `|` operator because the latter is
+ # a Python 3.10 feature.
+ # Annotator code should remain Python 3.8 compatible, as the ControlNet repo
+ # uses a Python 3.8 environment.
+ # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
+ keypoints: List[Optional[Keypoint]]
+ total_score: float = 0.0
+ total_parts: int = 0
+
+
+HandResult = List[Keypoint]
+FaceResult = List[Keypoint]
+AnimalPoseResult = List[Keypoint]
+
+
+class PoseResult(NamedTuple):
+ body: BodyResult
+ left_hand: Optional[HandResult]
+ right_hand: Optional[HandResult]
+ face: Optional[FaceResult]
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2b246bb1a8041d4303cebb0e35cd35bfb2fbaf3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/util.py
@@ -0,0 +1,466 @@
+import math
+import numpy as np
+import matplotlib
+import cv2
+import os
+from typing import List, Tuple, Union, Optional
+
+from .body import BodyResult, Keypoint
+
+eps = 0.01
+
+
+def smart_resize(x, s):
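+ # Resize to (Ht, Wt) using INTER_AREA when shrinking and LANCZOS4 when
+ # enlarging; inputs that are not 1- or 3-channel are resized channel by channel.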
+ Ht, Wt = s
+ if x.ndim == 2:
+ Ho, Wo = x.shape
+ Co = 1
+ else:
+ Ho, Wo, Co = x.shape
+ if Co == 3 or Co == 1:
+ k = float(Ht + Wt) / float(Ho + Wo)
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
+ else:
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
+
+
+def smart_resize_k(x, fx, fy):
+ if x.ndim == 2:
+ Ho, Wo = x.shape
+ Co = 1
+ else:
+ Ho, Wo, Co = x.shape
+ Ht, Wt = Ho * fy, Wo * fx
+ if Co == 3 or Co == 1:
+ k = float(Ht + Wt) / float(Ho + Wo)
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
+ else:
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
+
+
+def padRightDownCorner(img, stride, padValue):
+ h = img.shape[0]
+ w = img.shape[1]
+
+ pad = 4 * [None]
+ pad[0] = 0 # up
+ pad[1] = 0 # left
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
+
+ img_padded = img
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
+
+ return img_padded, pad
+
+
+def transfer(model, model_weights):
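+ # Remap checkpoint keys by dropping their first dotted component so they
+ # line up with this model's state_dict names.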
+ transfered_model_weights = {}
+ for weights_name in model.state_dict().keys():
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
+ return transfered_model_weights
+
+
+def is_normalized(keypoints: List[Optional[Keypoint]]) -> bool:
+ point_normalized = [
+ 0 <= abs(k.x) <= 1 and 0 <= abs(k.y) <= 1
+ for k in keypoints
+ if k is not None
+ ]
+ if not point_normalized:
+ return False
+ return all(point_normalized)
+
+
+def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint], xinsr_stick_scaling: bool = False) -> np.ndarray:
+ """
+ Draw keypoints and limbs representing body pose on a given canvas.
+
+ Args:
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
+ keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.
+ xinsr_stick_scaling (bool): Whether to scale the stick width for the xinsr ControlNet
+
+ Returns:
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.
+
+ Note:
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
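+
+ Example (illustrative):
+ canvas = np.zeros((512, 512, 3), dtype=np.uint8)
+ canvas = draw_bodypose(canvas, body_result.keypoints)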
+ """
+ if not is_normalized(keypoints):
+ H, W = 1.0, 1.0
+ else:
+ H, W, _ = canvas.shape
+
+ CH, CW, _ = canvas.shape
+ stickwidth = 4
+
+ # Ref: https://huggingface.co/xinsir/controlnet-openpose-sdxl-1.0
+ max_side = max(CW, CH)
+ if xinsr_stick_scaling:
+ stick_scale = 1 if max_side < 500 else min(2 + (max_side // 1000), 7)
+ else:
+ stick_scale = 1
+
+ limbSeq = [
+ [2, 3], [2, 6], [3, 4], [4, 5],
+ [6, 7], [7, 8], [2, 9], [9, 10],
+ [10, 11], [2, 12], [12, 13], [13, 14],
+ [2, 1], [1, 15], [15, 17], [1, 16],
+ [16, 18],
+ ]
+
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
+
+ for (k1_index, k2_index), color in zip(limbSeq, colors):
+ keypoint1 = keypoints[k1_index - 1]
+ keypoint2 = keypoints[k2_index - 1]
+
+ if keypoint1 is None or keypoint2 is None:
+ continue
+
+ Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
+ X = np.array([keypoint1.y, keypoint2.y]) * float(H)
+ mX = np.mean(X)
+ mY = np.mean(Y)
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth*stick_scale), int(angle), 0, 360, 1)
+ cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])
+
+ for keypoint, color in zip(keypoints, colors):
+ if keypoint is None:
+ continue
+
+ x, y = keypoint.x, keypoint.y
+ x = int(x * W)
+ y = int(y * H)
+ cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)
+
+ return canvas
+
+
+def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
+ """
+ Draw keypoints and connections representing hand pose on a given canvas.
+
+ Args:
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
+ or None if no keypoints are present.
+
+ Returns:
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.
+
+ Note:
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
+ """
+ if not keypoints:
+ return canvas
+
+ if not is_normalized(keypoints):
+ H, W = 1.0, 1.0
+ else:
+ H, W, _ = canvas.shape
+
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
+
+ for ie, (e1, e2) in enumerate(edges):
+ k1 = keypoints[e1]
+ k2 = keypoints[e2]
+ if k1 is None or k2 is None:
+ continue
+
+ x1 = int(k1.x * W)
+ y1 = int(k1.y * H)
+ x2 = int(k2.x * W)
+ y2 = int(k2.y * H)
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
+
+ for keypoint in keypoints:
+ if keypoint is None:
+ continue
+
+ x, y = keypoint.x, keypoint.y
+ x = int(x * W)
+ y = int(y * H)
+ if x > eps and y > eps:
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
+ return canvas
+
+
+def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
+ """
+ Draw keypoints representing face pose on a given canvas.
+
+ Args:
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
+ or None if no keypoints are present.
+
+ Returns:
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.
+
+ Note:
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
+ """
+ if not keypoints:
+ return canvas
+
+ if not is_normalized(keypoints):
+ H, W = 1.0, 1.0
+ else:
+ H, W, _ = canvas.shape
+
+ for keypoint in keypoints:
+ if keypoint is None:
+ continue
+
+ x, y = keypoint.x, keypoint.y
+ x = int(x * W)
+ y = int(y * H)
+ if x > eps and y > eps:
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
+ return canvas
+
+
+# detect hand according to body pose keypoints
+# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
+def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
+ """
+ Detect hands in the input body pose keypoints and calculate the bounding box for each hand.
+
+ Args:
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
+
+ Returns:
+ List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
+ corner of the bounding box, the width (height) of the bounding box, and
+ a boolean flag indicating whether the hand is a left hand (True) or a
+ right hand (False).
+
+ Notes:
+ - The width and height of the bounding boxes are equal since the network requires squared input.
+ - The minimum bounding box size is 20 pixels.
+ """
+ ratioWristElbow = 0.33
+ detect_result = []
+ image_height, image_width = oriImg.shape[0:2]
+
+ keypoints = body.keypoints
+ # right hand: wrist 4, elbow 3, shoulder 2
+ # left hand: wrist 7, elbow 6, shoulder 5
+ left_shoulder = keypoints[5]
+ left_elbow = keypoints[6]
+ left_wrist = keypoints[7]
+ right_shoulder = keypoints[2]
+ right_elbow = keypoints[3]
+ right_wrist = keypoints[4]
+
+ # if any of three not detected
+ has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
+ has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
+ if not (has_left or has_right):
+ return []
+
+ hands = []
+ #left hand
+ if has_left:
+ hands.append([
+ left_shoulder.x, left_shoulder.y,
+ left_elbow.x, left_elbow.y,
+ left_wrist.x, left_wrist.y,
+ True
+ ])
+ # right hand
+ if has_right:
+ hands.append([
+ right_shoulder.x, right_shoulder.y,
+ right_elbow.x, right_elbow.y,
+ right_wrist.x, right_wrist.y,
+ False
+ ])
+
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
+ x = x3 + ratioWristElbow * (x3 - x2)
+ y = y3 + ratioWristElbow * (y3 - y2)
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
+ # x-y refers to the center --> offset to topLeft point
+ # handRectangle.x -= handRectangle.width / 2.f;
+ # handRectangle.y -= handRectangle.height / 2.f;
+ x -= width / 2
+ y -= width / 2 # width = height
+ # overflow the image
+ if x < 0: x = 0
+ if y < 0: y = 0
+ width1 = width
+ width2 = width
+ if x + width > image_width: width1 = image_width - x
+ if y + width > image_height: width2 = image_height - y
+ width = min(width1, width2)
+ # require a minimum hand box size of 20 pixels
+ if width >= 20:
+ detect_result.append((int(x), int(y), int(width), is_left))
+
+ '''
+ return value: [[x, y, w, True if left hand else False]].
+ width = height since the network requires a square input.
+ (x, y) is the coordinate of the top-left corner.
+ '''
+ return detect_result
+
+
+# Written by Lvmin
+def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
+ """
+ Detect the face in the input body pose keypoints and calculate the bounding box for the face.
+
+ Args:
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
+
+ Returns:
+ Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
+ bounding box and the width (height) of the bounding box, or None if the
+ face is not detected or the bounding box width is less than 20 pixels.
+
+ Notes:
+ - The width and height of the bounding box are equal.
+ - The minimum bounding box size is 20 pixels.
+ """
+ # left right eye ear 14 15 16 17
+ image_height, image_width = oriImg.shape[0:2]
+
+ keypoints = body.keypoints
+ head = keypoints[0]
+ left_eye = keypoints[14]
+ right_eye = keypoints[15]
+ left_ear = keypoints[16]
+ right_ear = keypoints[17]
+
+ if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
+ return None
+
+ width = 0.0
+ x0, y0 = head.x, head.y
+
+ if left_eye is not None:
+ x1, y1 = left_eye.x, left_eye.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 3.0)
+
+ if right_eye is not None:
+ x1, y1 = right_eye.x, right_eye.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 3.0)
+
+ if left_ear is not None:
+ x1, y1 = left_ear.x, left_ear.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 1.5)
+
+ if right_ear is not None:
+ x1, y1 = right_ear.x, right_ear.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 1.5)
+
+ x, y = x0, y0
+
+ x -= width
+ y -= width
+
+ if x < 0:
+ x = 0
+
+ if y < 0:
+ y = 0
+
+ width1 = width * 2
+ width2 = width * 2
+
+ if x + width > image_width:
+ width1 = image_width - x
+
+ if y + width > image_height:
+ width2 = image_height - y
+
+ width = min(width1, width2)
+
+ if width >= 20:
+ return int(x), int(y), int(width)
+ else:
+ return None
+
+
+# return the (row, column) index of the maximum value of a 2d array
+def npmax(array):
+ arrayindex = array.argmax(1)
+ arrayvalue = array.max(1)
+ i = arrayvalue.argmax()
+ j = arrayindex[i]
+ return i, j
+
+def guess_onnx_input_shape_dtype(filename):
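+ # Infer the model's expected input resolution and dtype from markers in its
+ # filename, e.g. (illustrative) "yolox_l.onnx" -> ((640, 640), float32) and
+ # "dw-ll_ucoco_384.onnx" -> ((288, 384), float32).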
+ dtype = np.float32
+ if "fp16" in filename:
+ dtype = np.float16
+ elif "int8" in filename:
+ dtype = np.uint8
+ input_size = (640, 640) if "yolo" in filename else (192, 256)
+ if "384" in filename:
+ input_size = (288, 384)
+ elif "256" in filename:
+ input_size = (256, 256)
+ return input_size, dtype
+
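+# AUX_ORT_PROVIDERS, if set, is a comma-separated list of onnxruntime execution
+# providers to prefer; otherwise the default preference order below is used.
+# Either way, get_ort_providers() keeps only the providers actually available.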
+if os.getenv('AUX_ORT_PROVIDERS'):
+ ONNX_PROVIDERS = os.getenv('AUX_ORT_PROVIDERS').split(',')
+else:
+ ONNX_PROVIDERS = ["CUDAExecutionProvider", "DirectMLExecutionProvider", "OpenVINOExecutionProvider", "ROCMExecutionProvider", "CPUExecutionProvider"]
+def get_ort_providers() -> List[str]:
+ providers = []
+ try:
+ import onnxruntime as ort
+ for provider in ONNX_PROVIDERS:
+ if provider in ort.get_available_providers():
+ providers.append(provider)
+ return providers
+ except:
+ return []
+
+def is_model_torchscript(model) -> bool:
+ return bool(type(model).__name__ == "RecursiveScriptModule")
+
+def get_model_type(Nodesname, filename) -> Optional[str]:
+ ort_providers = list(filter(lambda x : x != "CPUExecutionProvider", get_ort_providers()))
+ if filename is None:
+ return None
+ elif ("onnx" in filename) and ort_providers:
+ print(f"{Nodesname}: Caching ONNXRuntime session {filename}...")
+ return "ort"
+ elif ("onnx" in filename):
+ print(f"{Nodesname}: Caching OpenCV DNN module {filename} on cv2.DNN...")
+ return "cv2"
+ else:
+ print(f"{Nodesname}: Caching TorchScript module {filename} on ...")
+ return "torchscript"
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/wholebody.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/wholebody.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0e158e13d1ce41e28124893fc2241bfdb2a4e91
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/dwpose/wholebody.py
@@ -0,0 +1,167 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import cv2
+import numpy as np
+
+from .dw_onnx.cv_ox_det import inference_detector as inference_onnx_yolox
+from .dw_onnx.cv_ox_yolo_nas import inference_detector as inference_onnx_yolo_nas
+from .dw_onnx.cv_ox_pose import inference_pose as inference_onnx_pose
+
+from .dw_torchscript.jit_det import inference_detector as inference_jit_yolox
+from .dw_torchscript.jit_pose import inference_pose as inference_jit_pose
+
+from typing import List, Optional
+from .types import PoseResult, BodyResult, Keypoint
+from timeit import default_timer
+import os
+from custom_controlnet_aux.dwpose.util import guess_onnx_input_shape_dtype, get_model_type, get_ort_providers, is_model_torchscript
+import torch
+
+class Wholebody:
+ def __init__(self, det_model_path: Optional[str] = None, pose_model_path: Optional[str] = None, torchscript_device="cuda"):
+ self.det_filename = det_model_path and os.path.basename(det_model_path)
+ self.pose_filename = pose_model_path and os.path.basename(pose_model_path)
+ self.det, self.pose = None, None
+ # get_model_type returns one of: None, "ort", "cv2", "torchscript"
+ self.det_model_type = get_model_type("DWPose",self.det_filename)
+ self.pose_model_type = get_model_type("DWPose",self.pose_filename)
+ # The OpenCV DNN fallback always runs on CPU; using the CUDA backend/target
+ # would require building OpenCV from source (cmake) with CUDA support.
+ cv2_device = 'cpu'
+ cv2_backend = cv2.dnn.DNN_BACKEND_OPENCV if cv2_device == 'cpu' else cv2.dnn.DNN_BACKEND_CUDA
+ cv2_providers = cv2.dnn.DNN_TARGET_CPU if cv2_device == 'cpu' else cv2.dnn.DNN_TARGET_CUDA
+ ort_providers = get_ort_providers()
+
+ if self.det_model_type is None:
+ pass
+ elif self.det_model_type == "ort":
+ try:
+ import onnxruntime as ort
+ self.det = ort.InferenceSession(det_model_path, providers=ort_providers)
+ except:
+ print(f"Failed to load onnxruntime with {self.det.get_providers()}.\nPlease change EP_list in the config.yaml and restart ComfyUI")
+ self.det = ort.InferenceSession(det_model_path, providers=["CPUExecutionProvider"])
+ elif self.det_model_type == "cv2":
+ try:
+ self.det = cv2.dnn.readNetFromONNX(det_model_path)
+ self.det.setPreferableBackend(cv2_backend)
+ self.det.setPreferableTarget(cv2_providers)
+ except:
+ print("TopK operators may not work on your OpenCV, try use onnxruntime with CPUExecutionProvider")
+ try:
+ import onnxruntime as ort
+ self.det = ort.InferenceSession(det_model_path, providers=["CPUExecutionProvider"])
+ except:
+ print(f"Failed to load {det_model_path}, you can use other models instead")
+ else:
+ self.det = torch.jit.load(det_model_path)
+ self.det.to(torchscript_device)
+
+ if self.pose_model_type is None:
+ pass
+ elif self.pose_model_type == "ort":
+ try:
+ import onnxruntime as ort
+ self.pose = ort.InferenceSession(pose_model_path, providers=ort_providers)
+ except:
+ print(f"Failed to load onnxruntime with {self.pose.get_providers()}.\nPlease change EP_list in the config.yaml and restart ComfyUI")
+ self.pose = ort.InferenceSession(pose_model_path, providers=["CPUExecutionProvider"])
+ elif self.pose_model_type == "cv2":
+ self.pose = cv2.dnn.readNetFromONNX(pose_model_path)
+ self.pose.setPreferableBackend(cv2_backend)
+ self.pose.setPreferableTarget(cv2_providers)
+ else:
+ self.pose = torch.jit.load(pose_model_path)
+ self.pose.to(torchscript_device)
+
+ if self.pose_filename is not None:
+ self.pose_input_size, _ = guess_onnx_input_shape_dtype(self.pose_filename)
+
+ def __call__(self, oriImg) -> Optional[np.ndarray]:
+ #Sacrifice accurate time measurement for compatibility
+ det_start = default_timer()
+ if is_model_torchscript(self.det):
+ det_result = inference_jit_yolox(self.det, oriImg, detect_classes=[0])
+ else:
+ if "yolox" in self.det_filename:
+ det_result = inference_onnx_yolox(self.det, oriImg, detect_classes=[0], dtype=np.float32)
+ else:
+ #FP16 and INT8 YOLO NAS accept uint8 input
+ det_result = inference_onnx_yolo_nas(self.det, oriImg, detect_classes=[0], dtype=np.uint8)
+ print(f"DWPose: Bbox {((default_timer() - det_start) * 1000):.2f}ms")
+ if (det_result is None) or (det_result.shape[0] == 0):
+ return None
+
+ pose_start = default_timer()
+ if is_model_torchscript(self.pose):
+ keypoints, scores = inference_jit_pose(self.pose, det_result, oriImg, self.pose_input_size)
+ else:
+ _, pose_onnx_dtype = guess_onnx_input_shape_dtype(self.pose_filename)
+ keypoints, scores = inference_onnx_pose(self.pose, det_result, oriImg, self.pose_input_size, dtype=pose_onnx_dtype)
+ print(f"DWPose: Pose {((default_timer() - pose_start) * 1000):.2f}ms on {det_result.shape[0]} people\n")
+
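+ # Convert the MMPose/COCO ordering to the OpenPose BODY_18 layout: append the
+ # per-keypoint scores, synthesize a neck joint as the midpoint of the two
+ # shoulders, then permute the body indices.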
+ keypoints_info = np.concatenate(
+ (keypoints, scores[..., None]), axis=-1)
+ # compute neck joint
+ neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
+ # neck score when visualizing pred
+ neck[:, 2:4] = np.logical_and(
+ keypoints_info[:, 5, 2:4] > 0.3,
+ keypoints_info[:, 6, 2:4] > 0.3).astype(int)
+ new_keypoints_info = np.insert(
+ keypoints_info, 17, neck, axis=1)
+ mmpose_idx = [
+ 17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
+ ]
+ openpose_idx = [
+ 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
+ ]
+ new_keypoints_info[:, openpose_idx] = \
+ new_keypoints_info[:, mmpose_idx]
+ keypoints_info = new_keypoints_info
+
+ return keypoints_info
+
+ @staticmethod
+ def format_result(keypoints_info: Optional[np.ndarray]) -> List[PoseResult]:
+ def format_keypoint_part(
+ part: np.ndarray,
+ ) -> Optional[List[Optional[Keypoint]]]:
+ keypoints = [
+ Keypoint(x, y, score, i) if score >= 0.3 else None
+ for i, (x, y, score) in enumerate(part)
+ ]
+ return (
+ None if all(keypoint is None for keypoint in keypoints) else keypoints
+ )
+
+ def total_score(keypoints: Optional[List[Optional[Keypoint]]]) -> float:
+ return (
+ sum(keypoint.score for keypoint in keypoints if keypoint is not None)
+ if keypoints is not None
+ else 0.0
+ )
+
+ pose_results = []
+ if keypoints_info is None:
+ return pose_results
+
+ for instance in keypoints_info:
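+ # After the neck insertion each instance holds 134 points laid out as:
+ # [0:18] body, [18:24] feet, [24:92] face (68), [92:113] left hand,
+ # [113:134] right hand.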
+ body_keypoints = format_keypoint_part(instance[:18]) or ([None] * 18)
+ left_hand = format_keypoint_part(instance[92:113])
+ right_hand = format_keypoint_part(instance[113:134])
+ face = format_keypoint_part(instance[24:92])
+
+ # The OpenPose face layout has 70 points in total, while DWPose only
+ # provides 68, so pad the last two with the body's eye keypoints.
+ if face is not None:
+ # left eye
+ face.append(body_keypoints[14])
+ # right eye
+ face.append(body_keypoints[15])
+
+ body = BodyResult(
+ body_keypoints, total_score(body_keypoints), len(body_keypoints)
+ )
+ pose_results.append(PoseResult(body, left_hand, right_hand, face))
+
+ return pose_results
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/hed/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/hed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4a3db0d1cd7d8ebafefca05c6261bc8d250a16b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/hed/__init__.py
@@ -0,0 +1,110 @@
+# This is an improved version and model of HED edge detection, released under the Apache License, Version 2.0.
+# Please use this implementation in your products.
+# This implementation may produce slightly different results from Saining Xie's official implementation,
+# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
+# Unlike the official models and other implementations, this is an RGB-input model (rather than BGR),
+# which makes it work better with gradio's RGB protocol.
+
+import os
+import warnings
+
+import cv2
+import numpy as np
+import torch
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, nms, resize_image_with_pad, safe_step, common_input_validate, custom_hf_download, HF_MODEL_NAME
+
+
+class DoubleConvBlock(torch.nn.Module):
+ def __init__(self, input_channel, output_channel, layer_number):
+ super().__init__()
+ self.convs = torch.nn.Sequential()
+ self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
+ for i in range(1, layer_number):
+ self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
+ self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)
+
+ def __call__(self, x, down_sampling=False):
+ h = x
+ if down_sampling:
+ h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2))
+ for conv in self.convs:
+ h = conv(h)
+ h = torch.nn.functional.relu(h)
+ return h, self.projection(h)
+
+
+class ControlNetHED_Apache2(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
+ self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2)
+ self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2)
+ self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3)
+ self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3)
+ self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3)
+
+ def __call__(self, x):
+ h = x - self.norm
+ h, projection1 = self.block1(h)
+ h, projection2 = self.block2(h, down_sampling=True)
+ h, projection3 = self.block3(h, down_sampling=True)
+ h, projection4 = self.block4(h, down_sampling=True)
+ h, projection5 = self.block5(h, down_sampling=True)
+ return projection1, projection2, projection3, projection4, projection5
+
+class HEDdetector:
+ def __init__(self, netNetwork):
+ self.netNetwork = netNetwork
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="ControlNetHED.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ netNetwork = ControlNetHED_Apache2()
+ netNetwork.load_state_dict(torch.load(model_path, map_location='cpu'))
+ netNetwork.float().eval()
+
+ return cls(netNetwork)
+
+ def to(self, device):
+ self.netNetwork.to(device)
+ self.device = device
+ return self
+
+
+ def __call__(self, input_image, detect_resolution=512, safe=False, output_type="pil", scribble=False, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ assert input_image.ndim == 3
+ H, W, C = input_image.shape
+ with torch.no_grad():
+ image_hed = torch.from_numpy(input_image).float().to(self.device)
+ image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
+ edges = self.netNetwork(image_hed)
+ edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
+ edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
+ edges = np.stack(edges, axis=2)
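+ # Fuse the five side outputs by averaging, then squash to (0, 1) with a sigmoid.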
+ edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
+ if safe:
+ edge = safe_step(edge)
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = edge
+
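+ # Optional "scribble" post-process: thin the edge map with NMS, blur it, then
+ # binarize so only strong strokes remain.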
+ if scribble:
+ detected_map = nms(detected_map, 127, 3.0)
+ detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
+ detected_map[detected_map > 4] = 255
+ detected_map[detected_map < 255] = 0
+
+ detected_map = HWC3(remove_pad(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
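+
+# Illustrative usage (assumes the default HuggingFace checkpoint is reachable):
+# hed = HEDdetector.from_pretrained().to("cpu")
+# edge_map = hed(input_image, detect_resolution=512, scribble=False)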
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..16f99943bf5ddd1e9e2adcb0169360cc7e31c037
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/__init__.py
@@ -0,0 +1,93 @@
+import os
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME
+from .leres.depthmap import estimateboost, estimateleres
+from .leres.multi_depth_model_woauxi import RelDepthModel
+from .leres.net_tools import strip_prefix_if_present
+from .pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel
+from .pix2pix.options.test_options import TestOptions
+
+
+class LeresDetector:
+ def __init__(self, model, pix2pixmodel):
+ self.model = model
+ self.pix2pixmodel = pix2pixmodel
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="res101.pth", pix2pix_filename="latest_net_G.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+ checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
+
+ model = RelDepthModel(backbone='resnext101')
+ model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True)
+ del checkpoint
+
+ pix2pix_model_path = custom_hf_download(pretrained_model_or_path, pix2pix_filename)
+
+ opt = TestOptions().parse()
+ if not torch.cuda.is_available():
+ opt.gpu_ids = [] # cpu mode
+ pix2pixmodel = Pix2Pix4DepthModel(opt)
+ pix2pixmodel.save_dir = os.path.dirname(pix2pix_model_path)
+ pix2pixmodel.load_networks('latest')
+ pix2pixmodel.eval()
+
+ return cls(model, pix2pixmodel)
+
+ def to(self, device):
+ self.model.to(device)
+ # TODO - refactor pix2pix implementation to support device migration
+ # self.pix2pixmodel.to(device)
+ return self
+
+ def __call__(self, input_image, thr_a=0, thr_b=0, boost=False, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ with torch.no_grad():
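+ # `boost` selects the multi-resolution merging path (estimateboost, following
+ # the BoostingMonocularDepth approach); otherwise a single LeReS pass is used.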
+ if boost:
+ depth = estimateboost(detected_map, self.model, 0, self.pix2pixmodel, max(detected_map.shape[1], detected_map.shape[0]))
+ else:
+ depth = estimateleres(detected_map, self.model, detected_map.shape[1], detected_map.shape[0])
+
+ numbytes=2
+ depth_min = depth.min()
+ depth_max = depth.max()
+ max_val = (2**(8*numbytes))-1
+
+ # check output before normalizing and mapping to 16 bit
+ if depth_max - depth_min > np.finfo("float").eps:
+ out = max_val * (depth - depth_min) / (depth_max - depth_min)
+ else:
+ out = np.zeros(depth.shape)
+
+ # single channel, 16 bit image
+ depth_image = out.astype("uint16")
+
+ # convert to uint8
+ depth_image = cv2.convertScaleAbs(depth_image, alpha=(255.0/65535.0))
+
+ # remove near
+ if thr_a != 0:
+ thr_a = ((thr_a/100)*255)
+ depth_image = cv2.threshold(depth_image, thr_a, 255, cv2.THRESH_TOZERO)[1]
+
+ # invert image
+ depth_image = cv2.bitwise_not(depth_image)
+
+ # remove bg
+ if thr_b != 0:
+ thr_b = ((thr_b/100)*255)
+ depth_image = cv2.threshold(depth_image, thr_b, 255, cv2.THRESH_TOZERO)[1]
+
+ detected_map = HWC3(remove_pad(depth_image))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..bb60b5b7a28893d1f43e79c09f7dd507cee5bf4d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/LICENSE
@@ -0,0 +1,23 @@
+https://github.com/thygate/stable-diffusion-webui-depthmap-script
+
+MIT License
+
+Copyright (c) 2023 Bob Thiry
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/Resnet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/Resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d19038d217c7251ae516bf43f66f9e25c4b040c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/Resnet.py
@@ -0,0 +1,199 @@
+import torch.nn as nn
+
+# Alias kept because the layer definitions below reference both `nn` and `NN`.
+NN = nn
+
+__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
+ 'resnet152']
+
+
+model_urls = {
+ 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+ 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+ 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+ 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+ 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+ """3x3 convolution with padding"""
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+ padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
+ super(BasicBlock, self).__init__()
+ self.conv1 = conv3x3(inplanes, planes, stride)
+ self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+ self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+ padding=1, bias=False)
+ self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = NN.BatchNorm2d(planes * self.expansion) #NN.BatchNorm2d
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Module):
+
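+ # The classification head (avgpool/fc) is commented out; forward() returns
+ # the four intermediate feature maps so the network can serve as a
+ # depth-estimation backbone.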
+ def __init__(self, block, layers, num_classes=1000):
+ self.inplanes = 64
+ super(ResNet, self).__init__()
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+ bias=False)
+ self.bn1 = NN.BatchNorm2d(64) #NN.BatchNorm2d
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, layers[0])
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+ #self.avgpool = nn.AvgPool2d(7, stride=1)
+ #self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+
+ def _make_layer(self, block, planes, blocks, stride=1):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.inplanes, planes * block.expansion,
+ kernel_size=1, stride=stride, bias=False),
+ NN.BatchNorm2d(planes * block.expansion), #NN.BatchNorm2d
+ )
+
+ layers = []
+ layers.append(block(self.inplanes, planes, stride, downsample))
+ self.inplanes = planes * block.expansion
+ for i in range(1, blocks):
+ layers.append(block(self.inplanes, planes))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ features = []
+
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+
+ x = self.layer1(x)
+ features.append(x)
+ x = self.layer2(x)
+ features.append(x)
+ x = self.layer3(x)
+ features.append(x)
+ x = self.layer4(x)
+ features.append(x)
+
+ return features
+
+
+def resnet18(pretrained=True, **kwargs):
+ """Constructs a ResNet-18 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
+ return model
+
+
+def resnet34(pretrained=True, **kwargs):
+ """Constructs a ResNet-34 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
+ return model
+
+
+def resnet50(pretrained=True, **kwargs):
+ """Constructs a ResNet-50 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
+
+ return model
+
+
+def resnet101(pretrained=True, **kwargs):
+ """Constructs a ResNet-101 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
+
+ return model
+
+
+def resnet152(pretrained=True, **kwargs):
+ """Constructs a ResNet-152 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
+ return model
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/Resnext_torch.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/Resnext_torch.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bb73dec40ba064b3941eea838d6a83d94033bbe
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/Resnext_torch.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+# coding: utf-8
+import torch.nn as nn
+
+try:
+ from urllib import urlretrieve
+except ImportError:
+ from urllib.request import urlretrieve
+
+__all__ = ['resnext101_32x8d']
+
+
+model_urls = {
+ 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
+ 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+ """3x3 convolution with padding"""
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+ padding=dilation, groups=groups, bias=False, dilation=dilation)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+ """1x1 convolution"""
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+ base_width=64, dilation=1, norm_layer=None):
+ super(BasicBlock, self).__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ if groups != 1 or base_width != 64:
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
+ if dilation > 1:
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+ # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+ self.conv1 = conv3x3(inplanes, planes, stride)
+ self.bn1 = norm_layer(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = norm_layer(planes)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
+ # while original implementation places the stride at the first 1x1 convolution(self.conv1)
+ # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
+ # This variant is also known as ResNet V1.5 and improves accuracy according to
+ # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+ expansion = 4
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+ base_width=64, dilation=1, norm_layer=None):
+ super(Bottleneck, self).__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ width = int(planes * (base_width / 64.)) * groups
+ # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+ self.conv1 = conv1x1(inplanes, width)
+ self.bn1 = norm_layer(width)
+ self.conv2 = conv3x3(width, width, stride, groups, dilation)
+ self.bn2 = norm_layer(width)
+ self.conv3 = conv1x1(width, planes * self.expansion)
+ self.bn3 = norm_layer(planes * self.expansion)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Module):
+
+ def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
+ groups=1, width_per_group=64, replace_stride_with_dilation=None,
+ norm_layer=None):
+ super(ResNet, self).__init__()
+ if norm_layer is None:
+ norm_layer = nn.BatchNorm2d
+ self._norm_layer = norm_layer
+
+ self.inplanes = 64
+ self.dilation = 1
+ if replace_stride_with_dilation is None:
+ # each element in the tuple indicates if we should replace
+ # the 2x2 stride with a dilated convolution instead
+ replace_stride_with_dilation = [False, False, False]
+ if len(replace_stride_with_dilation) != 3:
+ raise ValueError("replace_stride_with_dilation should be None "
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
+ self.groups = groups
+ self.base_width = width_per_group
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
+ bias=False)
+ self.bn1 = norm_layer(self.inplanes)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, layers[0])
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
+ dilate=replace_stride_with_dilation[0])
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
+ dilate=replace_stride_with_dilation[1])
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
+ dilate=replace_stride_with_dilation[2])
+ #self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+ #self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+
+ # Zero-initialize the last BN in each residual branch,
+ # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+ if zero_init_residual:
+ for m in self.modules():
+ if isinstance(m, Bottleneck):
+ nn.init.constant_(m.bn3.weight, 0)
+ elif isinstance(m, BasicBlock):
+ nn.init.constant_(m.bn2.weight, 0)
+
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+ norm_layer = self._norm_layer
+ downsample = None
+ previous_dilation = self.dilation
+ if dilate:
+ self.dilation *= stride
+ stride = 1
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ conv1x1(self.inplanes, planes * block.expansion, stride),
+ norm_layer(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
+ self.base_width, previous_dilation, norm_layer))
+ self.inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(block(self.inplanes, planes, groups=self.groups,
+ base_width=self.base_width, dilation=self.dilation,
+ norm_layer=norm_layer))
+
+ return nn.Sequential(*layers)
+
+ def _forward_impl(self, x):
+ # See note [TorchScript super()]
+ features = []
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+
+ x = self.layer1(x)
+ features.append(x)
+
+ x = self.layer2(x)
+ features.append(x)
+
+ x = self.layer3(x)
+ features.append(x)
+
+ x = self.layer4(x)
+ features.append(x)
+
+ #x = self.avgpool(x)
+ #x = torch.flatten(x, 1)
+ #x = self.fc(x)
+
+ return features
+
+ def forward(self, x):
+ return self._forward_impl(x)
+
+
+
+def resnext101_32x8d(pretrained=True, **kwargs):
+ """Constructs a ResNet-152 model.
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
+ """
+ kwargs['groups'] = 32
+ kwargs['width_per_group'] = 8
+
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
+ return model
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/depthmap.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/depthmap.py
new file mode 100644
index 0000000000000000000000000000000000000000..a86235c3da2bc65b1652df9eb94c87d9a1fe60ad
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/depthmap.py
@@ -0,0 +1,548 @@
+# Author: thygate
+# https://github.com/thygate/stable-diffusion-webui-depthmap-script
+
+import gc
+from operator import getitem
+
+import cv2
+import numpy as np
+import skimage.measure
+import torch
+from torchvision.transforms import transforms
+
+from ...util import torch_gc
+
+whole_size_threshold = 1600 # R_max from the paper
+pix2pixsize = 1024
+
+def scale_torch(img):
+ """
+    Scale the image and return it as a torch.Tensor.
+    :param img: input rgb in shape [H, W, C]; input depth/disp in shape [H, W]
+    :return: img as a [C, H, W] tensor
+ """
+ if len(img.shape) == 2:
+ img = img[np.newaxis, :, :]
+ if img.shape[2] == 3:
+        transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
+ img = transform(img.astype(np.float32))
+ else:
+ img = img.astype(np.float32)
+ img = torch.from_numpy(img)
+ return img
+
+def estimateleres(img, model, w, h):
+ device = next(iter(model.parameters())).device
+ # leres transform input
+ rgb_c = img[:, :, ::-1].copy()
+ A_resize = cv2.resize(rgb_c, (w, h))
+ img_torch = scale_torch(A_resize)[None, :, :, :]
+
+ # compute
+ with torch.no_grad():
+ img_torch = img_torch.to(device)
+ prediction = model.depth_model(img_torch)
+
+ prediction = prediction.squeeze().cpu().numpy()
+ prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC)
+
+ return prediction
+
+def generatemask(size):
+    # Generates a Gaussian mask
+ mask = np.zeros(size, dtype=np.float32)
+ sigma = int(size[0]/16)
+ k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1)
+ mask[int(0.15*size[0]):size[0] - int(0.15*size[0]), int(0.15*size[1]): size[1] - int(0.15*size[1])] = 1
+ mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma)
+ mask = (mask - mask.min()) / (mask.max() - mask.min())
+ mask = mask.astype(np.float32)
+ return mask
+
+def resizewithpool(img, size):
+ i_size = img.shape[0]
+ n = int(np.floor(i_size/size))
+
+ out = skimage.measure.block_reduce(img, (n, n), np.max)
+ return out
+
+def rgb2gray(rgb):
+ # Converts rgb to gray
+ return np.dot(rgb[..., :3], [0.2989, 0.5870, 0.1140])
+
+def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, whole_size_threshold=3000):
+ # Returns the R_x resolution described in section 5 of the main paper.
+
+ # Parameters:
+    #    img: input rgb image
+    #    basesize: size of the dilation kernel, equal to the receptive field of the network
+    #    confidence: value of x in R_x; allowed percentage of pixels that receive no contextual cue
+    #    scale_threshold: maximum allowed upscaling of the input image; set to 3
+    #    whole_size_threshold: maximum allowed resolution (R_max from section 6 of the main paper)
+
+    # Returns:
+    #    outputsize_scale*speed_scale: the computed R_x resolution
+    #    patch_scale: the K parameter from section 6 of the paper
+
+    # The speed_scale parameter processes every image at a smaller size to accelerate the R_x resolution search
+ speed_scale = 32
+ image_dim = int(min(img.shape[0:2]))
+
+ gray = rgb2gray(img)
+ grad = np.abs(cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)) + np.abs(cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3))
+ grad = cv2.resize(grad, (image_dim, image_dim), cv2.INTER_AREA)
+
+ # thresholding the gradient map to generate the edge-map as a proxy of the contextual cues
+ m = grad.min()
+ M = grad.max()
+ middle = m + (0.4 * (M - m))
+ grad[grad < middle] = 0
+ grad[grad >= middle] = 1
+
+ # dilation kernel with size of the receptive field
+ kernel = np.ones((int(basesize/speed_scale), int(basesize/speed_scale)), float)
+    # dilation kernel with size of a quarter of the receptive field, used to compute K
+    # as described in section 6 of the main paper
+ kernel2 = np.ones((int(basesize / (4*speed_scale)), int(basesize / (4*speed_scale))), float)
+
+ # Output resolution limit set by the whole_size_threshold and scale_threshold.
+ threshold = min(whole_size_threshold, scale_threshold * max(img.shape[:2]))
+
+ outputsize_scale = basesize / speed_scale
+ for p_size in range(int(basesize/speed_scale), int(threshold/speed_scale), int(basesize / (2*speed_scale))):
+ grad_resized = resizewithpool(grad, p_size)
+ grad_resized = cv2.resize(grad_resized, (p_size, p_size), cv2.INTER_NEAREST)
+ grad_resized[grad_resized >= 0.5] = 1
+ grad_resized[grad_resized < 0.5] = 0
+
+ dilated = cv2.dilate(grad_resized, kernel, iterations=1)
+ meanvalue = (1-dilated).mean()
+ if meanvalue > confidence:
+ break
+ else:
+ outputsize_scale = p_size
+
+ grad_region = cv2.dilate(grad_resized, kernel2, iterations=1)
+ patch_scale = grad_region.mean()
+
+ return int(outputsize_scale*speed_scale), patch_scale
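+
+# Worked example with the defaults used by estimateboost() below (basesize=448, speed_scale=32):
+# the edge map is dilated with a 14x14 kernel (int(448/32)), candidate sizes are searched in
+# steps of 7 (i.e. 224 px after rescaling by speed_scale), and kernel2 is 3x3 for patch_scale.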
+
+# Generate a double-input depth estimation
+def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel):
+ # Generate the low resolution estimation
+ estimate1 = singleestimate(img, size1, model, net_type)
+ # Resize to the inference size of merge network.
+ estimate1 = cv2.resize(estimate1, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
+
+ # Generate the high resolution estimation
+ estimate2 = singleestimate(img, size2, model, net_type)
+ # Resize to the inference size of merge network.
+ estimate2 = cv2.resize(estimate2, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
+
+ # Inference on the merge model
+ pix2pixmodel.set_input(estimate1, estimate2)
+ pix2pixmodel.test()
+ visuals = pix2pixmodel.get_current_visuals()
+ prediction_mapped = visuals['fake_B']
+ prediction_mapped = (prediction_mapped+1)/2
+ prediction_mapped = (prediction_mapped - torch.min(prediction_mapped)) / (
+ torch.max(prediction_mapped) - torch.min(prediction_mapped))
+ prediction_mapped = prediction_mapped.squeeze().cpu().numpy()
+
+ return prediction_mapped
+
+# Generate a single-input depth estimation
+def singleestimate(img, msize, model, net_type):
+ # if net_type == 0:
+ return estimateleres(img, model, msize, msize)
+ # else:
+ # return estimatemidasBoost(img, model, msize, msize)
+
+def applyGridpatch(blsize, stride, img, box):
+ # Extract a simple grid patch.
+ counter1 = 0
+ patch_bound_list = {}
+ for k in range(blsize, img.shape[1] - blsize, stride):
+ for j in range(blsize, img.shape[0] - blsize, stride):
+ patch_bound_list[str(counter1)] = {}
+ patchbounds = [j - blsize, k - blsize, j - blsize + 2 * blsize, k - blsize + 2 * blsize]
+ patch_bound = [box[0] + patchbounds[1], box[1] + patchbounds[0], patchbounds[3] - patchbounds[1],
+ patchbounds[2] - patchbounds[0]]
+ patch_bound_list[str(counter1)]['rect'] = patch_bound
+ patch_bound_list[str(counter1)]['size'] = patch_bound[2]
+ counter1 = counter1 + 1
+ return patch_bound_list
+
+# Generating local patches to perform the local refinement described in section 6 of the main paper.
+def generatepatchs(img, base_size):
+
+ # Compute the gradients as a proxy of the contextual cues.
+ img_gray = rgb2gray(img)
+ whole_grad = np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)) +\
+ np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3))
+
+ threshold = whole_grad[whole_grad > 0].mean()
+ whole_grad[whole_grad < threshold] = 0
+
+ # We use the integral image to speed-up the evaluation of the amount of gradients for each patch.
+ gf = whole_grad.sum()/len(whole_grad.reshape(-1))
+ grad_integral_image = cv2.integral(whole_grad)
+
+    # Variables are selected such that the initial patch size equals the receptive field size
+    # and the stride is set to 3/4 of the receptive field size.
+ blsize = int(round(base_size/2))
+ stride = int(round(blsize*0.75))
+
+ # Get initial Grid
+ patch_bound_list = applyGridpatch(blsize, stride, img, [0, 0, 0, 0])
+
+    # Refine the initial grid of patches by discarding those that are flat (in terms of RGB image gradients) and
+    # adjust each patch size to ensure there are enough depth cues for the network to generate a consistent depth map.
+ print("Selecting patches ...")
+ patch_bound_list = adaptiveselection(grad_integral_image, patch_bound_list, gf)
+
+    # Sort the patch list so that the merging operation is done in the correct order, starting from the biggest
+    # patch
+ patchset = sorted(patch_bound_list.items(), key=lambda x: getitem(x[1], 'size'), reverse=True)
+ return patchset
+
+def getGF_fromintegral(integralimage, rect):
+ # Computes the gradient density of a given patch from the gradient integral image.
+ x1 = rect[1]
+ x2 = rect[1]+rect[3]
+ y1 = rect[0]
+ y2 = rect[0]+rect[2]
+ value = integralimage[x2, y2]-integralimage[x1, y2]-integralimage[x2, y1]+integralimage[x1, y1]
+ return value
+
+# Adaptively select patches
+def adaptiveselection(integral_grad, patch_bound_list, gf):
+ patchlist = {}
+ count = 0
+ height, width = integral_grad.shape
+
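+    # Note: 'factor' is a module-level global assigned by estimateboost() before this function
+    # is reached via generatepatchs(); it is not defined locally.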
+ search_step = int(32/factor)
+
+ # Go through all patches
+ for c in range(len(patch_bound_list)):
+ # Get patch
+ bbox = patch_bound_list[str(c)]['rect']
+
+ # Compute the amount of gradients present in the patch from the integral image.
+ cgf = getGF_fromintegral(integral_grad, bbox)/(bbox[2]*bbox[3])
+
+ # Check if patching is beneficial by comparing the gradient density of the patch to
+ # the gradient density of the whole image
+ if cgf >= gf:
+ bbox_test = bbox.copy()
+ patchlist[str(count)] = {}
+
+ # Enlarge each patch until the gradient density of the patch is equal
+ # to the whole image gradient density
+ while True:
+
+ bbox_test[0] = bbox_test[0] - int(search_step/2)
+ bbox_test[1] = bbox_test[1] - int(search_step/2)
+
+ bbox_test[2] = bbox_test[2] + search_step
+ bbox_test[3] = bbox_test[3] + search_step
+
+ # Check if we are still within the image
+ if bbox_test[0] < 0 or bbox_test[1] < 0 or bbox_test[1] + bbox_test[3] >= height \
+ or bbox_test[0] + bbox_test[2] >= width:
+ break
+
+ # Compare gradient density
+ cgf = getGF_fromintegral(integral_grad, bbox_test)/(bbox_test[2]*bbox_test[3])
+ if cgf < gf:
+ break
+ bbox = bbox_test.copy()
+
+ # Add patch to selected patches
+ patchlist[str(count)]['rect'] = bbox
+ patchlist[str(count)]['size'] = bbox[2]
+ count = count + 1
+
+ # Return selected patches
+ return patchlist
+
+def impatch(image, rect):
+ # Extract the given patch pixels from a given image.
+ w1 = rect[0]
+ h1 = rect[1]
+ w2 = w1 + rect[2]
+ h2 = h1 + rect[3]
+ image_patch = image[h1:h2, w1:w2]
+ return image_patch
+
+class ImageandPatchs:
+ def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1):
+ self.root_dir = root_dir
+ self.patchsinfo = patchsinfo
+ self.name = name
+ self.patchs = patchsinfo
+ self.scale = scale
+
+ self.rgb_image = cv2.resize(rgb_image, (round(rgb_image.shape[1]*scale), round(rgb_image.shape[0]*scale)),
+ interpolation=cv2.INTER_CUBIC)
+
+ self.do_have_estimate = False
+ self.estimation_updated_image = None
+ self.estimation_base_image = None
+
+ def __len__(self):
+ return len(self.patchs)
+
+ def set_base_estimate(self, est):
+ self.estimation_base_image = est
+ if self.estimation_updated_image is not None:
+ self.do_have_estimate = True
+
+ def set_updated_estimate(self, est):
+ self.estimation_updated_image = est
+ if self.estimation_base_image is not None:
+ self.do_have_estimate = True
+
+ def __getitem__(self, index):
+ patch_id = int(self.patchs[index][0])
+ rect = np.array(self.patchs[index][1]['rect'])
+ msize = self.patchs[index][1]['size']
+
+ ## applying scale to rect:
+ rect = np.round(rect * self.scale)
+ rect = rect.astype('int')
+ msize = round(msize * self.scale)
+
+ patch_rgb = impatch(self.rgb_image, rect)
+ if self.do_have_estimate:
+ patch_whole_estimate_base = impatch(self.estimation_base_image, rect)
+ patch_whole_estimate_updated = impatch(self.estimation_updated_image, rect)
+ return {'patch_rgb': patch_rgb, 'patch_whole_estimate_base': patch_whole_estimate_base,
+ 'patch_whole_estimate_updated': patch_whole_estimate_updated, 'rect': rect,
+ 'size': msize, 'id': patch_id}
+ else:
+ return {'patch_rgb': patch_rgb, 'rect': rect, 'size': msize, 'id': patch_id}
+
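+    # The two methods below appear to be carried over from a pix2pix-style options class:
+    # they reference self.parser, self.gather_options() and self.isTrain, none of which
+    # ImageandPatchs defines, and they are not used by estimateboost().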
+ def print_options(self, opt):
+ """Print and save options
+
+ It will print both current options and default values(if different).
+ It will save options into a text file / [checkpoints_dir] / opt.txt
+ """
+ message = ''
+ message += '----------------- Options ---------------\n'
+ for k, v in sorted(vars(opt).items()):
+ comment = ''
+ default = self.parser.get_default(k)
+ if v != default:
+ comment = '\t[default: %s]' % str(default)
+ message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment)
+ message += '----------------- End -------------------'
+ print(message)
+
+ # save to the disk
+ """
+ expr_dir = os.path.join(opt.checkpoints_dir, opt.name)
+ util.mkdirs(expr_dir)
+ file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase))
+ with open(file_name, 'wt') as opt_file:
+ opt_file.write(message)
+ opt_file.write('\n')
+ """
+
+ def parse(self):
+ """Parse our options, create checkpoints directory suffix, and set up gpu device."""
+ opt = self.gather_options()
+ opt.isTrain = self.isTrain # train or test
+
+ # process opt.suffix
+ if opt.suffix:
+ suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else ''
+ opt.name = opt.name + suffix
+
+ #self.print_options(opt)
+
+ # set gpu ids
+ str_ids = opt.gpu_ids.split(',')
+ opt.gpu_ids = []
+ for str_id in str_ids:
+ id = int(str_id)
+ if id >= 0:
+ opt.gpu_ids.append(id)
+ #if len(opt.gpu_ids) > 0:
+ # torch.cuda.set_device(opt.gpu_ids[0])
+
+ self.opt = opt
+ return self.opt
+
+
+def estimateboost(img, model, model_type, pix2pixmodel, max_res=512, depthmap_script_boost_rmax=None):
+ global whole_size_threshold
+
+ # get settings
+ if depthmap_script_boost_rmax:
+ whole_size_threshold = depthmap_script_boost_rmax
+
+ if model_type == 0: #leres
+ net_receptive_field_size = 448
+ patch_netsize = 2 * net_receptive_field_size
+ elif model_type == 1: #dpt_beit_large_512
+ net_receptive_field_size = 512
+ patch_netsize = 2 * net_receptive_field_size
+ else: #other midas
+ net_receptive_field_size = 384
+ patch_netsize = 2 * net_receptive_field_size
+
+ gc.collect()
+ torch_gc()
+
+    # Generate the mask used to smoothly blend the local patch estimations into the base estimate.
+    # It is arbitrarily large to avoid artifacts when it is rescaled for each crop.
+ mask_org = generatemask((3000, 3000))
+ mask = mask_org.copy()
+
+    # Value x of R_x defined in section 5 of the main paper.
+ r_threshold_value = 0.2
+ #if R0:
+ # r_threshold_value = 0
+
+ input_resolution = img.shape
+ scale_threshold = 3 # Allows up-scaling with a scale up to 3
+
+    # Find the best input resolution R_x. The resolution search is described in section 5 (double estimation) of
+    # the main paper and in section B of the supplementary material.
+ whole_image_optimal_size, patch_scale = calculateprocessingres(img, net_receptive_field_size, r_threshold_value, scale_threshold, whole_size_threshold)
+
+ # print('wholeImage being processed in :', whole_image_optimal_size)
+
+ # Generate the base estimate using the double estimation.
+ whole_estimate = doubleestimate(img, net_receptive_field_size, whole_image_optimal_size, pix2pixsize, model, model_type, pix2pixmodel)
+
+ # Compute the multiplier described in section 6 of the main paper to make sure our initial patch can select
+ # small high-density regions of the image.
+ global factor
+ factor = max(min(1, 4 * patch_scale * whole_image_optimal_size / whole_size_threshold), 0.2)
+ # print('Adjust factor is:', 1/factor)
+
+ # Check if Local boosting is beneficial.
+ if max_res < whole_image_optimal_size:
+ # print("No Local boosting. Specified Max Res is smaller than R20, Returning doubleestimate result")
+ return cv2.resize(whole_estimate, (input_resolution[1], input_resolution[0]), interpolation=cv2.INTER_CUBIC)
+
+ # Compute the default target resolution.
+ if img.shape[0] > img.shape[1]:
+ a = 2 * whole_image_optimal_size
+ b = round(2 * whole_image_optimal_size * img.shape[1] / img.shape[0])
+ else:
+ a = round(2 * whole_image_optimal_size * img.shape[0] / img.shape[1])
+ b = 2 * whole_image_optimal_size
+ b = int(round(b / factor))
+ a = int(round(a / factor))
+
+ """
+ # recompute a, b and saturate to max res.
+ if max(a,b) > max_res:
+ print('Default Res is higher than max-res: Reducing final resolution')
+ if img.shape[0] > img.shape[1]:
+ a = max_res
+ b = round(max_res * img.shape[1] / img.shape[0])
+ else:
+ a = round(max_res * img.shape[0] / img.shape[1])
+ b = max_res
+ b = int(b)
+ a = int(a)
+ """
+
+ img = cv2.resize(img, (b, a), interpolation=cv2.INTER_CUBIC)
+
+ # Extract selected patches for local refinement
+ base_size = net_receptive_field_size * 2
+ patchset = generatepatchs(img, base_size)
+
+ # print('Target resolution: ', img.shape)
+
+    # Compute a scale in case the user requested results at the same resolution as the input.
+    # Note that the method's output resolution is independent of the input resolution; this parameter only
+    # enables a scaling operation during the local patch merge so that the final result matches the input
+    # resolution.
+ """
+ if output_resolution == 1:
+ mergein_scale = input_resolution[0] / img.shape[0]
+ print('Dynamicly change merged-in resolution; scale:', mergein_scale)
+ else:
+ mergein_scale = 1
+ """
+ # always rescale to input res for now
+ mergein_scale = input_resolution[0] / img.shape[0]
+
+ imageandpatchs = ImageandPatchs('', '', patchset, img, mergein_scale)
+ whole_estimate_resized = cv2.resize(whole_estimate, (round(img.shape[1]*mergein_scale),
+ round(img.shape[0]*mergein_scale)), interpolation=cv2.INTER_CUBIC)
+ imageandpatchs.set_base_estimate(whole_estimate_resized.copy())
+ imageandpatchs.set_updated_estimate(whole_estimate_resized.copy())
+
+ print('Resulting depthmap resolution will be :', whole_estimate_resized.shape[:2])
+ print('Patches to process: '+str(len(imageandpatchs)))
+
+    # Iterate over all patches, generate their estimates, and refine the base estimate.
+ for patch_ind in range(len(imageandpatchs)):
+
+ # Get patch information
+ patch = imageandpatchs[patch_ind] # patch object
+ patch_rgb = patch['patch_rgb'] # rgb patch
+ patch_whole_estimate_base = patch['patch_whole_estimate_base'] # corresponding patch from base
+ rect = patch['rect'] # patch size and location
+ patch_id = patch['id'] # patch ID
+ org_size = patch_whole_estimate_base.shape # the original size from the unscaled input
+ print('\t Processing patch', patch_ind, '/', len(imageandpatchs)-1, '|', rect)
+
+        # We apply double estimation for the patches. The high-resolution value is fixed to twice the
+        # network's receptive field size for patches, to accelerate the process.
+ patch_estimation = doubleestimate(patch_rgb, net_receptive_field_size, patch_netsize, pix2pixsize, model, model_type, pix2pixmodel)
+ patch_estimation = cv2.resize(patch_estimation, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
+ patch_whole_estimate_base = cv2.resize(patch_whole_estimate_base, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
+
+ # Merging the patch estimation into the base estimate using our merge network:
+ # We feed the patch estimation and the same region from the updated base estimate to the merge network
+ # to generate the target estimate for the corresponding region.
+ pix2pixmodel.set_input(patch_whole_estimate_base, patch_estimation)
+
+ # Run merging network
+ pix2pixmodel.test()
+ visuals = pix2pixmodel.get_current_visuals()
+
+ prediction_mapped = visuals['fake_B']
+ prediction_mapped = (prediction_mapped+1)/2
+ prediction_mapped = prediction_mapped.squeeze().cpu().numpy()
+
+ mapped = prediction_mapped
+
+        # We fit a simple linear polynomial so that the result of the merge network matches the value
+        # range of the base estimate
+ p_coef = np.polyfit(mapped.reshape(-1), patch_whole_estimate_base.reshape(-1), deg=1)
+ merged = np.polyval(p_coef, mapped.reshape(-1)).reshape(mapped.shape)
+
+ merged = cv2.resize(merged, (org_size[1],org_size[0]), interpolation=cv2.INTER_CUBIC)
+
+ # Get patch size and location
+ w1 = rect[0]
+ h1 = rect[1]
+ w2 = w1 + rect[2]
+ h2 = h1 + rect[3]
+
+ # To speed up the implementation, we only generate the Gaussian mask once with a sufficiently large size
+ # and resize it to our needed size while merging the patches.
+ if mask.shape != org_size:
+ mask = cv2.resize(mask_org, (org_size[1],org_size[0]), interpolation=cv2.INTER_LINEAR)
+
+ tobemergedto = imageandpatchs.estimation_updated_image
+
+ # Update the whole estimation:
+ # We use a simple Gaussian mask to blend the merged patch region with the base estimate to ensure seamless
+ # blending at the boundaries of the patch region.
+ tobemergedto[h1:h2, w1:w2] = np.multiply(tobemergedto[h1:h2, w1:w2], 1 - mask) + np.multiply(merged, mask)
+ imageandpatchs.set_updated_estimate(tobemergedto)
+
+ # output
+ return cv2.resize(imageandpatchs.estimation_updated_image, (input_resolution[1], input_resolution[0]), interpolation=cv2.INTER_CUBIC)
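+
+# Minimal usage sketch (an assumption for illustration, not part of the upstream script):
+# 'model' is a LeReS RelDepthModel and 'pix2pixmodel' is the pix2pix merge-network wrapper
+# from leres/pix2pix, both already placed on the appropriate device by the caller.
+#
+#   import numpy as np
+#   rgb = (np.random.rand(768, 1024, 3) * 255).astype(np.uint8)       # H x W x 3 uint8 image
+#   depth = estimateboost(rgb, model, 0, pix2pixmodel, max_res=1024)  # model_type 0 == leres
+#   assert depth.shape == rgb.shape[:2]                               # per-pixel relative depth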
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/multi_depth_model_woauxi.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/multi_depth_model_woauxi.py
new file mode 100644
index 0000000000000000000000000000000000000000..837a847efccf8ad522fa9ac29a6fba4a273d1062
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/multi_depth_model_woauxi.py
@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+
+from . import network_auxi as network
+from .net_tools import get_func
+
+
+class RelDepthModel(nn.Module):
+ def __init__(self, backbone='resnet50'):
+ super(RelDepthModel, self).__init__()
+ if backbone == 'resnet50':
+ encoder = 'resnet50_stride32'
+ elif backbone == 'resnext101':
+ encoder = 'resnext101_stride32x8d'
+ self.depth_model = DepthModel(encoder)
+
+ def inference(self, rgb):
+ with torch.no_grad():
+ input = rgb.to(self.depth_model.device)
+ depth = self.depth_model(input)
+ #pred_depth_out = depth - depth.min() + 0.01
+ return depth #pred_depth_out
+
+
+class DepthModel(nn.Module):
+ def __init__(self, encoder):
+ super(DepthModel, self).__init__()
+ backbone = network.__name__.split('.')[-1] + '.' + encoder
+ self.encoder_modules = get_func(backbone)()
+ self.decoder_modules = network.Decoder()
+
+ def forward(self, x):
+ lateral_out = self.encoder_modules(x)
+ out_logit = self.decoder_modules(lateral_out)
+ return out_logit
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/net_tools.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/net_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..f50b1a6498ac53e7991493f3347b4d136901e931
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/net_tools.py
@@ -0,0 +1,54 @@
+import importlib
+import torch
+import os
+from collections import OrderedDict
+
+
+def get_func(func_name):
+ """Helper to return a function object by name. func_name must identify a
+ function in this module or the path to a function relative to the base
+ 'modeling' module.
+ """
+ if func_name == '':
+ return None
+ try:
+ parts = func_name.split('.')
+ # Refers to a function in this module
+ if len(parts) == 1:
+ return globals()[parts[0]]
+ # Otherwise, assume we're referencing a module under modeling
+ module_name = 'custom_controlnet_aux.leres.leres.' + '.'.join(parts[:-1])
+ module = importlib.import_module(module_name)
+ return getattr(module, parts[-1])
+ except Exception:
+        print('Failed to find function: %s' % func_name)
+ raise
+
+def load_ckpt(args, depth_model, shift_model, focal_model):
+ """
+ Load checkpoint.
+ """
+ if os.path.isfile(args.load_ckpt):
+ print("loading checkpoint %s" % args.load_ckpt)
+ checkpoint = torch.load(args.load_ckpt)
+ if shift_model is not None:
+ shift_model.load_state_dict(strip_prefix_if_present(checkpoint['shift_model'], 'module.'),
+ strict=True)
+ if focal_model is not None:
+ focal_model.load_state_dict(strip_prefix_if_present(checkpoint['focal_model'], 'module.'),
+ strict=True)
+ depth_model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."),
+ strict=True)
+ del checkpoint
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
+
+def strip_prefix_if_present(state_dict, prefix):
+ keys = sorted(state_dict.keys())
+ if not all(key.startswith(prefix) for key in keys):
+ return state_dict
+ stripped_state_dict = OrderedDict()
+ for key, value in state_dict.items():
+ stripped_state_dict[key.replace(prefix, "")] = value
+ return stripped_state_dict
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/network_auxi.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/network_auxi.py
new file mode 100644
index 0000000000000000000000000000000000000000..642c0e1228fd1a2703dd8cf0ce4f6b039c1cf705
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/leres/network_auxi.py
@@ -0,0 +1,417 @@
+import torch
+import torch.nn as nn
+import torch.nn.init as init
+
+from . import Resnet, Resnext_torch
+
+
+def resnet50_stride32():
+ return DepthNet(backbone='resnet', depth=50, upfactors=[2, 2, 2, 2])
+
+def resnext101_stride32x8d():
+ return DepthNet(backbone='resnext101_32x8d', depth=101, upfactors=[2, 2, 2, 2])
+
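+# Shape sketch (assuming a 1x3x448x448 input): the ResNe(X)t encoder built by the factories above
+# returns four feature maps with 256/512/1024/2048 channels at strides 4/8/16/32; the Decoder below
+# fuses them through FTB/FFM blocks and the AO head into a single-channel map at the input resolution.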
+
+class Decoder(nn.Module):
+ def __init__(self):
+ super(Decoder, self).__init__()
+ self.inchannels = [256, 512, 1024, 2048]
+ self.midchannels = [256, 256, 256, 512]
+ self.upfactors = [2,2,2,2]
+ self.outchannels = 1
+
+ self.conv = FTB(inchannels=self.inchannels[3], midchannels=self.midchannels[3])
+ self.conv1 = nn.Conv2d(in_channels=self.midchannels[3], out_channels=self.midchannels[2], kernel_size=3, padding=1, stride=1, bias=True)
+ self.upsample = nn.Upsample(scale_factor=self.upfactors[3], mode='bilinear', align_corners=True)
+
+ self.ffm2 = FFM(inchannels=self.inchannels[2], midchannels=self.midchannels[2], outchannels = self.midchannels[2], upfactor=self.upfactors[2])
+ self.ffm1 = FFM(inchannels=self.inchannels[1], midchannels=self.midchannels[1], outchannels = self.midchannels[1], upfactor=self.upfactors[1])
+ self.ffm0 = FFM(inchannels=self.inchannels[0], midchannels=self.midchannels[0], outchannels = self.midchannels[0], upfactor=self.upfactors[0])
+
+ self.outconv = AO(inchannels=self.midchannels[0], outchannels=self.outchannels, upfactor=2)
+ self._init_params()
+
+ def _init_params(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d): #NN.BatchNorm2d
+ init.constant_(m.weight, 1)
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+
+ def forward(self, features):
+ x_32x = self.conv(features[3]) # 1/32
+ x_32 = self.conv1(x_32x)
+ x_16 = self.upsample(x_32) # 1/16
+
+ x_8 = self.ffm2(features[2], x_16) # 1/8
+ x_4 = self.ffm1(features[1], x_8) # 1/4
+ x_2 = self.ffm0(features[0], x_4) # 1/2
+ #-----------------------------------------
+ x = self.outconv(x_2) # original size
+ return x
+
+class DepthNet(nn.Module):
+ __factory = {
+ 18: Resnet.resnet18,
+ 34: Resnet.resnet34,
+ 50: Resnet.resnet50,
+ 101: Resnet.resnet101,
+ 152: Resnet.resnet152
+ }
+ def __init__(self,
+ backbone='resnet',
+ depth=50,
+ upfactors=[2, 2, 2, 2]):
+ super(DepthNet, self).__init__()
+ self.backbone = backbone
+ self.depth = depth
+ self.pretrained = False
+ self.inchannels = [256, 512, 1024, 2048]
+ self.midchannels = [256, 256, 256, 512]
+ self.upfactors = upfactors
+ self.outchannels = 1
+
+ # Build model
+ if self.backbone == 'resnet':
+ if self.depth not in DepthNet.__factory:
+ raise KeyError("Unsupported depth:", self.depth)
+ self.encoder = DepthNet.__factory[depth](pretrained=self.pretrained)
+ elif self.backbone == 'resnext101_32x8d':
+ self.encoder = Resnext_torch.resnext101_32x8d(pretrained=self.pretrained)
+ else:
+ self.encoder = Resnext_torch.resnext101(pretrained=self.pretrained)
+
+ def forward(self, x):
+ x = self.encoder(x) # 1/32, 1/16, 1/8, 1/4
+ return x
+
+
+class FTB(nn.Module):
+ def __init__(self, inchannels, midchannels=512):
+ super(FTB, self).__init__()
+ self.in1 = inchannels
+ self.mid = midchannels
+ self.conv1 = nn.Conv2d(in_channels=self.in1, out_channels=self.mid, kernel_size=3, padding=1, stride=1,
+ bias=True)
+ # NN.BatchNorm2d
+ self.conv_branch = nn.Sequential(nn.ReLU(inplace=True), \
+ nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3,
+ padding=1, stride=1, bias=True), \
+ nn.BatchNorm2d(num_features=self.mid), \
+ nn.ReLU(inplace=True), \
+ nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3,
+ padding=1, stride=1, bias=True))
+ self.relu = nn.ReLU(inplace=True)
+
+ self.init_params()
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = x + self.conv_branch(x)
+ x = self.relu(x)
+
+ return x
+
+ def init_params(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
+ init.constant_(m.weight, 1)
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+
+
+class ATA(nn.Module):
+ def __init__(self, inchannels, reduction=8):
+ super(ATA, self).__init__()
+ self.inchannels = inchannels
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
+ self.fc = nn.Sequential(nn.Linear(self.inchannels * 2, self.inchannels // reduction),
+ nn.ReLU(inplace=True),
+ nn.Linear(self.inchannels // reduction, self.inchannels),
+ nn.Sigmoid())
+ self.init_params()
+
+ def forward(self, low_x, high_x):
+ n, c, _, _ = low_x.size()
+ x = torch.cat([low_x, high_x], 1)
+ x = self.avg_pool(x)
+ x = x.view(n, -1)
+ x = self.fc(x).view(n, c, 1, 1)
+ x = low_x * x + high_x
+
+ return x
+
+ def init_params(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ # init.normal(m.weight, std=0.01)
+ init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ # init.normal_(m.weight, std=0.01)
+ init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
+ init.constant_(m.weight, 1)
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+
+
+class FFM(nn.Module):
+ def __init__(self, inchannels, midchannels, outchannels, upfactor=2):
+ super(FFM, self).__init__()
+ self.inchannels = inchannels
+ self.midchannels = midchannels
+ self.outchannels = outchannels
+ self.upfactor = upfactor
+
+ self.ftb1 = FTB(inchannels=self.inchannels, midchannels=self.midchannels)
+ # self.ata = ATA(inchannels = self.midchannels)
+ self.ftb2 = FTB(inchannels=self.midchannels, midchannels=self.outchannels)
+
+ self.upsample = nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True)
+
+ self.init_params()
+
+ def forward(self, low_x, high_x):
+ x = self.ftb1(low_x)
+ x = x + high_x
+ x = self.ftb2(x)
+ x = self.upsample(x)
+
+ return x
+
+ def init_params(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d): # NN.Batchnorm2d
+ init.constant_(m.weight, 1)
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+
+
+class AO(nn.Module):
+ # Adaptive output module
+ def __init__(self, inchannels, outchannels, upfactor=2):
+ super(AO, self).__init__()
+ self.inchannels = inchannels
+ self.outchannels = outchannels
+ self.upfactor = upfactor
+
+ self.adapt_conv = nn.Sequential(
+ nn.Conv2d(in_channels=self.inchannels, out_channels=self.inchannels // 2, kernel_size=3, padding=1,
+ stride=1, bias=True), \
+ nn.BatchNorm2d(num_features=self.inchannels // 2), \
+ nn.ReLU(inplace=True), \
+ nn.Conv2d(in_channels=self.inchannels // 2, out_channels=self.outchannels, kernel_size=3, padding=1,
+ stride=1, bias=True), \
+ nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True))
+
+ self.init_params()
+
+ def forward(self, x):
+ x = self.adapt_conv(x)
+ return x
+
+ def init_params(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d): # NN.Batchnorm2d
+ init.constant_(m.weight, 1)
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+
+
+
+# ==============================================================================================================
+
+
+class ResidualConv(nn.Module):
+ def __init__(self, inchannels):
+ super(ResidualConv, self).__init__()
+ # NN.BatchNorm2d
+ self.conv = nn.Sequential(
+ # nn.BatchNorm2d(num_features=inchannels),
+ nn.ReLU(inplace=False),
+ # nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=3, padding=1, stride=1, groups=inchannels,bias=True),
+ # nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=1, padding=0, stride=1, groups=1,bias=True)
+            # integer division: channel counts passed to Conv2d/BatchNorm2d must be ints
+            nn.Conv2d(in_channels=inchannels, out_channels=inchannels // 2, kernel_size=3, padding=1, stride=1,
+                      bias=False),
+            nn.BatchNorm2d(num_features=inchannels // 2),
+            nn.ReLU(inplace=False),
+            nn.Conv2d(in_channels=inchannels // 2, out_channels=inchannels, kernel_size=3, padding=1, stride=1,
+ bias=False)
+ )
+ self.init_params()
+
+ def forward(self, x):
+ x = self.conv(x) + x
+ return x
+
+ def init_params(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
+ init.constant_(m.weight, 1)
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+
+
+class FeatureFusion(nn.Module):
+ def __init__(self, inchannels, outchannels):
+ super(FeatureFusion, self).__init__()
+ self.conv = ResidualConv(inchannels=inchannels)
+ # NN.BatchNorm2d
+ self.up = nn.Sequential(ResidualConv(inchannels=inchannels),
+ nn.ConvTranspose2d(in_channels=inchannels, out_channels=outchannels, kernel_size=3,
+ stride=2, padding=1, output_padding=1),
+ nn.BatchNorm2d(num_features=outchannels),
+ nn.ReLU(inplace=True))
+
+ def forward(self, lowfeat, highfeat):
+ return self.up(highfeat + self.conv(lowfeat))
+
+ def init_params(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ # init.kaiming_normal_(m.weight, mode='fan_out')
+ init.normal_(m.weight, std=0.01)
+ # init.xavier_normal_(m.weight)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
+ init.constant_(m.weight, 1)
+ init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ init.normal_(m.weight, std=0.01)
+ if m.bias is not None:
+ init.constant_(m.bias, 0)
+
+
+class SenceUnderstand(nn.Module):
+ def __init__(self, channels):
+ super(SenceUnderstand, self).__init__()
+ self.channels = channels
+ self.conv1 = nn.Sequential(nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True))
+ self.pool = nn.AdaptiveAvgPool2d(8)
+ self.fc = nn.Sequential(nn.Linear(512 * 8 * 8, self.channels),
+ nn.ReLU(inplace=True))
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_channels=self.channels, out_channels=self.channels, kernel_size=1, padding=0),
+ nn.ReLU(inplace=True))
+ self.initial_params()
+
+ def forward(self, x):
+ n, c, h, w = x.size()
+ x = self.conv1(x)
+ x = self.pool(x)
+ x = x.view(n, -1)
+ x = self.fc(x)
+ x = x.view(n, self.channels, 1, 1)
+ x = self.conv2(x)
+ x = x.repeat(1, 1, h, w)
+ return x
+
+ def initial_params(self, dev=0.01):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ # print torch.sum(m.weight)
+ m.weight.data.normal_(0, dev)
+ if m.bias is not None:
+ m.bias.data.fill_(0)
+ elif isinstance(m, nn.ConvTranspose2d):
+ # print torch.sum(m.weight)
+ m.weight.data.normal_(0, dev)
+ if m.bias is not None:
+ m.bias.data.fill_(0)
+ elif isinstance(m, nn.Linear):
+ m.weight.data.normal_(0, dev)
+
+
+if __name__ == '__main__':
+    net = DepthNet(depth=50)  # DepthNet takes no 'pretrained' argument
+    print(net)
+    inputs = torch.ones(4, 3, 128, 128)
+    out = net(inputs)  # the encoder returns a list of feature maps (strides 4 to 32)
+    print([o.size() for o in out])
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..2a8383334a8f31b0115ccf5d4d0c34e23c6f890b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/LICENSE
@@ -0,0 +1,19 @@
+https://github.com/compphoto/BoostingMonocularDepth
+
+Copyright 2021, Seyed Mahdi Hosseini Miangoleh, Sebastian Dille, Computational Photography Laboratory. All rights reserved.
+
+This software is for academic use only. A redistribution of this
+software, with or without modifications, has to be for academic
+use only, while giving the appropriate credit to the original
+authors of the software. The methods implemented as a part of
+this software may be covered under patents or patent applications.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..203ec30728be190f5ab8f6237cede3c1ab6e01f4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/__init__.py
@@ -0,0 +1,67 @@
+"""This package contains modules related to objective functions, optimizations, and network architectures.
+
+To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel.
+You need to implement the following five functions:
+ -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
+    -- <set_input>: unpack data from dataset and apply preprocessing.
+    -- <forward>: produce intermediate results.
+    -- <optimize_parameters>: calculate loss, gradients, and update network weights.
+    -- <modify_commandline_options>: (optionally) add model-specific options and set default options.
+
+In the function <__init__>, you need to define four lists:
+ -- self.loss_names (str list): specify the training losses that you want to plot and save.
+ -- self.model_names (str list): define networks used in our training.
+ -- self.visual_names (str list): specify the images that you want to display and save.
+    -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example.
+
+Now you can use the model class by specifying flag '--model dummy'.
+See our template model class 'template_model.py' for more details.
+"""
+
+import importlib
+from .base_model import BaseModel
+
+
+def find_model_using_name(model_name):
+ """Import the module "models/[model_name]_model.py".
+
+ In the file, the class called DatasetNameModel() will
+ be instantiated. It has to be a subclass of BaseModel,
+ and it is case-insensitive.
+ """
+ model_filename = "custom_controlnet_aux.leres.pix2pix.models." + model_name + "_model"
+ modellib = importlib.import_module(model_filename)
+ model = None
+ target_model_name = model_name.replace('_', '') + 'model'
+ for name, cls in modellib.__dict__.items():
+ if name.lower() == target_model_name.lower() \
+ and issubclass(cls, BaseModel):
+ model = cls
+
+ if model is None:
+ print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name))
+ exit(0)
+
+ return model
+
+
+def get_option_setter(model_name):
+ """Return the static method of the model class."""
+ model_class = find_model_using_name(model_name)
+ return model_class.modify_commandline_options
+
+
+def create_model(opt):
+ """Create a model given the option.
+
+    This function instantiates the model class found by find_model_using_name().
+ This is the main interface between this package and 'train.py'/'test.py'
+
+ Example:
+ >>> from models import create_model
+ >>> model = create_model(opt)
+ """
+ model = find_model_using_name(opt.model)
+ instance = model(opt)
+ print("model [%s] was created" % type(instance).__name__)
+ return instance
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/base_model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/base_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b6abae0eabbe8c7d390165679ffa8c553de3a72
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/base_model.py
@@ -0,0 +1,244 @@
+import gc
+import os
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+
+import torch
+
+from ....util import torch_gc
+from . import networks
+
+
+class BaseModel(ABC):
+ """This class is an abstract base class (ABC) for models.
+ To create a subclass, you need to implement the following five functions:
+ -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
+        -- <set_input>: unpack data from dataset and apply preprocessing.
+        -- <forward>: produce intermediate results.
+        -- <optimize_parameters>: calculate losses, gradients, and update network weights.
+        -- <modify_commandline_options>: (optionally) add model-specific options and set default options.
+ """
+
+ def __init__(self, opt):
+ """Initialize the BaseModel class.
+
+ Parameters:
+ opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
+
+ When creating your custom class, you need to implement your own initialization.
+        In this function, you should first call <BaseModel.__init__(self, opt)>
+ Then, you need to define four lists:
+ -- self.loss_names (str list): specify the training losses that you want to plot and save.
+ -- self.model_names (str list): define networks used in our training.
+ -- self.visual_names (str list): specify the images that you want to display and save.
+ -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example.
+ """
+ self.opt = opt
+ self.gpu_ids = opt.gpu_ids
+ self.isTrain = opt.isTrain
+ self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') # get device name: CPU or GPU
+ self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) # save all the checkpoints to save_dir
+ if opt.preprocess != 'scale_width': # with [scale_width], input images might have different sizes, which hurts the performance of cudnn.benchmark.
+ torch.backends.cudnn.benchmark = True
+ self.loss_names = []
+ self.model_names = []
+ self.visual_names = []
+ self.optimizers = []
+ self.image_paths = []
+ self.metric = 0 # used for learning rate policy 'plateau'
+
+ @staticmethod
+ def modify_commandline_options(parser, is_train):
+ """Add new model-specific options, and rewrite default values for existing options.
+
+ Parameters:
+ parser -- original option parser
+ is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.
+
+ Returns:
+ the modified parser.
+ """
+ return parser
+
+ @abstractmethod
+ def set_input(self, input):
+ """Unpack input data from the dataloader and perform necessary pre-processing steps.
+
+ Parameters:
+ input (dict): includes the data itself and its metadata information.
+ """
+ pass
+
+ @abstractmethod
+ def forward(self):
+ """Run forward pass; called by both functions and ."""
+ pass
+
+ @abstractmethod
+ def optimize_parameters(self):
+ """Calculate losses, gradients, and update network weights; called in every training iteration"""
+ pass
+
+ def setup(self, opt):
+ """Load and print networks; create schedulers
+
+ Parameters:
+ opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
+ """
+ if self.isTrain:
+ self.schedulers = [networks.get_scheduler(optimizer, opt) for optimizer in self.optimizers]
+ if not self.isTrain or opt.continue_train:
+ load_suffix = 'iter_%d' % opt.load_iter if opt.load_iter > 0 else opt.epoch
+ self.load_networks(load_suffix)
+ self.print_networks(opt.verbose)
+
+ def eval(self):
+ """Make models eval mode during test time"""
+ for name in self.model_names:
+ if isinstance(name, str):
+ net = getattr(self, 'net' + name)
+ net.eval()
+
+ def test(self):
+ """Forward function used in test time.
+
+        This function wraps <forward> in no_grad() so we don't save intermediate steps for backprop.
+        It also calls <compute_visuals> to produce additional visualization results.
+ """
+ with torch.no_grad():
+ self.forward()
+ self.compute_visuals()
+
+ def compute_visuals(self):
+ """Calculate additional output images for visdom and HTML visualization"""
+ pass
+
+ def get_image_paths(self):
+ """ Return image paths that are used to load current data"""
+ return self.image_paths
+
+ def update_learning_rate(self):
+ """Update learning rates for all the networks; called at the end of every epoch"""
+ old_lr = self.optimizers[0].param_groups[0]['lr']
+ for scheduler in self.schedulers:
+ if self.opt.lr_policy == 'plateau':
+ scheduler.step(self.metric)
+ else:
+ scheduler.step()
+
+ lr = self.optimizers[0].param_groups[0]['lr']
+ print('learning rate %.7f -> %.7f' % (old_lr, lr))
+
+ def get_current_visuals(self):
+ """Return visualization images. train.py will display these images with visdom, and save the images to a HTML"""
+ visual_ret = OrderedDict()
+ for name in self.visual_names:
+ if isinstance(name, str):
+ visual_ret[name] = getattr(self, name)
+ return visual_ret
+
+ def get_current_losses(self):
+ """Return traning losses / errors. train.py will print out these errors on console, and save them to a file"""
+ errors_ret = OrderedDict()
+ for name in self.loss_names:
+ if isinstance(name, str):
+ errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number
+ return errors_ret
+
+ def save_networks(self, epoch):
+ """Save all the networks to the disk.
+
+ Parameters:
+ epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
+ """
+ for name in self.model_names:
+ if isinstance(name, str):
+ save_filename = '%s_net_%s.pth' % (epoch, name)
+ save_path = os.path.join(self.save_dir, save_filename)
+ net = getattr(self, 'net' + name)
+
+ if len(self.gpu_ids) > 0 and torch.cuda.is_available():
+ torch.save(net.module.cpu().state_dict(), save_path)
+ net.cuda(self.gpu_ids[0])
+ else:
+ torch.save(net.cpu().state_dict(), save_path)
+
+ def unload_network(self, name):
+ """Unload network and gc.
+ """
+ if isinstance(name, str):
+ net = getattr(self, 'net' + name)
+ del net
+ gc.collect()
+ torch_gc()
+ return None
+
+ def __patch_instance_norm_state_dict(self, state_dict, module, keys, i=0):
+ """Fix InstanceNorm checkpoints incompatibility (prior to 0.4)"""
+ key = keys[i]
+ if i + 1 == len(keys): # at the end, pointing to a parameter/buffer
+ if module.__class__.__name__.startswith('InstanceNorm') and \
+ (key == 'running_mean' or key == 'running_var'):
+ if getattr(module, key) is None:
+ state_dict.pop('.'.join(keys))
+ if module.__class__.__name__.startswith('InstanceNorm') and \
+ (key == 'num_batches_tracked'):
+ state_dict.pop('.'.join(keys))
+ else:
+ self.__patch_instance_norm_state_dict(state_dict, getattr(module, key), keys, i + 1)
+
+ def load_networks(self, epoch):
+ """Load all the networks from the disk.
+
+ Parameters:
+ epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
+ """
+ for name in self.model_names:
+ if isinstance(name, str):
+ load_filename = '%s_net_%s.pth' % (epoch, name)
+ load_path = os.path.join(self.save_dir, load_filename)
+ net = getattr(self, 'net' + name)
+ if isinstance(net, torch.nn.DataParallel):
+ net = net.module
+ # print('Loading depth boost model from %s' % load_path)
+ # if you are using PyTorch newer than 0.4 (e.g., built from
+ # GitHub source), you can remove str() on self.device
+ state_dict = torch.load(load_path, map_location=str(self.device))
+ if hasattr(state_dict, '_metadata'):
+ del state_dict._metadata
+
+ # patch InstanceNorm checkpoints prior to 0.4
+ for key in list(state_dict.keys()): # need to copy keys here because we mutate in loop
+ self.__patch_instance_norm_state_dict(state_dict, net, key.split('.'))
+ net.load_state_dict(state_dict)
+
+ def print_networks(self, verbose):
+ """Print the total number of parameters in the network and (if verbose) network architecture
+
+ Parameters:
+ verbose (bool) -- if verbose: print the network architecture
+ """
+ print('---------- Networks initialized -------------')
+ for name in self.model_names:
+ if isinstance(name, str):
+ net = getattr(self, 'net' + name)
+ num_params = 0
+ for param in net.parameters():
+ num_params += param.numel()
+ if verbose:
+ print(net)
+ print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6))
+ print('-----------------------------------------------')
+
+ def set_requires_grad(self, nets, requires_grad=False):
+ """Set requies_grad=Fasle for all the networks to avoid unnecessary computations
+ Parameters:
+ nets (network list) -- a list of networks
+ requires_grad (bool) -- whether the networks require gradients or not
+ """
+ if not isinstance(nets, list):
+ nets = [nets]
+ for net in nets:
+ if net is not None:
+ for param in net.parameters():
+ param.requires_grad = requires_grad
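+
+# Note: in this vendored copy only the inference-side API appears to be exercised; depthmap.py
+# calls set_input(), test() and get_current_visuals() on the pix2pix merge model, while the
+# training-oriented methods (optimize_parameters, save_networks, update_learning_rate) are unused there.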
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/base_model_hg.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/base_model_hg.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7fb3d313978dec164eff1452ed2986e5655d6b6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/base_model_hg.py
@@ -0,0 +1,58 @@
+import os
+import torch
+
+class BaseModelHG():
+ def name(self):
+ return 'BaseModel'
+
+ def initialize(self, opt):
+ self.opt = opt
+ self.gpu_ids = opt.gpu_ids
+ self.isTrain = opt.isTrain
+ self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor
+ self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)
+
+ def set_input(self, input):
+ self.input = input
+
+ def forward(self):
+ pass
+
+ # used in test time, no backprop
+ def test(self):
+ pass
+
+ def get_image_paths(self):
+ pass
+
+ def optimize_parameters(self):
+ pass
+
+ def get_current_visuals(self):
+ return self.input
+
+ def get_current_errors(self):
+ return {}
+
+ def save(self, label):
+ pass
+
+ # helper saving function that can be used by subclasses
+ def save_network(self, network, network_label, epoch_label, gpu_ids):
+ save_filename = '_%s_net_%s.pth' % (epoch_label, network_label)
+ save_path = os.path.join(self.save_dir, save_filename)
+ torch.save(network.cpu().state_dict(), save_path)
+ if len(gpu_ids) and torch.cuda.is_available():
+ network.cuda(device_id=gpu_ids[0])
+
+ # helper loading function that can be used by subclasses
+ def load_network(self, network, network_label, epoch_label):
+ save_filename = '%s_net_%s.pth' % (epoch_label, network_label)
+ save_path = os.path.join(self.save_dir, save_filename)
+ print(save_path)
+ model = torch.load(save_path)
+ return model
+ # network.load_state_dict(torch.load(save_path))
+
+ def update_learning_rate():
+ pass
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/networks.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3021ad53e8a6fef732d44b28f9d2f1424f0576d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/networks.py
@@ -0,0 +1,623 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+import functools
+from torch.optim import lr_scheduler
+
+
+###############################################################################
+# Helper Functions
+###############################################################################
+
+
+class Identity(nn.Module):
+ def forward(self, x):
+ return x
+
+
+def get_norm_layer(norm_type='instance'):
+ """Return a normalization layer
+
+ Parameters:
+ norm_type (str) -- the name of the normalization layer: batch | instance | none
+
+ For BatchNorm, we use learnable affine parameters and track running statistics (mean/stddev).
+ For InstanceNorm, we do not use learnable affine parameters. We do not track running statistics.
+ """
+ if norm_type == 'batch':
+ norm_layer = functools.partial(nn.BatchNorm2d, affine=True, track_running_stats=True)
+ elif norm_type == 'instance':
+ norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
+ elif norm_type == 'none':
+ def norm_layer(x): return Identity()
+ else:
+ raise NotImplementedError('normalization layer [%s] is not found' % norm_type)
+ return norm_layer
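+
+# For example, get_norm_layer('instance')(64) builds nn.InstanceNorm2d(64, affine=False,
+# track_running_stats=False), while get_norm_layer('none')(64) returns a pass-through Identity().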
+
+
+def get_scheduler(optimizer, opt):
+ """Return a learning rate scheduler
+
+ Parameters:
+ optimizer -- the optimizer of the network
+ opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.
+ opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine
+
+    For 'linear', we keep the same learning rate for the first <opt.n_epochs> epochs
+    and linearly decay the rate to zero over the next <opt.n_epochs_decay> epochs.
+ For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers.
+ See https://pytorch.org/docs/stable/optim.html for more details.
+ """
+ if opt.lr_policy == 'linear':
+ def lambda_rule(epoch):
+ lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.n_epochs) / float(opt.n_epochs_decay + 1)
+ return lr_l
+ scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
+ elif opt.lr_policy == 'step':
+ scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
+ elif opt.lr_policy == 'plateau':
+ scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5)
+ elif opt.lr_policy == 'cosine':
+ scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.n_epochs, eta_min=0)
+ else:
+        raise NotImplementedError('learning rate policy [%s] is not implemented' % opt.lr_policy)
+ return scheduler
+
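+# Illustrative usage sketch for get_scheduler (the `opt` fields and the
+# train_one_epoch helper below are assumptions, not values defined in this file):
+# with lr_policy='linear' the learning rate is held constant for the first
+# opt.n_epochs epochs and then decays linearly to zero over opt.n_epochs_decay
+# further epochs.
+#
+#   optimizer = torch.optim.Adam(net.parameters(), lr=2e-4)
+#   scheduler = get_scheduler(optimizer, opt)
+#   for epoch in range(opt.epoch_count, opt.n_epochs + opt.n_epochs_decay + 1):
+#       train_one_epoch(net, optimizer)   # hypothetical per-epoch training step
+#       scheduler.step()                  # apply lambda_rule / the chosen policy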
+
+def init_weights(net, init_type='normal', init_gain=0.02):
+ """Initialize network weights.
+
+ Parameters:
+ net (network) -- network to be initialized
+ init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
+ init_gain (float) -- scaling factor for normal, xavier and orthogonal.
+
+ We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might
+ work better for some applications. Feel free to try yourself.
+ """
+ def init_func(m): # define the initialization function
+ classname = m.__class__.__name__
+ if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
+ if init_type == 'normal':
+ init.normal_(m.weight.data, 0.0, init_gain)
+ elif init_type == 'xavier':
+ init.xavier_normal_(m.weight.data, gain=init_gain)
+ elif init_type == 'kaiming':
+ init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
+ elif init_type == 'orthogonal':
+ init.orthogonal_(m.weight.data, gain=init_gain)
+ else:
+ raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
+ if hasattr(m, 'bias') and m.bias is not None:
+ init.constant_(m.bias.data, 0.0)
+ elif classname.find('BatchNorm2d') != -1: # BatchNorm Layer's weight is not a matrix; only normal distribution applies.
+ init.normal_(m.weight.data, 1.0, init_gain)
+ init.constant_(m.bias.data, 0.0)
+
+ # print('initialize network with %s' % init_type)
+ net.apply(init_func) # apply the initialization function
+
+
+def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]):
+ """Initialize a network: 1. register CPU/GPU device (with multi-GPU support); 2. initialize the network weights
+ Parameters:
+ net (network) -- the network to be initialized
+ init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
+ gain (float) -- scaling factor for normal, xavier and orthogonal.
+ gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
+
+ Return an initialized network.
+ """
+ if len(gpu_ids) > 0:
+ assert(torch.cuda.is_available())
+ net.to(gpu_ids[0])
+ net = torch.nn.DataParallel(net, gpu_ids) # multi-GPUs
+ init_weights(net, init_type, init_gain=init_gain)
+ return net
+
+
+def define_G(input_nc, output_nc, ngf, netG, norm='batch', use_dropout=False, init_type='normal', init_gain=0.02, gpu_ids=[]):
+ """Create a generator
+
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ output_nc (int) -- the number of channels in output images
+ ngf (int) -- the number of filters in the last conv layer
+ netG (str) -- the architecture's name: resnet_9blocks | resnet_6blocks | unet_256 | unet_128
+ norm (str) -- the name of normalization layers used in the network: batch | instance | none
+ use_dropout (bool) -- if use dropout layers.
+ init_type (str) -- the name of our initialization method.
+ init_gain (float) -- scaling factor for normal, xavier and orthogonal.
+ gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
+
+ Returns a generator
+
+ Our current implementation provides two types of generators:
+ U-Net: [unet_128] (for 128x128 input images) and [unet_256] (for 256x256 input images)
+ The original U-Net paper: https://arxiv.org/abs/1505.04597
+
+ Resnet-based generator: [resnet_6blocks] (with 6 Resnet blocks) and [resnet_9blocks] (with 9 Resnet blocks)
+ Resnet-based generator consists of several Resnet blocks between a few downsampling/upsampling operations.
+ We adapt Torch code from Justin Johnson's neural style transfer project (https://github.com/jcjohnson/fast-neural-style).
+
+
+    The generator has been initialized by init_net. It uses ReLU for non-linearity.
+ """
+ net = None
+ norm_layer = get_norm_layer(norm_type=norm)
+
+ if netG == 'resnet_9blocks':
+ net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=9)
+ elif netG == 'resnet_6blocks':
+ net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=6)
+ elif netG == 'resnet_12blocks':
+ net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=12)
+ elif netG == 'unet_128':
+ net = UnetGenerator(input_nc, output_nc, 7, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
+ elif netG == 'unet_256':
+ net = UnetGenerator(input_nc, output_nc, 8, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
+ elif netG == 'unet_672':
+ net = UnetGenerator(input_nc, output_nc, 5, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
+ elif netG == 'unet_960':
+ net = UnetGenerator(input_nc, output_nc, 6, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
+ elif netG == 'unet_1024':
+ net = UnetGenerator(input_nc, output_nc, 10, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
+ else:
+ raise NotImplementedError('Generator model name [%s] is not recognized' % netG)
+ return init_net(net, init_type, init_gain, gpu_ids)
+
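+# Illustrative usage sketch for define_G (argument values here are assumptions,
+# not defaults taken from this file): build a 9-block ResNet generator mapping a
+# 3-channel image to a 3-channel image with instance normalization, on CPU.
+#
+#   net = define_G(input_nc=3, output_nc=3, ngf=64, netG='resnet_9blocks',
+#                  norm='instance', use_dropout=False, gpu_ids=[])
+#   fake = net(torch.randn(1, 3, 256, 256))   # output keeps the spatial size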
+
+def define_D(input_nc, ndf, netD, n_layers_D=3, norm='batch', init_type='normal', init_gain=0.02, gpu_ids=[]):
+ """Create a discriminator
+
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ ndf (int) -- the number of filters in the first conv layer
+ netD (str) -- the architecture's name: basic | n_layers | pixel
+ n_layers_D (int) -- the number of conv layers in the discriminator; effective when netD=='n_layers'
+ norm (str) -- the type of normalization layers used in the network.
+ init_type (str) -- the name of the initialization method.
+ init_gain (float) -- scaling factor for normal, xavier and orthogonal.
+ gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
+
+ Returns a discriminator
+
+ Our current implementation provides three types of discriminators:
+ [basic]: 'PatchGAN' classifier described in the original pix2pix paper.
+ It can classify whether 70×70 overlapping patches are real or fake.
+ Such a patch-level discriminator architecture has fewer parameters
+ than a full-image discriminator and can work on arbitrarily-sized images
+ in a fully convolutional fashion.
+
+ [n_layers]: With this mode, you can specify the number of conv layers in the discriminator
+        with the parameter n_layers_D (default=3, as used in [basic] (PatchGAN)).
+
+ [pixel]: 1x1 PixelGAN discriminator can classify whether a pixel is real or not.
+ It encourages greater color diversity but has no effect on spatial statistics.
+
+    The discriminator has been initialized by init_net. It uses LeakyReLU for non-linearity.
+ """
+ net = None
+ norm_layer = get_norm_layer(norm_type=norm)
+
+ if netD == 'basic': # default PatchGAN classifier
+ net = NLayerDiscriminator(input_nc, ndf, n_layers=3, norm_layer=norm_layer)
+ elif netD == 'n_layers': # more options
+ net = NLayerDiscriminator(input_nc, ndf, n_layers_D, norm_layer=norm_layer)
+ elif netD == 'pixel': # classify if each pixel is real or fake
+ net = PixelDiscriminator(input_nc, ndf, norm_layer=norm_layer)
+ else:
+ raise NotImplementedError('Discriminator model name [%s] is not recognized' % netD)
+ return init_net(net, init_type, init_gain, gpu_ids)
+
+
+##############################################################################
+# Classes
+##############################################################################
+class GANLoss(nn.Module):
+ """Define different GAN objectives.
+
+ The GANLoss class abstracts away the need to create the target label tensor
+ that has the same size as the input.
+ """
+
+ def __init__(self, gan_mode, target_real_label=1.0, target_fake_label=0.0):
+ """ Initialize the GANLoss class.
+
+ Parameters:
+ gan_mode (str) - - the type of GAN objective. It currently supports vanilla, lsgan, and wgangp.
+ target_real_label (bool) - - label for a real image
+ target_fake_label (bool) - - label of a fake image
+
+ Note: Do not use sigmoid as the last layer of Discriminator.
+ LSGAN needs no sigmoid. vanilla GANs will handle it with BCEWithLogitsLoss.
+ """
+ super(GANLoss, self).__init__()
+ self.register_buffer('real_label', torch.tensor(target_real_label))
+ self.register_buffer('fake_label', torch.tensor(target_fake_label))
+ self.gan_mode = gan_mode
+ if gan_mode == 'lsgan':
+ self.loss = nn.MSELoss()
+ elif gan_mode == 'vanilla':
+ self.loss = nn.BCEWithLogitsLoss()
+ elif gan_mode in ['wgangp']:
+ self.loss = None
+ else:
+ raise NotImplementedError('gan mode %s not implemented' % gan_mode)
+
+ def get_target_tensor(self, prediction, target_is_real):
+ """Create label tensors with the same size as the input.
+
+ Parameters:
+            prediction (tensor) - - typically the prediction from a discriminator
+ target_is_real (bool) - - if the ground truth label is for real images or fake images
+
+ Returns:
+ A label tensor filled with ground truth label, and with the size of the input
+ """
+
+ if target_is_real:
+ target_tensor = self.real_label
+ else:
+ target_tensor = self.fake_label
+ return target_tensor.expand_as(prediction)
+
+ def __call__(self, prediction, target_is_real):
+ """Calculate loss given Discriminator's output and grount truth labels.
+
+ Parameters:
+ prediction (tensor) - - tpyically the prediction output from a discriminator
+ target_is_real (bool) - - if the ground truth label is for real images or fake images
+
+ Returns:
+ the calculated loss.
+ """
+ if self.gan_mode in ['lsgan', 'vanilla']:
+ target_tensor = self.get_target_tensor(prediction, target_is_real)
+ loss = self.loss(prediction, target_tensor)
+ elif self.gan_mode == 'wgangp':
+ if target_is_real:
+ loss = -prediction.mean()
+ else:
+ loss = prediction.mean()
+ return loss
+
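+# Illustrative usage sketch for GANLoss (the prediction shape is an assumption;
+# any raw, un-sigmoided discriminator output works):
+#
+#   criterion = GANLoss('lsgan')
+#   pred = torch.randn(4, 1, 30, 30)      # e.g. a PatchGAN prediction map
+#   loss_real = criterion(pred, True)     # compare against the real label (1.0)
+#   loss_fake = criterion(pred, False)    # compare against the fake label (0.0)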
+
+def cal_gradient_penalty(netD, real_data, fake_data, device, type='mixed', constant=1.0, lambda_gp=10.0):
+ """Calculate the gradient penalty loss, used in WGAN-GP paper https://arxiv.org/abs/1704.00028
+
+ Arguments:
+ netD (network) -- discriminator network
+ real_data (tensor array) -- real images
+ fake_data (tensor array) -- generated images from the generator
+ device (str) -- GPU / CPU: from torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu')
+ type (str) -- if we mix real and fake data or not [real | fake | mixed].
+ constant (float) -- the constant used in formula ( ||gradient||_2 - constant)^2
+ lambda_gp (float) -- weight for this loss
+
+ Returns the gradient penalty loss
+ """
+ if lambda_gp > 0.0:
+ if type == 'real': # either use real images, fake images, or a linear interpolation of two.
+ interpolatesv = real_data
+ elif type == 'fake':
+ interpolatesv = fake_data
+ elif type == 'mixed':
+ alpha = torch.rand(real_data.shape[0], 1, device=device)
+ alpha = alpha.expand(real_data.shape[0], real_data.nelement() // real_data.shape[0]).contiguous().view(*real_data.shape)
+ interpolatesv = alpha * real_data + ((1 - alpha) * fake_data)
+ else:
+ raise NotImplementedError('{} not implemented'.format(type))
+ interpolatesv.requires_grad_(True)
+ disc_interpolates = netD(interpolatesv)
+ gradients = torch.autograd.grad(outputs=disc_interpolates, inputs=interpolatesv,
+ grad_outputs=torch.ones(disc_interpolates.size()).to(device),
+ create_graph=True, retain_graph=True, only_inputs=True)
+        gradients = gradients[0].view(real_data.size(0), -1) # flatten the data
+ gradient_penalty = (((gradients + 1e-16).norm(2, dim=1) - constant) ** 2).mean() * lambda_gp # added eps
+ return gradient_penalty, gradients
+ else:
+ return 0.0, None
+
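+# Illustrative WGAN-GP usage sketch (netD, real, fake and device are assumed to
+# exist elsewhere; the penalty is simply added to the critic loss):
+#
+#   gp, _ = cal_gradient_penalty(netD, real, fake.detach(), device, type='mixed')
+#   loss_D = netD(fake.detach()).mean() - netD(real).mean() + gp
+#   loss_D.backward()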
+
+class ResnetGenerator(nn.Module):
+ """Resnet-based generator that consists of Resnet blocks between a few downsampling/upsampling operations.
+
+ We adapt Torch code and idea from Justin Johnson's neural style transfer project(https://github.com/jcjohnson/fast-neural-style)
+ """
+
+ def __init__(self, input_nc, output_nc, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False, n_blocks=6, padding_type='reflect'):
+ """Construct a Resnet-based generator
+
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ output_nc (int) -- the number of channels in output images
+ ngf (int) -- the number of filters in the last conv layer
+ norm_layer -- normalization layer
+ use_dropout (bool) -- if use dropout layers
+ n_blocks (int) -- the number of ResNet blocks
+ padding_type (str) -- the name of padding layer in conv layers: reflect | replicate | zero
+ """
+ assert(n_blocks >= 0)
+ super(ResnetGenerator, self).__init__()
+ if type(norm_layer) == functools.partial:
+ use_bias = norm_layer.func == nn.InstanceNorm2d
+ else:
+ use_bias = norm_layer == nn.InstanceNorm2d
+
+ model = [nn.ReflectionPad2d(3),
+ nn.Conv2d(input_nc, ngf, kernel_size=7, padding=0, bias=use_bias),
+ norm_layer(ngf),
+ nn.ReLU(True)]
+
+ n_downsampling = 2
+ for i in range(n_downsampling): # add downsampling layers
+ mult = 2 ** i
+ model += [nn.Conv2d(ngf * mult, ngf * mult * 2, kernel_size=3, stride=2, padding=1, bias=use_bias),
+ norm_layer(ngf * mult * 2),
+ nn.ReLU(True)]
+
+ mult = 2 ** n_downsampling
+ for i in range(n_blocks): # add ResNet blocks
+
+ model += [ResnetBlock(ngf * mult, padding_type=padding_type, norm_layer=norm_layer, use_dropout=use_dropout, use_bias=use_bias)]
+
+ for i in range(n_downsampling): # add upsampling layers
+ mult = 2 ** (n_downsampling - i)
+ model += [nn.ConvTranspose2d(ngf * mult, int(ngf * mult / 2),
+ kernel_size=3, stride=2,
+ padding=1, output_padding=1,
+ bias=use_bias),
+ norm_layer(int(ngf * mult / 2)),
+ nn.ReLU(True)]
+ model += [nn.ReflectionPad2d(3)]
+ model += [nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
+ model += [nn.Tanh()]
+
+ self.model = nn.Sequential(*model)
+
+ def forward(self, input):
+ """Standard forward"""
+ return self.model(input)
+
+
+class ResnetBlock(nn.Module):
+ """Define a Resnet block"""
+
+ def __init__(self, dim, padding_type, norm_layer, use_dropout, use_bias):
+ """Initialize the Resnet block
+
+        A resnet block is a conv block with skip connections.
+        We construct a conv block with the build_conv_block function,
+        and implement skip connections in the forward function.
+ Original Resnet paper: https://arxiv.org/pdf/1512.03385.pdf
+ """
+ super(ResnetBlock, self).__init__()
+ self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, use_dropout, use_bias)
+
+ def build_conv_block(self, dim, padding_type, norm_layer, use_dropout, use_bias):
+ """Construct a convolutional block.
+
+ Parameters:
+ dim (int) -- the number of channels in the conv layer.
+ padding_type (str) -- the name of padding layer: reflect | replicate | zero
+ norm_layer -- normalization layer
+ use_dropout (bool) -- if use dropout layers.
+ use_bias (bool) -- if the conv layer uses bias or not
+
+ Returns a conv block (with a conv layer, a normalization layer, and a non-linearity layer (ReLU))
+ """
+ conv_block = []
+ p = 0
+ if padding_type == 'reflect':
+ conv_block += [nn.ReflectionPad2d(1)]
+ elif padding_type == 'replicate':
+ conv_block += [nn.ReplicationPad2d(1)]
+ elif padding_type == 'zero':
+ p = 1
+ else:
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
+
+ conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias), norm_layer(dim), nn.ReLU(True)]
+ if use_dropout:
+ conv_block += [nn.Dropout(0.5)]
+
+ p = 0
+ if padding_type == 'reflect':
+ conv_block += [nn.ReflectionPad2d(1)]
+ elif padding_type == 'replicate':
+ conv_block += [nn.ReplicationPad2d(1)]
+ elif padding_type == 'zero':
+ p = 1
+ else:
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
+ conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias), norm_layer(dim)]
+
+ return nn.Sequential(*conv_block)
+
+ def forward(self, x):
+ """Forward function (with skip connections)"""
+ out = x + self.conv_block(x) # add skip connections
+ return out
+
+
+class UnetGenerator(nn.Module):
+ """Create a Unet-based generator"""
+
+ def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False):
+ """Construct a Unet generator
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ output_nc (int) -- the number of channels in output images
+            num_downs (int) -- the number of downsamplings in UNet. For example, if |num_downs| == 7,
+                                an image of size 128x128 will become of size 1x1 at the bottleneck
+ ngf (int) -- the number of filters in the last conv layer
+ norm_layer -- normalization layer
+
+ We construct the U-Net from the innermost layer to the outermost layer.
+ It is a recursive process.
+ """
+ super(UnetGenerator, self).__init__()
+ # construct unet structure
+ unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True) # add the innermost layer
+ for i in range(num_downs - 5): # add intermediate layers with ngf * 8 filters
+ unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout)
+ # gradually reduce the number of filters from ngf * 8 to ngf
+ unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
+ unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
+ unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
+ self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer) # add the outermost layer
+
+ def forward(self, input):
+ """Standard forward"""
+ return self.model(input)
+
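+# Illustrative sketch of how num_downs relates to the input size (shapes are
+# assumptions): every UnetSkipConnectionBlock halves the spatial resolution, so
+# a 256x256 input needs num_downs=8 to reach a 1x1 bottleneck.
+#
+#   net = UnetGenerator(input_nc=3, output_nc=3, num_downs=8)
+#   out = net(torch.randn(1, 3, 256, 256))   # out keeps the 256x256 resolution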
+
+class UnetSkipConnectionBlock(nn.Module):
+ """Defines the Unet submodule with skip connection.
+ X -------------------identity----------------------
+ |-- downsampling -- |submodule| -- upsampling --|
+ """
+
+ def __init__(self, outer_nc, inner_nc, input_nc=None,
+ submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False):
+ """Construct a Unet submodule with skip connections.
+
+ Parameters:
+ outer_nc (int) -- the number of filters in the outer conv layer
+ inner_nc (int) -- the number of filters in the inner conv layer
+ input_nc (int) -- the number of channels in input images/features
+ submodule (UnetSkipConnectionBlock) -- previously defined submodules
+ outermost (bool) -- if this module is the outermost module
+ innermost (bool) -- if this module is the innermost module
+ norm_layer -- normalization layer
+ use_dropout (bool) -- if use dropout layers.
+ """
+ super(UnetSkipConnectionBlock, self).__init__()
+ self.outermost = outermost
+ if type(norm_layer) == functools.partial:
+ use_bias = norm_layer.func == nn.InstanceNorm2d
+ else:
+ use_bias = norm_layer == nn.InstanceNorm2d
+ if input_nc is None:
+ input_nc = outer_nc
+ downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4,
+ stride=2, padding=1, bias=use_bias)
+ downrelu = nn.LeakyReLU(0.2, True)
+ downnorm = norm_layer(inner_nc)
+ uprelu = nn.ReLU(True)
+ upnorm = norm_layer(outer_nc)
+
+ if outermost:
+ upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
+ kernel_size=4, stride=2,
+ padding=1)
+ down = [downconv]
+ up = [uprelu, upconv, nn.Tanh()]
+ model = down + [submodule] + up
+ elif innermost:
+ upconv = nn.ConvTranspose2d(inner_nc, outer_nc,
+ kernel_size=4, stride=2,
+ padding=1, bias=use_bias)
+ down = [downrelu, downconv]
+ up = [uprelu, upconv, upnorm]
+ model = down + up
+ else:
+ upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
+ kernel_size=4, stride=2,
+ padding=1, bias=use_bias)
+ down = [downrelu, downconv, downnorm]
+ up = [uprelu, upconv, upnorm]
+
+ if use_dropout:
+ model = down + [submodule] + up + [nn.Dropout(0.5)]
+ else:
+ model = down + [submodule] + up
+
+ self.model = nn.Sequential(*model)
+
+ def forward(self, x):
+ if self.outermost:
+ return self.model(x)
+ else: # add skip connections
+ return torch.cat([x, self.model(x)], 1)
+
+
+class NLayerDiscriminator(nn.Module):
+ """Defines a PatchGAN discriminator"""
+
+ def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d):
+ """Construct a PatchGAN discriminator
+
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ ndf (int) -- the number of filters in the last conv layer
+ n_layers (int) -- the number of conv layers in the discriminator
+ norm_layer -- normalization layer
+ """
+ super(NLayerDiscriminator, self).__init__()
+ if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
+ use_bias = norm_layer.func == nn.InstanceNorm2d
+ else:
+ use_bias = norm_layer == nn.InstanceNorm2d
+
+ kw = 4
+ padw = 1
+ sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
+ nf_mult = 1
+ nf_mult_prev = 1
+ for n in range(1, n_layers): # gradually increase the number of filters
+ nf_mult_prev = nf_mult
+ nf_mult = min(2 ** n, 8)
+ sequence += [
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
+ norm_layer(ndf * nf_mult),
+ nn.LeakyReLU(0.2, True)
+ ]
+
+ nf_mult_prev = nf_mult
+ nf_mult = min(2 ** n_layers, 8)
+ sequence += [
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
+ norm_layer(ndf * nf_mult),
+ nn.LeakyReLU(0.2, True)
+ ]
+
+ sequence += [nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
+ self.model = nn.Sequential(*sequence)
+
+ def forward(self, input):
+ """Standard forward."""
+ return self.model(input)
+
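+# Illustrative sketch (shapes are assumptions): with the default n_layers=3 this
+# behaves as the 70x70 PatchGAN and returns a one-channel map of per-patch
+# real/fake scores rather than a single scalar.
+#
+#   netD = NLayerDiscriminator(input_nc=6, ndf=64, n_layers=3)
+#   patch_scores = netD(torch.randn(1, 6, 256, 256))   # roughly (1, 1, 30, 30)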
+
+class PixelDiscriminator(nn.Module):
+ """Defines a 1x1 PatchGAN discriminator (pixelGAN)"""
+
+ def __init__(self, input_nc, ndf=64, norm_layer=nn.BatchNorm2d):
+ """Construct a 1x1 PatchGAN discriminator
+
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ ndf (int) -- the number of filters in the last conv layer
+ norm_layer -- normalization layer
+ """
+ super(PixelDiscriminator, self).__init__()
+ if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
+ use_bias = norm_layer.func == nn.InstanceNorm2d
+ else:
+ use_bias = norm_layer == nn.InstanceNorm2d
+
+ self.net = [
+ nn.Conv2d(input_nc, ndf, kernel_size=1, stride=1, padding=0),
+ nn.LeakyReLU(0.2, True),
+ nn.Conv2d(ndf, ndf * 2, kernel_size=1, stride=1, padding=0, bias=use_bias),
+ norm_layer(ndf * 2),
+ nn.LeakyReLU(0.2, True),
+ nn.Conv2d(ndf * 2, 1, kernel_size=1, stride=1, padding=0, bias=use_bias)]
+
+ self.net = nn.Sequential(*self.net)
+
+ def forward(self, input):
+ """Standard forward."""
+ return self.net(input)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/pix2pix4depth_model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/pix2pix4depth_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..44a331b085cbc2f3a820434c797032131ff74fcf
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/models/pix2pix4depth_model.py
@@ -0,0 +1,155 @@
+import torch
+from .base_model import BaseModel
+from . import networks
+
+
+class Pix2Pix4DepthModel(BaseModel):
+ """ This class implements the pix2pix model, for learning a mapping from input images to output images given paired data.
+
+ The model training requires '--dataset_mode aligned' dataset.
+ By default, it uses a '--netG unet256' U-Net generator,
+ a '--netD basic' discriminator (PatchGAN),
+    and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the original GAN paper).
+
+ pix2pix paper: https://arxiv.org/pdf/1611.07004.pdf
+ """
+ @staticmethod
+ def modify_commandline_options(parser, is_train=True):
+ """Add new dataset-specific options, and rewrite default values for existing options.
+
+ Parameters:
+ parser -- original option parser
+ is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.
+
+ Returns:
+ the modified parser.
+
+ For pix2pix, we do not use image buffer
+ The training objective is: GAN Loss + lambda_L1 * ||G(A)-B||_1
+ By default, we use vanilla GAN loss, UNet with batchnorm, and aligned datasets.
+ """
+ # changing the default values to match the pix2pix paper (https://phillipi.github.io/pix2pix/)
+ parser.set_defaults(input_nc=2,output_nc=1,norm='none', netG='unet_1024', dataset_mode='depthmerge')
+ if is_train:
+ parser.set_defaults(pool_size=0, gan_mode='vanilla',)
+ parser.add_argument('--lambda_L1', type=float, default=1000, help='weight for L1 loss')
+ return parser
+
+ def __init__(self, opt):
+ """Initialize the pix2pix class.
+
+ Parameters:
+ opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
+ """
+ BaseModel.__init__(self, opt)
+        # specify the training losses you want to print out. The training/test scripts will call BaseModel.get_current_errors
+
+ self.loss_names = ['G_GAN', 'G_L1', 'D_real', 'D_fake']
+ # self.loss_names = ['G_L1']
+
+        # specify the images you want to save/display. The training/test scripts will call BaseModel.get_current_visuals
+ if self.isTrain:
+ self.visual_names = ['outer','inner', 'fake_B', 'real_B']
+ else:
+ self.visual_names = ['fake_B']
+
+        # specify the models you want to save to the disk. The training/test scripts will call BaseModel.save_network and BaseModel.load_network
+ if self.isTrain:
+ self.model_names = ['G','D']
+ else: # during test time, only load G
+ self.model_names = ['G']
+
+ # define networks (both generator and discriminator)
+ self.netG = networks.define_G(opt.input_nc, opt.output_nc, 64, 'unet_1024', 'none',
+ False, 'normal', 0.02, self.gpu_ids)
+
+ if self.isTrain: # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
+ self.netD = networks.define_D(opt.input_nc + opt.output_nc, opt.ndf, opt.netD,
+ opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids)
+
+ if self.isTrain:
+ # define loss functions
+ self.criterionGAN = networks.GANLoss(opt.gan_mode).to(self.device)
+ self.criterionL1 = torch.nn.L1Loss()
+            # initialize optimizers; learning-rate schedulers can be created from these later (see networks.get_scheduler)
+ self.optimizer_G = torch.optim.Adam(self.netG.parameters(), lr=1e-4, betas=(opt.beta1, 0.999))
+ self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=2e-06, betas=(opt.beta1, 0.999))
+ self.optimizers.append(self.optimizer_G)
+ self.optimizers.append(self.optimizer_D)
+
+ def set_input_train(self, input):
+ self.outer = input['data_outer'].to(self.device)
+ self.outer = torch.nn.functional.interpolate(self.outer,(1024,1024),mode='bilinear',align_corners=False)
+
+ self.inner = input['data_inner'].to(self.device)
+ self.inner = torch.nn.functional.interpolate(self.inner,(1024,1024),mode='bilinear',align_corners=False)
+
+ self.image_paths = input['image_path']
+
+ if self.isTrain:
+ self.gtfake = input['data_gtfake'].to(self.device)
+ self.gtfake = torch.nn.functional.interpolate(self.gtfake, (1024, 1024), mode='bilinear', align_corners=False)
+ self.real_B = self.gtfake
+
+ self.real_A = torch.cat((self.outer, self.inner), 1)
+
+    def set_input(self, outer, inner):
+        # Wrap the two single-channel numpy depth maps as 1x1xHxW tensors.
+        inner = torch.from_numpy(inner).unsqueeze(0).unsqueeze(0)
+        outer = torch.from_numpy(outer).unsqueeze(0).unsqueeze(0)
+
+        # Min-max normalize each map to [0, 1] ...
+        inner = (inner - torch.min(inner))/(torch.max(inner)-torch.min(inner))
+        outer = (outer - torch.min(outer))/(torch.max(outer)-torch.min(outer))
+
+        # ... then rescale to [-1, 1] to match the network's expected range.
+        inner = self.normalize(inner)
+        outer = self.normalize(outer)
+
+        # Concatenate along the channel dimension to form the 2-channel input A.
+        self.real_A = torch.cat((outer, inner), 1).to(self.device)
+
+
+    def normalize(self, input):
+        """Map an input in [0, 1] to the [-1, 1] range used by the network."""
+        input = input * 2
+        input = input - 1
+        return input
+
+ def forward(self):
+ """Run forward pass; called by both functions and ."""
+ self.fake_B = self.netG(self.real_A) # G(A)
+
+ def backward_D(self):
+ """Calculate GAN loss for the discriminator"""
+ # Fake; stop backprop to the generator by detaching fake_B
+ fake_AB = torch.cat((self.real_A, self.fake_B), 1) # we use conditional GANs; we need to feed both input and output to the discriminator
+ pred_fake = self.netD(fake_AB.detach())
+ self.loss_D_fake = self.criterionGAN(pred_fake, False)
+ # Real
+ real_AB = torch.cat((self.real_A, self.real_B), 1)
+ pred_real = self.netD(real_AB)
+ self.loss_D_real = self.criterionGAN(pred_real, True)
+ # combine loss and calculate gradients
+ self.loss_D = (self.loss_D_fake + self.loss_D_real) * 0.5
+ self.loss_D.backward()
+
+ def backward_G(self):
+ """Calculate GAN and L1 loss for the generator"""
+ # First, G(A) should fake the discriminator
+ fake_AB = torch.cat((self.real_A, self.fake_B), 1)
+ pred_fake = self.netD(fake_AB)
+ self.loss_G_GAN = self.criterionGAN(pred_fake, True)
+ # Second, G(A) = B
+ self.loss_G_L1 = self.criterionL1(self.fake_B, self.real_B) * self.opt.lambda_L1
+ # combine loss and calculate gradients
+ self.loss_G = self.loss_G_L1 + self.loss_G_GAN
+ self.loss_G.backward()
+
+ def optimize_parameters(self):
+ self.forward() # compute fake images: G(A)
+ # update D
+ self.set_requires_grad(self.netD, True) # enable backprop for D
+ self.optimizer_D.zero_grad() # set D's gradients to zero
+ self.backward_D() # calculate gradients for D
+ self.optimizer_D.step() # update D's weights
+ # update G
+ self.set_requires_grad(self.netD, False) # D requires no gradients when optimizing G
+ self.optimizer_G.zero_grad() # set G's gradients to zero
+        self.backward_G()                   # calculate gradients for G
+        self.optimizer_G.step()             # update G's weights
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..06559aa558cf178b946c4523b28b098d1dfad606
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/__init__.py
@@ -0,0 +1 @@
+"""This package options includes option modules: training options, test options, and basic options (used in both training and test)."""
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/base_options.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/base_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..19c27091678f7014527bbad3a63c51657c9c72c4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/base_options.py
@@ -0,0 +1,156 @@
+import argparse
+import os
+from ...pix2pix.util import util
+# import torch
+from ...pix2pix import models
+# import pix2pix.data
+import numpy as np
+
+class BaseOptions():
+ """This class defines options used during both training and test time.
+
+ It also implements several helper functions such as parsing, printing, and saving the options.
+ It also gathers additional options defined in functions in both dataset class and model class.
+ """
+
+ def __init__(self):
+ """Reset the class; indicates the class hasn't been initailized"""
+ self.initialized = False
+
+ def initialize(self, parser):
+ """Define the common options that are used in both training and test."""
+ # basic parameters
+ parser.add_argument('--dataroot', help='path to images (should have subfolders trainA, trainB, valA, valB, etc)')
+ parser.add_argument('--name', type=str, default='void', help='mahdi_unet_new, scaled_unet')
+ parser.add_argument('--gpu_ids', type=str, default='0', help='gpu ids: e.g. 0 0,1,2, 0,2. use -1 for CPU')
+ parser.add_argument('--checkpoints_dir', type=str, default='./pix2pix/checkpoints', help='models are saved here')
+ # model parameters
+ parser.add_argument('--model', type=str, default='cycle_gan', help='chooses which model to use. [cycle_gan | pix2pix | test | colorization]')
+ parser.add_argument('--input_nc', type=int, default=2, help='# of input image channels: 3 for RGB and 1 for grayscale')
+ parser.add_argument('--output_nc', type=int, default=1, help='# of output image channels: 3 for RGB and 1 for grayscale')
+ parser.add_argument('--ngf', type=int, default=64, help='# of gen filters in the last conv layer')
+ parser.add_argument('--ndf', type=int, default=64, help='# of discrim filters in the first conv layer')
+ parser.add_argument('--netD', type=str, default='basic', help='specify discriminator architecture [basic | n_layers | pixel]. The basic model is a 70x70 PatchGAN. n_layers allows you to specify the layers in the discriminator')
+ parser.add_argument('--netG', type=str, default='resnet_9blocks', help='specify generator architecture [resnet_9blocks | resnet_6blocks | unet_256 | unet_128]')
+ parser.add_argument('--n_layers_D', type=int, default=3, help='only used if netD==n_layers')
+ parser.add_argument('--norm', type=str, default='instance', help='instance normalization or batch normalization [instance | batch | none]')
+ parser.add_argument('--init_type', type=str, default='normal', help='network initialization [normal | xavier | kaiming | orthogonal]')
+ parser.add_argument('--init_gain', type=float, default=0.02, help='scaling factor for normal, xavier and orthogonal.')
+ parser.add_argument('--no_dropout', action='store_true', help='no dropout for the generator')
+ # dataset parameters
+ parser.add_argument('--dataset_mode', type=str, default='unaligned', help='chooses how datasets are loaded. [unaligned | aligned | single | colorization]')
+ parser.add_argument('--direction', type=str, default='AtoB', help='AtoB or BtoA')
+ parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly')
+ parser.add_argument('--num_threads', default=4, type=int, help='# threads for loading data')
+ parser.add_argument('--batch_size', type=int, default=1, help='input batch size')
+ parser.add_argument('--load_size', type=int, default=672, help='scale images to this size')
+ parser.add_argument('--crop_size', type=int, default=672, help='then crop to this size')
+ parser.add_argument('--max_dataset_size', type=int, default=10000, help='Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded.')
+ parser.add_argument('--preprocess', type=str, default='resize_and_crop', help='scaling and cropping of images at load time [resize_and_crop | crop | scale_width | scale_width_and_crop | none]')
+ parser.add_argument('--no_flip', action='store_true', help='if specified, do not flip the images for data augmentation')
+ parser.add_argument('--display_winsize', type=int, default=256, help='display window size for both visdom and HTML')
+ # additional parameters
+ parser.add_argument('--epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model')
+ parser.add_argument('--load_iter', type=int, default='0', help='which iteration to load? if load_iter > 0, the code will load models by iter_[load_iter]; otherwise, the code will load models by [epoch]')
+ parser.add_argument('--verbose', action='store_true', help='if specified, print more debugging information')
+ parser.add_argument('--suffix', default='', type=str, help='customized suffix: opt.name = opt.name + suffix: e.g., {model}_{netG}_size{load_size}')
+
+ parser.add_argument('--data_dir', type=str, required=False,
+                            help='input files directory; images can be .png, .jpg, or .tiff')
+ parser.add_argument('--output_dir', type=str, required=False,
+                            help='result dir; result depth will be PNG, videos will be MJPG-encoded AVI')
+ parser.add_argument('--savecrops', type=int, required=False)
+ parser.add_argument('--savewholeest', type=int, required=False)
+ parser.add_argument('--output_resolution', type=int, required=False,
+ help='0 for no restriction 1 for resize to input size')
+ parser.add_argument('--net_receptive_field_size', type=int, required=False)
+ parser.add_argument('--pix2pixsize', type=int, required=False)
+ parser.add_argument('--generatevideo', type=int, required=False)
+        parser.add_argument('--depthNet', type=int, required=False, help='0: midas, 1: structuredRL')
+ parser.add_argument('--R0', action='store_true')
+ parser.add_argument('--R20', action='store_true')
+ parser.add_argument('--Final', action='store_true')
+ parser.add_argument('--colorize_results', action='store_true')
+ parser.add_argument('--max_res', type=float, default=np.inf)
+
+ self.initialized = True
+ return parser
+
+ def gather_options(self):
+ """Initialize our parser with basic options(only once).
+ Add additional model-specific and dataset-specific options.
+ These options are defined in the function
+ in model and dataset classes.
+ """
+ if not self.initialized: # check if it has been initialized
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = self.initialize(parser)
+
+ # get the basic options
+ opt, _ = parser.parse_known_args()
+
+ # modify model-related parser options
+ model_name = opt.model
+ model_option_setter = models.get_option_setter(model_name)
+ parser = model_option_setter(parser, self.isTrain)
+ opt, _ = parser.parse_known_args() # parse again with new defaults
+
+ # modify dataset-related parser options
+ # dataset_name = opt.dataset_mode
+ # dataset_option_setter = pix2pix.data.get_option_setter(dataset_name)
+ # parser = dataset_option_setter(parser, self.isTrain)
+
+ # save and return the parser
+ self.parser = parser
+ #return parser.parse_args() #EVIL
+ return opt
+
+ def print_options(self, opt):
+ """Print and save options
+
+        It will print both current options and default values (if different).
+        It will save options into a text file: [checkpoints_dir] / opt.txt
+ """
+ message = ''
+ message += '----------------- Options ---------------\n'
+ for k, v in sorted(vars(opt).items()):
+ comment = ''
+ default = self.parser.get_default(k)
+ if v != default:
+ comment = '\t[default: %s]' % str(default)
+ message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment)
+ message += '----------------- End -------------------'
+ print(message)
+
+ # save to the disk
+ expr_dir = os.path.join(opt.checkpoints_dir, opt.name)
+ util.mkdirs(expr_dir)
+ file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase))
+ with open(file_name, 'wt') as opt_file:
+ opt_file.write(message)
+ opt_file.write('\n')
+
+ def parse(self):
+ """Parse our options, create checkpoints directory suffix, and set up gpu device."""
+ opt = self.gather_options()
+ opt.isTrain = self.isTrain # train or test
+
+ # process opt.suffix
+ if opt.suffix:
+ suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else ''
+ opt.name = opt.name + suffix
+
+ #self.print_options(opt)
+
+ # set gpu ids
+ str_ids = opt.gpu_ids.split(',')
+ opt.gpu_ids = []
+ for str_id in str_ids:
+ id = int(str_id)
+ if id >= 0:
+ opt.gpu_ids.append(id)
+ #if len(opt.gpu_ids) > 0:
+ # torch.cuda.set_device(opt.gpu_ids[0])
+
+ self.opt = opt
+ return self.opt
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/test_options.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/test_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef95de28fbce210fb4addbf96308f128ea5124b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/options/test_options.py
@@ -0,0 +1,22 @@
+from .base_options import BaseOptions
+
+
+class TestOptions(BaseOptions):
+ """This class includes test options.
+
+ It also includes shared options defined in BaseOptions.
+ """
+
+ def initialize(self, parser):
+ parser = BaseOptions.initialize(self, parser) # define shared options
+ parser.add_argument('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images')
+ parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc')
+        # Dropout and BatchNorm have different behavior during training and test.
+ parser.add_argument('--eval', action='store_true', help='use eval mode during test time.')
+ parser.add_argument('--num_test', type=int, default=50, help='how many test images to run')
+        # rewrite default values
+ parser.set_defaults(model='pix2pix4depth')
+ # To avoid cropping, the load_size should be the same as crop_size
+ parser.set_defaults(load_size=parser.get_default('crop_size'))
+ self.isTrain = False
+ return parser
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/util/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b73b864dc3cac1425752b6e0f60da59a47094813
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/util/__init__.py
@@ -0,0 +1 @@
+"""This package includes a miscellaneous collection of useful helper functions."""
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/util/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/util/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..a688b7ae897431dca28d31d024e9191fe4d01402
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/leres/pix2pix/util/util.py
@@ -0,0 +1,105 @@
+"""This module contains simple helper functions """
+from __future__ import print_function
+import torch
+import numpy as np
+from PIL import Image
+import os
+
+
+def tensor2im(input_image, imtype=np.uint16):
+ """"Converts a Tensor array into a numpy image array.
+
+ Parameters:
+ input_image (tensor) -- the input image tensor array
+ imtype (type) -- the desired type of the converted numpy array
+ """
+ if not isinstance(input_image, np.ndarray):
+ if isinstance(input_image, torch.Tensor): # get the data from a variable
+ image_tensor = input_image.data
+ else:
+ return input_image
+ image_numpy = torch.squeeze(image_tensor).cpu().numpy() # convert it into a numpy array
+        image_numpy = (image_numpy + 1) / 2.0 * (2**16-1)  # map from [-1, 1] to [0, 65535]
+ else: # if it is a numpy array, do nothing
+ image_numpy = input_image
+ return image_numpy.astype(imtype)
+
+
+def diagnose_network(net, name='network'):
+ """Calculate and print the mean of average absolute(gradients)
+
+ Parameters:
+ net (torch network) -- Torch network
+ name (str) -- the name of the network
+ """
+ mean = 0.0
+ count = 0
+ for param in net.parameters():
+ if param.grad is not None:
+ mean += torch.mean(torch.abs(param.grad.data))
+ count += 1
+ if count > 0:
+ mean = mean / count
+ print(name)
+ print(mean)
+
+
+def save_image(image_numpy, image_path, aspect_ratio=1.0):
+ """Save a numpy image to the disk
+
+ Parameters:
+ image_numpy (numpy array) -- input numpy array
+ image_path (str) -- the path of the image
+ """
+ image_pil = Image.fromarray(image_numpy)
+
+ image_pil = image_pil.convert('I;16')
+
+ # image_pil = Image.fromarray(image_numpy)
+ # h, w, _ = image_numpy.shape
+ #
+ # if aspect_ratio > 1.0:
+ # image_pil = image_pil.resize((h, int(w * aspect_ratio)), Image.BICUBIC)
+ # if aspect_ratio < 1.0:
+ # image_pil = image_pil.resize((int(h / aspect_ratio), w), Image.BICUBIC)
+
+ image_pil.save(image_path)
+
+
+def print_numpy(x, val=True, shp=False):
+ """Print the mean, min, max, median, std, and size of a numpy array
+
+ Parameters:
+ val (bool) -- if print the values of the numpy array
+ shp (bool) -- if print the shape of the numpy array
+ """
+ x = x.astype(np.float64)
+ if shp:
+ print('shape,', x.shape)
+ if val:
+ x = x.flatten()
+ print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % (
+ np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x)))
+
+
+def mkdirs(paths):
+ """create empty directories if they don't exist
+
+ Parameters:
+ paths (str list) -- a list of directory paths
+ """
+ if isinstance(paths, list) and not isinstance(paths, str):
+ for path in paths:
+ mkdir(path)
+ else:
+ mkdir(paths)
+
+
+def mkdir(path):
+ """create a single empty directory if it didn't exist
+
+ Parameters:
+ path (str) -- a single directory path
+ """
+ if not os.path.exists(path):
+ os.makedirs(path)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..aa50aff0b88acf132dda74e1e8d4049fc3bee6a3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Caroline Chan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8823cc75a3ea953525eaa4192cb8e45a5eb9d4d2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart/__init__.py
@@ -0,0 +1,141 @@
+import os
+import warnings
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate, custom_hf_download, HF_MODEL_NAME
+
+norm_layer = nn.InstanceNorm2d
+
+
+class ResidualBlock(nn.Module):
+ def __init__(self, in_features):
+ super(ResidualBlock, self).__init__()
+
+ conv_block = [ nn.ReflectionPad2d(1),
+ nn.Conv2d(in_features, in_features, 3),
+ norm_layer(in_features),
+ nn.ReLU(inplace=True),
+ nn.ReflectionPad2d(1),
+ nn.Conv2d(in_features, in_features, 3),
+ norm_layer(in_features)
+ ]
+
+ self.conv_block = nn.Sequential(*conv_block)
+
+ def forward(self, x):
+ return x + self.conv_block(x)
+
+
+class Generator(nn.Module):
+ def __init__(self, input_nc, output_nc, n_residual_blocks=9, sigmoid=True):
+ super(Generator, self).__init__()
+
+ # Initial convolution block
+ model0 = [ nn.ReflectionPad2d(3),
+ nn.Conv2d(input_nc, 64, 7),
+ norm_layer(64),
+ nn.ReLU(inplace=True) ]
+ self.model0 = nn.Sequential(*model0)
+
+ # Downsampling
+ model1 = []
+ in_features = 64
+ out_features = in_features*2
+ for _ in range(2):
+ model1 += [ nn.Conv2d(in_features, out_features, 3, stride=2, padding=1),
+ norm_layer(out_features),
+ nn.ReLU(inplace=True) ]
+ in_features = out_features
+ out_features = in_features*2
+ self.model1 = nn.Sequential(*model1)
+
+ model2 = []
+ # Residual blocks
+ for _ in range(n_residual_blocks):
+ model2 += [ResidualBlock(in_features)]
+ self.model2 = nn.Sequential(*model2)
+
+ # Upsampling
+ model3 = []
+ out_features = in_features//2
+ for _ in range(2):
+ model3 += [ nn.ConvTranspose2d(in_features, out_features, 3, stride=2, padding=1, output_padding=1),
+ norm_layer(out_features),
+ nn.ReLU(inplace=True) ]
+ in_features = out_features
+ out_features = in_features//2
+ self.model3 = nn.Sequential(*model3)
+
+ # Output layer
+ model4 = [ nn.ReflectionPad2d(3),
+ nn.Conv2d(64, output_nc, 7)]
+ if sigmoid:
+ model4 += [nn.Sigmoid()]
+
+ self.model4 = nn.Sequential(*model4)
+
+ def forward(self, x, cond=None):
+ out = self.model0(x)
+ out = self.model1(out)
+ out = self.model2(out)
+ out = self.model3(out)
+ out = self.model4(out)
+
+ return out
+
+
+class LineartDetector:
+ def __init__(self, model, coarse_model):
+ self.model = model
+ self.model_coarse = coarse_model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="sk_model.pth", coarse_filename="sk_model2.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+ coarse_model_path = custom_hf_download(pretrained_model_or_path, coarse_filename)
+
+ model = Generator(3, 1, 3)
+ model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+ model.eval()
+
+ coarse_model = Generator(3, 1, 3)
+ coarse_model.load_state_dict(torch.load(coarse_model_path, map_location=torch.device('cpu')))
+ coarse_model.eval()
+
+ return cls(model, coarse_model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.model_coarse.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, coarse=False, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ model = self.model_coarse if coarse else self.model
+ assert detected_map.ndim == 3
+ with torch.no_grad():
+ image = torch.from_numpy(detected_map).float().to(self.device)
+ image = image / 255.0
+ image = rearrange(image, 'h w c -> 1 c h w')
+ line = model(image)[0][0]
+
+ line = line.cpu().numpy()
+ line = (line * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = HWC3(line)
+ detected_map = remove_pad(255 - detected_map)
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
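+
+
+# Illustrative usage sketch (assumes the default checkpoints can be fetched via
+# custom_hf_download and that a CUDA device is available):
+#
+#   detector = LineartDetector.from_pretrained().to("cuda")
+#   line_image = detector(Image.open("input.png"), coarse=False)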
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_anime/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_anime/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..aa50aff0b88acf132dda74e1e8d4049fc3bee6a3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_anime/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Caroline Chan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_anime/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_anime/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b060c6b81345a07cc07ca8d3d05dacc0916f68c6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_anime/__init__.py
@@ -0,0 +1,167 @@
+import functools
+import os
+import warnings
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from huggingface_hub import hf_hub_download
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate, custom_hf_download, HF_MODEL_NAME
+
+
+class UnetGenerator(nn.Module):
+ """Create a Unet-based generator"""
+
+ def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False):
+ """Construct a Unet generator
+ Parameters:
+ input_nc (int) -- the number of channels in input images
+ output_nc (int) -- the number of channels in output images
+            num_downs (int) -- the number of downsamplings in UNet. For example, if |num_downs| == 7,
+                                an image of size 128x128 will become of size 1x1 at the bottleneck
+ ngf (int) -- the number of filters in the last conv layer
+ norm_layer -- normalization layer
+ We construct the U-Net from the innermost layer to the outermost layer.
+ It is a recursive process.
+ """
+ super(UnetGenerator, self).__init__()
+ # construct unet structure
+ unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True) # add the innermost layer
+ for _ in range(num_downs - 5): # add intermediate layers with ngf * 8 filters
+ unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout)
+ # gradually reduce the number of filters from ngf * 8 to ngf
+ unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
+ unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
+ unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
+ self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer) # add the outermost layer
+
+ def forward(self, input):
+ """Standard forward"""
+ return self.model(input)
+
+
+class UnetSkipConnectionBlock(nn.Module):
+ """Defines the Unet submodule with skip connection.
+ X -------------------identity----------------------
+ |-- downsampling -- |submodule| -- upsampling --|
+ """
+
+ def __init__(self, outer_nc, inner_nc, input_nc=None,
+ submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False):
+ """Construct a Unet submodule with skip connections.
+ Parameters:
+ outer_nc (int) -- the number of filters in the outer conv layer
+ inner_nc (int) -- the number of filters in the inner conv layer
+ input_nc (int) -- the number of channels in input images/features
+ submodule (UnetSkipConnectionBlock) -- previously defined submodules
+ outermost (bool) -- if this module is the outermost module
+ innermost (bool) -- if this module is the innermost module
+ norm_layer -- normalization layer
+ use_dropout (bool) -- if use dropout layers.
+ """
+ super(UnetSkipConnectionBlock, self).__init__()
+ self.outermost = outermost
+ if type(norm_layer) == functools.partial:
+ use_bias = norm_layer.func == nn.InstanceNorm2d
+ else:
+ use_bias = norm_layer == nn.InstanceNorm2d
+ if input_nc is None:
+ input_nc = outer_nc
+ downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4,
+ stride=2, padding=1, bias=use_bias)
+ downrelu = nn.LeakyReLU(0.2, True)
+ downnorm = norm_layer(inner_nc)
+ uprelu = nn.ReLU(True)
+ upnorm = norm_layer(outer_nc)
+
+ if outermost:
+ upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
+ kernel_size=4, stride=2,
+ padding=1)
+ down = [downconv]
+ up = [uprelu, upconv, nn.Tanh()]
+ model = down + [submodule] + up
+ elif innermost:
+ upconv = nn.ConvTranspose2d(inner_nc, outer_nc,
+ kernel_size=4, stride=2,
+ padding=1, bias=use_bias)
+ down = [downrelu, downconv]
+ up = [uprelu, upconv, upnorm]
+ model = down + up
+ else:
+ upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
+ kernel_size=4, stride=2,
+ padding=1, bias=use_bias)
+ down = [downrelu, downconv, downnorm]
+ up = [uprelu, upconv, upnorm]
+
+ if use_dropout:
+ model = down + [submodule] + up + [nn.Dropout(0.5)]
+ else:
+ model = down + [submodule] + up
+
+ self.model = nn.Sequential(*model)
+
+ def forward(self, x):
+ if self.outermost:
+ return self.model(x)
+ else: # add skip connections
+ return torch.cat([x, self.model(x)], 1)
+
+
+class LineartAnimeDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="netG.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
+ net = UnetGenerator(3, 1, 8, 64, norm_layer=norm_layer, use_dropout=False)
+ ckpt = torch.load(model_path)
+ for key in list(ckpt.keys()):
+ if 'module.' in key:
+ ckpt[key.replace('module.', '')] = ckpt[key]
+ del ckpt[key]
+ net.load_state_dict(ckpt)
+ net.eval()
+
+ return cls(net)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ H, W, C = input_image.shape
+ Hn = 256 * int(np.ceil(float(H) / 256.0))
+ Wn = 256 * int(np.ceil(float(W) / 256.0))
+ input_image = cv2.resize(input_image, (Wn, Hn), interpolation=cv2.INTER_CUBIC)
+
+ with torch.no_grad():
+ image_feed = torch.from_numpy(input_image).float().to(self.device)
+ image_feed = image_feed / 127.5 - 1.0
+ image_feed = rearrange(image_feed, 'h w c -> 1 c h w')
+
+ line = self.model(image_feed)[0, 0] * 127.5 + 127.5
+ line = line.cpu().numpy()
+ line = line.clip(0, 255).astype(np.uint8)
+
+ # A1111 uses INTER_AREA for downscaling, so it is likely the best choice here as well
+ detected_map = cv2.resize(HWC3(line), (W, H), interpolation=cv2.INTER_AREA)
+ detected_map = remove_pad(255 - detected_map)
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
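+
+
+# Usage sketch (hedged): a minimal way to run this detector on an RGB numpy array.
+# The default HF checkpoint is assumed to be downloadable; "input.png" and the
+# output filename are illustrative placeholders.
+if __name__ == "__main__":
+    detector = LineartAnimeDetector.from_pretrained()
+    detector = detector.to("cuda" if torch.cuda.is_available() else "cpu")
+    bgr = cv2.imread("input.png")
+    if bgr is not None:
+        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        detector(rgb, detect_resolution=512, output_type="pil").save("lineart_anime_out.png")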
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_standard/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_standard/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d705e39c078553193f98cd749abbd8060136c1d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/lineart_standard/__init__.py
@@ -0,0 +1,21 @@
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import resize_image_with_pad, common_input_validate, HWC3
+
+class LineartStandardDetector:
+ def __call__(self, input_image=None, guassian_sigma=6.0, intensity_threshold=8, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ x = input_image.astype(np.float32)
+ g = cv2.GaussianBlur(x, (0, 0), guassian_sigma)
+ intensity = np.min(g - x, axis=2).clip(0, 255)
+ intensity /= max(16, np.median(intensity[intensity > intensity_threshold]))
+ intensity *= 127
+ detected_map = intensity.clip(0, 255).astype(np.uint8)
+
+ detected_map = HWC3(remove_pad(detected_map))
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+ return detected_map
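+
+
+# Usage sketch (hedged): this detector needs no model download, so a quick smoke
+# test can run on a synthetic image; the random array below is illustrative only.
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    noise = (rng.random((512, 512, 3)) * 255).astype(np.uint8)
+    out = LineartStandardDetector()(noise, guassian_sigma=6.0, intensity_threshold=8, output_type="pil")
+    out.save("lineart_standard_out.png")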
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..15dd44609392180e39d9a715cd7f126565412054
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Miaomiao Li
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..68d50feb7f3ad75db90a9fa159a3a73c1094673d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/__init__.py
@@ -0,0 +1,63 @@
+# MangaLineExtraction_PyTorch
+# https://github.com/ljsabc/MangaLineExtraction_PyTorch
+
+#NOTE: This preprocessor is designed to work with lineart_anime ControlNet so the result will be white lines on black canvas
+
+import torch
+import numpy as np
+import os
+import cv2
+from einops import rearrange
+from .model_torch import res_skip
+from PIL import Image
+import warnings
+
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate, custom_hf_download, HF_MODEL_NAME
+
+class LineartMangaDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="erika.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ net = res_skip()
+ ckpt = torch.load(model_path)
+ for key in list(ckpt.keys()):
+ if 'module.' in key:
+ ckpt[key.replace('module.', '')] = ckpt[key]
+ del ckpt[key]
+ net.load_state_dict(ckpt)
+ net.eval()
+ return cls(net)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, 256 * int(np.ceil(float(detect_resolution) / 256.0)), upscale_method)
+
+ img = cv2.cvtColor(detected_map, cv2.COLOR_RGB2GRAY)
+ with torch.no_grad():
+ image_feed = torch.from_numpy(img).float().to(self.device)
+ image_feed = rearrange(image_feed, 'h w -> 1 1 h w')
+
+ line = self.model(image_feed)
+ line = line.cpu().numpy()[0,0,:,:]
+ line[line > 255] = 255
+ line[line < 0] = 0
+
+ line = line.astype(np.uint8)
+
+ detected_map = HWC3(line)
+ detected_map = remove_pad(255 - detected_map)
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
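+
+
+# Usage sketch (hedged): mirrors the other line-art detectors; "input.png" and the
+# output filename are illustrative placeholders, and erika.pth comes from the
+# default HF repo used by from_pretrained().
+if __name__ == "__main__":
+    detector = LineartMangaDetector.from_pretrained().to("cpu")
+    bgr = cv2.imread("input.png")
+    if bgr is not None:
+        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        detector(rgb, detect_resolution=512, output_type="pil").save("lineart_manga_out.png")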
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/model_torch.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/model_torch.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0c8913445fc39f72917aecec144a8c0800f63f4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/manga_line/model_torch.py
@@ -0,0 +1,196 @@
+import torch.nn as nn
+import numpy as np
+
+#torch.set_printoptions(precision=10)
+
+
+class _bn_relu_conv(nn.Module):
+ def __init__(self, in_filters, nb_filters, fw, fh, subsample=1):
+ super(_bn_relu_conv, self).__init__()
+ self.model = nn.Sequential(
+ nn.BatchNorm2d(in_filters, eps=1e-3),
+ nn.LeakyReLU(0.2),
+ nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2), padding_mode='zeros')
+ )
+
+ def forward(self, x):
+ return self.model(x)
+
+ # Unreachable debug printout (it sits after the early return above); kept
+ # commented out for reference only.
+ # print("****", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape)
+ # for i, layer in enumerate(self.model):
+ #     x = layer(x)
+ #     if i == 2:
+ #         print("____", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape)
+ #         print(x[0])
+ # return x
+
+
+class _u_bn_relu_conv(nn.Module):
+ def __init__(self, in_filters, nb_filters, fw, fh, subsample=1):
+ super(_u_bn_relu_conv, self).__init__()
+ self.model = nn.Sequential(
+ nn.BatchNorm2d(in_filters, eps=1e-3),
+ nn.LeakyReLU(0.2),
+ nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2)),
+ nn.Upsample(scale_factor=2, mode='nearest')
+ )
+
+ def forward(self, x):
+ return self.model(x)
+
+
+
+class _shortcut(nn.Module):
+ def __init__(self, in_filters, nb_filters, subsample=1):
+ super(_shortcut, self).__init__()
+ self.process = False
+ self.model = None
+ if in_filters != nb_filters or subsample != 1:
+ self.process = True
+ self.model = nn.Sequential(
+ nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample)
+ )
+
+ def forward(self, x, y):
+ #print(x.size(), y.size(), self.process)
+ if self.process:
+ y0 = self.model(x)
+ #print("merge+", torch.max(y0+y), torch.min(y0+y),torch.mean(y0+y), torch.std(y0+y), y0.shape)
+ return y0 + y
+ else:
+ #print("merge", torch.max(x+y), torch.min(x+y),torch.mean(x+y), torch.std(x+y), y.shape)
+ return x + y
+
+class _u_shortcut(nn.Module):
+ def __init__(self, in_filters, nb_filters, subsample):
+ super(_u_shortcut, self).__init__()
+ self.process = False
+ self.model = None
+ if in_filters != nb_filters:
+ self.process = True
+ self.model = nn.Sequential(
+ nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample, padding_mode='zeros'),
+ nn.Upsample(scale_factor=2, mode='nearest')
+ )
+
+ def forward(self, x, y):
+ if self.process:
+ return self.model(x) + y
+ else:
+ return x + y
+
+
+class basic_block(nn.Module):
+ def __init__(self, in_filters, nb_filters, init_subsample=1):
+ super(basic_block, self).__init__()
+ self.conv1 = _bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample)
+ self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3)
+ self.shortcut = _shortcut(in_filters, nb_filters, subsample=init_subsample)
+
+ def forward(self, x):
+ x1 = self.conv1(x)
+ x2 = self.residual(x1)
+ return self.shortcut(x, x2)
+
+class _u_basic_block(nn.Module):
+ def __init__(self, in_filters, nb_filters, init_subsample=1):
+ super(_u_basic_block, self).__init__()
+ self.conv1 = _u_bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample)
+ self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3)
+ self.shortcut = _u_shortcut(in_filters, nb_filters, subsample=init_subsample)
+
+ def forward(self, x):
+ y = self.residual(self.conv1(x))
+ return self.shortcut(x, y)
+
+
+class _residual_block(nn.Module):
+ def __init__(self, in_filters, nb_filters, repetitions, is_first_layer=False):
+ super(_residual_block, self).__init__()
+ layers = []
+ for i in range(repetitions):
+ init_subsample = 1
+ if i == repetitions - 1 and not is_first_layer:
+ init_subsample = 2
+ if i == 0:
+ l = basic_block(in_filters=in_filters, nb_filters=nb_filters, init_subsample=init_subsample)
+ else:
+ l = basic_block(in_filters=nb_filters, nb_filters=nb_filters, init_subsample=init_subsample)
+ layers.append(l)
+
+ self.model = nn.Sequential(*layers)
+
+ def forward(self, x):
+ return self.model(x)
+
+
+class _upsampling_residual_block(nn.Module):
+ def __init__(self, in_filters, nb_filters, repetitions):
+ super(_upsampling_residual_block, self).__init__()
+ layers = []
+ for i in range(repetitions):
+ l = None
+ if i == 0:
+ l = _u_basic_block(in_filters=in_filters, nb_filters=nb_filters)#(input)
+ else:
+ l = basic_block(in_filters=nb_filters, nb_filters=nb_filters)#(input)
+ layers.append(l)
+
+ self.model = nn.Sequential(*layers)
+
+ def forward(self, x):
+ return self.model(x)
+
+
+class res_skip(nn.Module):
+
+ def __init__(self):
+ super(res_skip, self).__init__()
+ self.block0 = _residual_block(in_filters=1, nb_filters=24, repetitions=2, is_first_layer=True)#(input)
+ self.block1 = _residual_block(in_filters=24, nb_filters=48, repetitions=3)#(block0)
+ self.block2 = _residual_block(in_filters=48, nb_filters=96, repetitions=5)#(block1)
+ self.block3 = _residual_block(in_filters=96, nb_filters=192, repetitions=7)#(block2)
+ self.block4 = _residual_block(in_filters=192, nb_filters=384, repetitions=12)#(block3)
+
+ self.block5 = _upsampling_residual_block(in_filters=384, nb_filters=192, repetitions=7)#(block4)
+ self.res1 = _shortcut(in_filters=192, nb_filters=192)#(block3, block5, subsample=(1,1))
+
+ self.block6 = _upsampling_residual_block(in_filters=192, nb_filters=96, repetitions=5)#(res1)
+ self.res2 = _shortcut(in_filters=96, nb_filters=96)#(block2, block6, subsample=(1,1))
+
+ self.block7 = _upsampling_residual_block(in_filters=96, nb_filters=48, repetitions=3)#(res2)
+ self.res3 = _shortcut(in_filters=48, nb_filters=48)#(block1, block7, subsample=(1,1))
+
+ self.block8 = _upsampling_residual_block(in_filters=48, nb_filters=24, repetitions=2)#(res3)
+ self.res4 = _shortcut(in_filters=24, nb_filters=24)#(block0,block8, subsample=(1,1))
+
+ self.block9 = _residual_block(in_filters=24, nb_filters=16, repetitions=2, is_first_layer=True)#(res4)
+ self.conv15 = _bn_relu_conv(in_filters=16, nb_filters=1, fh=1, fw=1, subsample=1)#(block9)
+
+ def forward(self, x):
+ x0 = self.block0(x)
+ x1 = self.block1(x0)
+ x2 = self.block2(x1)
+ x3 = self.block3(x2)
+ x4 = self.block4(x3)
+
+ x5 = self.block5(x4)
+ res1 = self.res1(x3, x5)
+
+ x6 = self.block6(res1)
+ res2 = self.res2(x2, x6)
+
+ x7 = self.block7(res2)
+ res3 = self.res3(x1, x7)
+
+ x8 = self.block8(res3)
+ res4 = self.res4(x0, x8)
+
+ x9 = self.block9(res4)
+ y = self.conv15(x9)
+
+ return y
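+
+
+# Sanity-check sketch (hedged, not part of the original model file): res_skip
+# downsamples four times by a factor of 2, so any input whose sides are multiples
+# of 16 should come back at the same resolution as a single-channel line map.
+if __name__ == "__main__":
+    import torch
+    net = res_skip().eval()
+    with torch.no_grad():
+        y = net(torch.randn(1, 1, 256, 256))
+    assert y.shape == (1, 1, 256, 256)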
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mediapipe_face/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mediapipe_face/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d99fa28e4015a5f9ad8157ca5d8f1e9d69d3dde2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mediapipe_face/__init__.py
@@ -0,0 +1,31 @@
+import warnings
+from typing import Union
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad
+from .mediapipe_face_common import generate_annotation
+
+
+class MediapipeFaceDetector:
+ def __call__(self,
+ input_image: Union[np.ndarray, Image.Image] = None,
+ max_faces: int = 1,
+ min_confidence: float = 0.5,
+ output_type: str = "pil",
+ detect_resolution: int = 512,
+ image_resolution: int = 512,
+ upscale_method="INTER_CUBIC",
+ **kwargs):
+
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ detected_map = generate_annotation(detected_map, max_faces, min_confidence)
+ detected_map = remove_pad(HWC3(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
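+
+
+# Usage sketch (hedged): the detector is stateless, so a single call is enough.
+# "portrait.png" and the output filename are illustrative placeholders.
+if __name__ == "__main__":
+    bgr = cv2.imread("portrait.png")
+    if bgr is not None:
+        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        annotation = MediapipeFaceDetector()(rgb, max_faces=2, min_confidence=0.5, output_type="pil")
+        annotation.save("mediapipe_face_out.png")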
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mediapipe_face/mediapipe_face_common.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mediapipe_face/mediapipe_face_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..baade5028aefa98ac3d511c7cc157577297b82a7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mediapipe_face/mediapipe_face_common.py
@@ -0,0 +1,156 @@
+from typing import Mapping
+import warnings
+
+import mediapipe as mp
+import numpy
+
+if mp:
+ mp_drawing = mp.solutions.drawing_utils
+ mp_drawing_styles = mp.solutions.drawing_styles
+ mp_face_detection = mp.solutions.face_detection # Only for counting faces.
+ mp_face_mesh = mp.solutions.face_mesh
+ mp_face_connections = mp.solutions.face_mesh_connections.FACEMESH_TESSELATION
+ mp_hand_connections = mp.solutions.hands_connections.HAND_CONNECTIONS
+ mp_body_connections = mp.solutions.pose_connections.POSE_CONNECTIONS
+
+ DrawingSpec = mp.solutions.drawing_styles.DrawingSpec
+ PoseLandmark = mp.solutions.drawing_styles.PoseLandmark
+
+ min_face_size_pixels: int = 64
+ f_thick = 2
+ f_rad = 1
+ right_iris_draw = DrawingSpec(color=(10, 200, 250), thickness=f_thick, circle_radius=f_rad)
+ right_eye_draw = DrawingSpec(color=(10, 200, 180), thickness=f_thick, circle_radius=f_rad)
+ right_eyebrow_draw = DrawingSpec(color=(10, 220, 180), thickness=f_thick, circle_radius=f_rad)
+ left_iris_draw = DrawingSpec(color=(250, 200, 10), thickness=f_thick, circle_radius=f_rad)
+ left_eye_draw = DrawingSpec(color=(180, 200, 10), thickness=f_thick, circle_radius=f_rad)
+ left_eyebrow_draw = DrawingSpec(color=(180, 220, 10), thickness=f_thick, circle_radius=f_rad)
+ mouth_draw = DrawingSpec(color=(10, 180, 10), thickness=f_thick, circle_radius=f_rad)
+ head_draw = DrawingSpec(color=(10, 200, 10), thickness=f_thick, circle_radius=f_rad)
+
+ # mp_face_mesh.FACEMESH_CONTOURS has all the items we care about.
+ face_connection_spec = {}
+ for edge in mp_face_mesh.FACEMESH_FACE_OVAL:
+ face_connection_spec[edge] = head_draw
+ for edge in mp_face_mesh.FACEMESH_LEFT_EYE:
+ face_connection_spec[edge] = left_eye_draw
+ for edge in mp_face_mesh.FACEMESH_LEFT_EYEBROW:
+ face_connection_spec[edge] = left_eyebrow_draw
+ # for edge in mp_face_mesh.FACEMESH_LEFT_IRIS:
+ # face_connection_spec[edge] = left_iris_draw
+ for edge in mp_face_mesh.FACEMESH_RIGHT_EYE:
+ face_connection_spec[edge] = right_eye_draw
+ for edge in mp_face_mesh.FACEMESH_RIGHT_EYEBROW:
+ face_connection_spec[edge] = right_eyebrow_draw
+ # for edge in mp_face_mesh.FACEMESH_RIGHT_IRIS:
+ # face_connection_spec[edge] = right_iris_draw
+ for edge in mp_face_mesh.FACEMESH_LIPS:
+ face_connection_spec[edge] = mouth_draw
+ iris_landmark_spec = {468: right_iris_draw, 473: left_iris_draw}
+
+
+def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2):
+ """We have a custom function to draw the pupils because the mp.draw_landmarks method requires a parameter for all
+ landmarks. Until our PR is merged into mediapipe, we need this separate method."""
+ if len(image.shape) != 3:
+ raise ValueError("Input image must be H,W,C.")
+ image_rows, image_cols, image_channels = image.shape
+ if image_channels != 3: # BGR channels
+ raise ValueError('Input image must contain three channel bgr data.')
+ for idx, landmark in enumerate(landmark_list.landmark):
+ if (
+ (landmark.HasField('visibility') and landmark.visibility < 0.9) or
+ (landmark.HasField('presence') and landmark.presence < 0.5)
+ ):
+ continue
+ if landmark.x >= 1.0 or landmark.x < 0 or landmark.y >= 1.0 or landmark.y < 0:
+ continue
+ image_x = int(image_cols*landmark.x)
+ image_y = int(image_rows*landmark.y)
+ draw_color = None
+ if isinstance(drawing_spec, Mapping):
+ if drawing_spec.get(idx) is None:
+ continue
+ else:
+ draw_color = drawing_spec[idx].color
+ elif isinstance(drawing_spec, DrawingSpec):
+ draw_color = drawing_spec.color
+ image[image_y-halfwidth:image_y+halfwidth, image_x-halfwidth:image_x+halfwidth, :] = draw_color
+
+
+def reverse_channels(image):
+ """Given a numpy array in RGB form, convert to BGR. Will also convert from BGR to RGB."""
+ # im[:,:,::-1] is a neat hack to convert BGR to RGB by reversing the indexing order.
+ # im[:,:,::[2,1,0]] would also work but makes a copy of the data.
+ return image[:, :, ::-1]
+
+
+def generate_annotation(
+ img_rgb,
+ max_faces: int,
+ min_confidence: float
+):
+ """
+ Find up to 'max_faces' inside the provided input image.
+ If min_face_size_pixels is provided and nonzero it will be used to filter faces that occupy less than this many
+ pixels in the image.
+ """
+ with mp_face_mesh.FaceMesh(
+ static_image_mode=True,
+ max_num_faces=max_faces,
+ refine_landmarks=True,
+ min_detection_confidence=min_confidence,
+ ) as facemesh:
+ img_height, img_width, img_channels = img_rgb.shape
+ assert(img_channels == 3)
+
+ results = facemesh.process(img_rgb).multi_face_landmarks
+
+ if results is None:
+ print("No faces detected in controlnet image for Mediapipe face annotator.")
+ return numpy.zeros_like(img_rgb)
+
+ # Filter faces that are too small
+ filtered_landmarks = []
+ for lm in results:
+ landmarks = lm.landmark
+ face_rect = [
+ landmarks[0].x,
+ landmarks[0].y,
+ landmarks[0].x,
+ landmarks[0].y,
+ ] # Left, up, right, down.
+ for i in range(len(landmarks)):
+ face_rect[0] = min(face_rect[0], landmarks[i].x)
+ face_rect[1] = min(face_rect[1], landmarks[i].y)
+ face_rect[2] = max(face_rect[2], landmarks[i].x)
+ face_rect[3] = max(face_rect[3], landmarks[i].y)
+ if min_face_size_pixels > 0:
+ face_width = abs(face_rect[2] - face_rect[0])
+ face_height = abs(face_rect[3] - face_rect[1])
+ face_width_pixels = face_width * img_width
+ face_height_pixels = face_height * img_height
+ face_size = min(face_width_pixels, face_height_pixels)
+ if face_size >= min_face_size_pixels:
+ filtered_landmarks.append(lm)
+ else:
+ filtered_landmarks.append(lm)
+
+ # Annotations are drawn in BGR for some reason, but we don't need to flip a zero-filled image at the start.
+ empty = numpy.zeros_like(img_rgb)
+
+ # Draw detected faces:
+ for face_landmarks in filtered_landmarks:
+ mp_drawing.draw_landmarks(
+ empty,
+ face_landmarks,
+ connections=face_connection_spec.keys(),
+ landmark_drawing_spec=None,
+ connection_drawing_spec=face_connection_spec
+ )
+ draw_pupils(empty, face_landmarks, iris_landmark_spec, 2)
+
+ # Flip BGR back to RGB.
+ empty = reverse_channels(empty).copy()
+
+ return empty
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6379a84ab2aae38a52462683aa8f0ce574710444
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/__init__.py
@@ -0,0 +1,48 @@
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import resize_image_with_pad, common_input_validate, HWC3, custom_hf_download, MESH_GRAPHORMER_MODEL_NAME
+from custom_controlnet_aux.mesh_graphormer.pipeline import MeshGraphormerMediapipe, args
+import random, torch
+
+def set_seed(seed, n_gpu):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ if n_gpu > 0:
+ torch.cuda.manual_seed_all(seed)
+
+class MeshGraphormerDetector:
+ def __init__(self, pipeline):
+ self.pipeline = pipeline
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=MESH_GRAPHORMER_MODEL_NAME, filename="graphormer_hand_state_dict.bin", hrnet_filename="hrnetv2_w64_imagenet_pretrained.pth", detect_thr=0.6, presence_thr=0.6):
+ args.resume_checkpoint = custom_hf_download(pretrained_model_or_path, filename)
+ args.hrnet_checkpoint = custom_hf_download(pretrained_model_or_path, hrnet_filename)
+ pipeline = MeshGraphormerMediapipe(args, detect_thr=detect_thr, presence_thr=presence_thr)
+ return cls(pipeline)
+
+ def to(self, device):
+ self.pipeline._model.to(device)
+ self.pipeline.mano_model.to(device)
+ self.pipeline.mano_model.layer.to(device)
+ return self
+
+ def __call__(self, input_image=None, mask_bbox_padding=30, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", seed=88, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ set_seed(seed, 0)
+ depth_map, mask, info = self.pipeline.get_depth(input_image, mask_bbox_padding)
+ if depth_map is None:
+ depth_map = np.zeros_like(input_image)
+ mask = np.zeros_like(input_image)
+
+ #The hand is small
+ depth_map, mask = HWC3(depth_map), HWC3(mask)
+ depth_map, remove_pad = resize_image_with_pad(depth_map, detect_resolution, upscale_method)
+ depth_map = remove_pad(depth_map)
+ if output_type == "pil":
+ depth_map = Image.fromarray(depth_map)
+ mask = Image.fromarray(mask)
+
+ return depth_map, mask, info
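+
+
+# Usage sketch (hedged): from_pretrained() downloads both the Graphormer state dict
+# and the HRNet backbone weights; "hands.png" and the output names are illustrative.
+# The returned mask marks the padded bounding box around each reconstructed hand.
+if __name__ == "__main__":
+    detector = MeshGraphormerDetector.from_pretrained().to("cpu")
+    bgr = cv2.imread("hands.png")
+    if bgr is not None:
+        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        depth, mask, info = detector(rgb, mask_bbox_padding=30, output_type="pil")
+        depth.save("hand_depth.png")
+        mask.save("hand_mask.png")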
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4ef9304c8ccb036869fc7d23538fb0cb5782959c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml
@@ -0,0 +1,92 @@
+GPUS: (0,1,2,3)
+LOG_DIR: 'log/'
+DATA_DIR: ''
+OUTPUT_DIR: 'output/'
+WORKERS: 4
+PRINT_FREQ: 1000
+
+MODEL:
+ NAME: cls_hrnet
+ IMAGE_SIZE:
+ - 224
+ - 224
+ EXTRA:
+ STAGE1:
+ NUM_MODULES: 1
+ NUM_BRANCHES: 1
+ BLOCK: BOTTLENECK
+ NUM_BLOCKS:
+ - 4
+ NUM_CHANNELS:
+ - 64
+ FUSE_METHOD: SUM
+ STAGE2:
+ NUM_MODULES: 1
+ NUM_BRANCHES: 2
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 64
+ - 128
+ FUSE_METHOD: SUM
+ STAGE3:
+ NUM_MODULES: 4
+ NUM_BRANCHES: 3
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 64
+ - 128
+ - 256
+ FUSE_METHOD: SUM
+ STAGE4:
+ NUM_MODULES: 3
+ NUM_BRANCHES: 4
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 64
+ - 128
+ - 256
+ - 512
+ FUSE_METHOD: SUM
+CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+DATASET:
+ DATASET: 'imagenet'
+ DATA_FORMAT: 'jpg'
+ ROOT: 'data/imagenet/'
+ TEST_SET: 'val'
+ TRAIN_SET: 'train'
+TEST:
+ BATCH_SIZE_PER_GPU: 32
+ MODEL_FILE: ''
+TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ BEGIN_EPOCH: 0
+ END_EPOCH: 100
+ RESUME: true
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 30
+ - 60
+ - 90
+ OPTIMIZER: sgd
+ LR: 0.05
+ WD: 0.0001
+ MOMENTUM: 0.9
+ NESTEROV: true
+ SHUFFLE: true
+DEBUG:
+ DEBUG: false
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/depth_preprocessor.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/depth_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfad80c8547d5bca7b9eddc69db9bffff4eac4d3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/depth_preprocessor.py
@@ -0,0 +1,6 @@
+class Preprocessor:
+ def __init__(self) -> None:
+ pass
+
+ def get_depth(self, input_dir, file_name):
+ return
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/hand_landmarker.task b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/hand_landmarker.task
new file mode 100644
index 0000000000000000000000000000000000000000..5ecab741879892d97c2f90bbf03bf55d7213db7c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/hand_landmarker.task
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbc2a30080c3c557093b5ddfc334698132eb341044ccee322ccf8bcf3607cde1
+size 7819105
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/pipeline.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..df86d0af13bb87b0c0dfaa2ef21c10fe81dfe711
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mesh_graphormer/pipeline.py
@@ -0,0 +1,472 @@
+import os
+import torch
+import gc
+import numpy as np
+from custom_controlnet_aux.mesh_graphormer.depth_preprocessor import Preprocessor
+
+import torchvision.models as models
+from custom_mesh_graphormer.modeling.bert import BertConfig, Graphormer
+from custom_mesh_graphormer.modeling.bert import Graphormer_Hand_Network as Graphormer_Network
+from custom_mesh_graphormer.modeling._mano import MANO, Mesh
+from custom_mesh_graphormer.modeling.hrnet.hrnet_cls_net_gridfeat import get_cls_net_gridfeat
+from custom_mesh_graphormer.modeling.hrnet.config import config as hrnet_config
+from custom_mesh_graphormer.modeling.hrnet.config import update_config as hrnet_update_config
+from custom_mesh_graphormer.utils.miscellaneous import set_seed
+from argparse import Namespace
+from pathlib import Path
+import cv2
+from torchvision import transforms
+from trimesh import Trimesh
+from trimesh.ray.ray_triangle import RayMeshIntersector
+import mediapipe as mp
+from mediapipe.tasks import python
+from mediapipe.tasks.python import vision
+from custom_controlnet_aux.util import custom_hf_download
+import custom_mesh_graphormer
+from comfy.model_management import soft_empty_cache
+from packaging import version
+
+args = Namespace(
+ num_workers=4,
+ img_scale_factor=1,
+ image_file_or_path=os.path.join('', 'MeshGraphormer', 'samples', 'hand'),
+ model_name_or_path=str(Path(custom_mesh_graphormer.__file__).parent / "modeling/bert/bert-base-uncased"),
+ resume_checkpoint=None,
+ output_dir='output/',
+ config_name='',
+ a='hrnet-w64',
+ arch='hrnet-w64',
+ num_hidden_layers=4,
+ hidden_size=-1,
+ num_attention_heads=4,
+ intermediate_size=-1,
+ input_feat_dim='2051,512,128',
+ hidden_feat_dim='1024,256,64',
+ which_gcn='0,0,1',
+ mesh_type='hand',
+ run_eval_only=True,
+ device="cpu",
+ seed=88,
+ hrnet_checkpoint=custom_hf_download("hr16/ControlNet-HandRefiner-pruned", 'hrnetv2_w64_imagenet_pretrained.pth')
+)
+
+# Since mediapipe v0.10.5 the reported handedness category is correct, so no left/right swap is needed
+if version.parse(mp.__version__) >= version.parse('0.10.5'):
+ true_hand_category = {"Right": "right", "Left": "left"}
+else:
+ true_hand_category = {"Right": "left", "Left": "right"}
+
+class MeshGraphormerMediapipe(Preprocessor):
+ def __init__(self, args=args, detect_thr=0.6, presence_thr=0.6) -> None:
+ #global logger
+ # Setup CUDA, GPU & distributed training
+ args.num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
+ os.environ['OMP_NUM_THREADS'] = str(args.num_workers)
+ print('set os.environ[OMP_NUM_THREADS] to {}'.format(os.environ['OMP_NUM_THREADS']))
+
+ #mkdir(args.output_dir)
+ #logger = setup_logger("Graphormer", args.output_dir, get_rank())
+ set_seed(args.seed, args.num_gpus)
+ #logger.info("Using {} GPUs".format(args.num_gpus))
+
+ # Mesh and MANO utils
+ mano_model = MANO().to(args.device)
+ mano_model.layer = mano_model.layer.to(args.device)
+ mesh_sampler = Mesh(device=args.device)
+
+ # Renderer for visualization
+ # renderer = Renderer(faces=mano_model.face)
+
+ # Load pretrained model
+ trans_encoder = []
+
+ input_feat_dim = [int(item) for item in args.input_feat_dim.split(',')]
+ hidden_feat_dim = [int(item) for item in args.hidden_feat_dim.split(',')]
+ output_feat_dim = input_feat_dim[1:] + [3]
+
+ # which encoder block to have graph convs
+ which_blk_graph = [int(item) for item in args.which_gcn.split(',')]
+
+ if args.run_eval_only==True and args.resume_checkpoint!=None and args.resume_checkpoint!='None' and 'state_dict' not in args.resume_checkpoint:
+ # if only run eval, load checkpoint
+ #logger.info("Evaluation: Loading from checkpoint {}".format(args.resume_checkpoint))
+ _model = torch.load(args.resume_checkpoint)
+
+ else:
+ # init three transformer-encoder blocks in a loop
+ for i in range(len(output_feat_dim)):
+ config_class, model_class = BertConfig, Graphormer
+ config = config_class.from_pretrained(args.config_name if args.config_name \
+ else args.model_name_or_path)
+
+ config.output_attentions = False
+ config.img_feature_dim = input_feat_dim[i]
+ config.output_feature_dim = output_feat_dim[i]
+ args.hidden_size = hidden_feat_dim[i]
+ args.intermediate_size = int(args.hidden_size*2)
+
+ if which_blk_graph[i]==1:
+ config.graph_conv = True
+ #logger.info("Add Graph Conv")
+ else:
+ config.graph_conv = False
+
+ config.mesh_type = args.mesh_type
+
+ # update model structure if specified in arguments
+ update_params = ['num_hidden_layers', 'hidden_size', 'num_attention_heads', 'intermediate_size']
+ for idx, param in enumerate(update_params):
+ arg_param = getattr(args, param)
+ config_param = getattr(config, param)
+ if arg_param > 0 and arg_param != config_param:
+ #logger.info("Update config parameter {}: {} -> {}".format(param, config_param, arg_param))
+ setattr(config, param, arg_param)
+
+ # init a transformer encoder and append it to a list
+ assert config.hidden_size % config.num_attention_heads == 0
+ model = model_class(config=config)
+ #logger.info("Init model from scratch.")
+ trans_encoder.append(model)
+
+ # create backbone model
+ if args.arch=='hrnet':
+ hrnet_yaml = Path(__file__).parent / 'cls_hrnet_w40_sgd_lr5e-2_wd1e-4_bs32_x100.yaml'
+ hrnet_checkpoint = args.hrnet_checkpoint
+ hrnet_update_config(hrnet_config, hrnet_yaml)
+ backbone = get_cls_net_gridfeat(hrnet_config, pretrained=hrnet_checkpoint)
+ #logger.info('=> loading hrnet-v2-w40 model')
+ elif args.arch=='hrnet-w64':
+ hrnet_yaml = Path(__file__).parent / 'cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml'
+ hrnet_checkpoint = args.hrnet_checkpoint
+ hrnet_update_config(hrnet_config, hrnet_yaml)
+ backbone = get_cls_net_gridfeat(hrnet_config, pretrained=hrnet_checkpoint)
+ #logger.info('=> loading hrnet-v2-w64 model')
+ else:
+ print("=> using pre-trained model '{}'".format(args.arch))
+ backbone = models.__dict__[args.arch](pretrained=True)
+ # remove the last fc layer
+ backbone = torch.nn.Sequential(*list(backbone.children())[:-1])
+
+ trans_encoder = torch.nn.Sequential(*trans_encoder)
+ total_params = sum(p.numel() for p in trans_encoder.parameters())
+ #logger.info('Graphormer encoders total parameters: {}'.format(total_params))
+ backbone_total_params = sum(p.numel() for p in backbone.parameters())
+ #logger.info('Backbone total parameters: {}'.format(backbone_total_params))
+
+ # build end-to-end Graphormer network (CNN backbone + multi-layer Graphormer encoder)
+ _model = Graphormer_Network(args, config, backbone, trans_encoder)
+
+ if args.resume_checkpoint!=None and args.resume_checkpoint!='None':
+ # for fine-tuning or resume training or inference, load weights from checkpoint
+ #logger.info("Loading state dict from checkpoint {}".format(args.resume_checkpoint))
+ # workaround approach to load sparse tensor in graph conv.
+ state_dict = torch.load(args.resume_checkpoint)
+ _model.load_state_dict(state_dict, strict=False)
+ del state_dict
+ gc.collect()
+ soft_empty_cache()
+
+ # update configs to enable attention outputs
+ setattr(_model.trans_encoder[-1].config,'output_attentions', True)
+ setattr(_model.trans_encoder[-1].config,'output_hidden_states', True)
+ _model.trans_encoder[-1].bert.encoder.output_attentions = True
+ _model.trans_encoder[-1].bert.encoder.output_hidden_states = True
+ for iter_layer in range(4):
+ _model.trans_encoder[-1].bert.encoder.layer[iter_layer].attention.self.output_attentions = True
+ for inter_block in range(3):
+ setattr(_model.trans_encoder[-1].config,'device', args.device)
+
+ _model.to(args.device)
+ self._model = _model
+ self.mano_model = mano_model
+ self.mesh_sampler = mesh_sampler
+
+ self.transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize(
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225])])
+ # Workaround for "File loading is not yet supported" on Windows: read the model into a buffer instead of passing a file path
+ with open(str( Path(__file__).parent / "hand_landmarker.task" ), 'rb') as file:
+ model_data = file.read()
+ base_options = python.BaseOptions(model_asset_buffer=model_data)
+ options = vision.HandLandmarkerOptions(base_options=base_options,
+ min_hand_detection_confidence=detect_thr,
+ min_hand_presence_confidence=presence_thr,
+ min_tracking_confidence=0.6,
+ num_hands=2)
+
+ self.detector = vision.HandLandmarker.create_from_options(options)
+
+
+ def get_rays(self, W, H, fx, fy, cx, cy, c2w_t, center_pixels): # rot = I
+
+ j, i = np.meshgrid(np.arange(H, dtype=np.float32), np.arange(W, dtype=np.float32))
+ if center_pixels:
+ i = i.copy() + 0.5
+ j = j.copy() + 0.5
+
+ directions = np.stack([(i - cx) / fx, (j - cy) / fy, np.ones_like(i)], -1)
+ directions /= np.linalg.norm(directions, axis=-1, keepdims=True)
+
+ rays_o = np.expand_dims(c2w_t,0).repeat(H*W, 0)
+
+ rays_d = directions # (H, W, 3)
+ rays_d = (rays_d / np.linalg.norm(rays_d, axis=-1, keepdims=True)).reshape(-1,3)
+
+ return rays_o, rays_d
+
+ def get_mask_bounding_box(self, extrema, H, W, padding=30, dynamic_resize=0.15):
+ x_min, x_max, y_min, y_max = extrema
+ bb_xpad = max(int((x_max - x_min + 1) * dynamic_resize), padding)
+ bb_ypad = max(int((y_max - y_min + 1) * dynamic_resize), padding)
+ bbx_min = np.max((x_min - bb_xpad, 0))
+ bbx_max = np.min((x_max + bb_xpad, W-1))
+ bby_min = np.max((y_min - bb_ypad, 0))
+ bby_max = np.min((y_max + bb_ypad, H-1))
+ return bbx_min, bbx_max, bby_min, bby_max
+
+ def run_inference(self, img, Graphormer_model, mano, mesh_sampler, scale, crop_len):
+ global args
+ H, W = int(crop_len), int(crop_len)
+ Graphormer_model.eval()
+ mano.eval()
+ device = next(Graphormer_model.parameters()).device
+ with torch.no_grad():
+ img_tensor = self.transform(img)
+ batch_imgs = torch.unsqueeze(img_tensor, 0).to(device)
+
+ # forward-pass
+ pred_camera, pred_3d_joints, pred_vertices_sub, pred_vertices, hidden_states, att = Graphormer_model(batch_imgs, mano, mesh_sampler)
+
+ # obtain 3d joints, which are regressed from the full mesh
+ pred_3d_joints_from_mesh = mano.get_3d_joints(pred_vertices)
+ # obtain 2d joints, which are projected from 3d joints of mesh
+ #pred_2d_joints_from_mesh = orthographic_projection(pred_3d_joints_from_mesh.contiguous(), pred_camera.contiguous())
+ #pred_2d_coarse_vertices_from_mesh = orthographic_projection(pred_vertices_sub.contiguous(), pred_camera.contiguous())
+ pred_camera = pred_camera.cpu()
+ pred_vertices = pred_vertices.cpu()
+ mesh = Trimesh(vertices=pred_vertices[0], faces=mano.face)
+ res = crop_len
+ focal_length = 1000 * scale
+ camera_t = np.array([-pred_camera[1], -pred_camera[2], -2*focal_length/(res * pred_camera[0] +1e-9)])
+ pred_3d_joints_camera = pred_3d_joints_from_mesh.cpu()[0] - camera_t
+ z_3d_dist = pred_3d_joints_camera[:,2].clone()
+
+ pred_2d_joints_img_space = ((pred_3d_joints_camera/z_3d_dist[:,None]) * np.array((focal_length, focal_length, 1)))[:,:2] + np.array((W/2, H/2))
+
+ rays_o, rays_d = self.get_rays(W, H, focal_length, focal_length, W/2, H/2, camera_t, True)
+ coords = np.array(list(np.ndindex(H,W))).reshape(H,W,-1).transpose(1,0,2).reshape(-1,2)
+ intersector = RayMeshIntersector(mesh)
+ points, index_ray, _ = intersector.intersects_location(rays_o, rays_d, multiple_hits=False)
+
+ tri_index = intersector.intersects_first(rays_o, rays_d)
+
+ tri_index = tri_index[index_ray]
+
+ assert len(index_ray) == len(tri_index)
+
+ discriminator = (np.sum(mesh.face_normals[tri_index]* rays_d[index_ray], axis=-1)<= 0)
+ points = points[discriminator] # discard rays that intersect back-facing (interior) faces
+
+ if len(points) == 0:
+ return None, None
+ depth = (points + camera_t)[:,-1]
+ index_ray = index_ray[discriminator]
+ pixel_ray = coords[index_ray]
+
+ minval = np.min(depth)
+ maxval = np.max(depth)
+ depthmap = np.zeros([H,W])
+
+ depthmap[pixel_ray[:, 0], pixel_ray[:, 1]] = 1.0 - (0.8 * (depth - minval) / (maxval - minval))
+ depthmap *= 255
+ return depthmap, pred_2d_joints_img_space
+
+
+ def get_depth(self, np_image, padding):
+ info = {}
+
+ # Load the input image (https://stackoverflow.com/a/76407270).
+ image = mp.Image(image_format=mp.ImageFormat.SRGB, data=np_image.copy())
+
+ # Detect hand landmarks from the input image.
+ detection_result = self.detector.detect(image)
+
+ handedness_list = detection_result.handedness
+ hand_landmarks_list = detection_result.hand_landmarks
+
+ raw_image = image.numpy_view()
+ H, W, C = raw_image.shape
+
+
+ # Hand landmarks can be empty; return early in that case.
+ if len(hand_landmarks_list) == 0:
+ return None, None, None
+ raw_image = raw_image[:, :, :3]
+
+ padded_image = np.zeros((H*2, W*2, 3))
+ padded_image[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)] = raw_image
+
+ hand_landmarks_list, handedness_list = zip(
+ *sorted(
+ zip(hand_landmarks_list, handedness_list), key=lambda x: x[0][9].z, reverse=True
+ )
+ )
+
+ padded_depthmap = np.zeros((H*2, W*2))
+ mask = np.zeros((H, W))
+ crop_boxes = []
+ #bboxes = []
+ groundtruth_2d_keypoints = []
+ hands = []
+ depth_failure = False
+ crop_lens = []
+ abs_boxes = []
+
+ for idx in range(len(hand_landmarks_list)):
+ hand = true_hand_category[handedness_list[idx][0].category_name]
+ hands.append(hand)
+ hand_landmarks = hand_landmarks_list[idx]
+ handedness = handedness_list[idx]
+ height, width, _ = raw_image.shape
+ x_coordinates = [landmark.x for landmark in hand_landmarks]
+ y_coordinates = [landmark.y for landmark in hand_landmarks]
+
+ # x_min, x_max, y_min, y_max: extrema from mediapipe keypoint detection
+ x_min = int(min(x_coordinates) * width)
+ x_max = int(max(x_coordinates) * width)
+ x_c = (x_min + x_max)//2
+ y_min = int(min(y_coordinates) * height)
+ y_max = int(max(y_coordinates) * height)
+ y_c = (y_min + y_max)//2
+ abs_boxes.append([x_min, x_max, y_min, y_max])
+
+ #if x_max - x_min < 60 or y_max - y_min < 60:
+ # continue
+
+ crop_len = (max(x_max - x_min, y_max - y_min) * 1.6) //2 * 2
+
+ # crop_x_min, crop_x_max, crop_y_min, crop_y_max: bounding box for mesh reconstruction
+ crop_x_min = int(x_c - (crop_len/2 - 1) + W/2)
+ crop_x_max = int(x_c + crop_len/2 + W/2)
+ crop_y_min = int(y_c - (crop_len/2 - 1) + H/2)
+ crop_y_max = int(y_c + crop_len/2 + H/2)
+
+ cropped = padded_image[crop_y_min:crop_y_max+1, crop_x_min:crop_x_max+1]
+ crop_boxes.append([crop_y_min, crop_y_max, crop_x_min, crop_x_max])
+ crop_lens.append(crop_len)
+ if hand == "left":
+ cropped = cv2.flip(cropped, 1)
+
+ if crop_len < 224:
+ graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_CUBIC)
+ else:
+ graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_AREA)
+ scale = crop_len/224
+ cropped_depthmap, pred_2d_keypoints = self.run_inference(graphormer_input.astype(np.uint8), self._model, self.mano_model, self.mesh_sampler, scale, int(crop_len))
+
+ if cropped_depthmap is None:
+ depth_failure = True
+ break
+ #keypoints_image_space = pred_2d_keypoints * (crop_y_max - crop_y_min + 1)/224
+ groundtruth_2d_keypoints.append(pred_2d_keypoints)
+
+ if hand == "left":
+ cropped_depthmap = cv2.flip(cropped_depthmap, 1)
+ resized_cropped_depthmap = cv2.resize(cropped_depthmap, (int(crop_len), int(crop_len)), interpolation=cv2.INTER_LINEAR)
+ nonzero_y, nonzero_x = (resized_cropped_depthmap != 0).nonzero()
+ if len(nonzero_y) == 0 or len(nonzero_x) == 0:
+ depth_failure = True
+ break
+ padded_depthmap[crop_y_min+nonzero_y, crop_x_min+nonzero_x] = resized_cropped_depthmap[nonzero_y, nonzero_x]
+
+ # nonzero stands for nonzero value on the depth map
+ # coordinates of nonzero depth pixels in original image space
+ original_nonzero_x = crop_x_min+nonzero_x - int(W/2)
+ original_nonzero_y = crop_y_min+nonzero_y - int(H/2)
+
+ nonzerox_min = min(np.min(original_nonzero_x), x_min)
+ nonzerox_max = max(np.max(original_nonzero_x), x_max)
+ nonzeroy_min = min(np.min(original_nonzero_y), y_min)
+ nonzeroy_max = max(np.max(original_nonzero_y), y_max)
+
+ bbx_min, bbx_max, bby_min, bby_max = self.get_mask_bounding_box((nonzerox_min, nonzerox_max, nonzeroy_min, nonzeroy_max), H, W, padding)
+ mask[bby_min:bby_max+1, bbx_min:bbx_max+1] = 1.0
+ #bboxes.append([int(bbx_min), int(bbx_max), int(bby_min), int(bby_max)])
+ if depth_failure:
+ #print("cannot detect normal hands")
+ return None, None, None
+ depthmap = padded_depthmap[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)].astype(np.uint8)
+ mask = (255.0 * mask).astype(np.uint8)
+ info["groundtruth_2d_keypoints"] = groundtruth_2d_keypoints
+ info["hands"] = hands
+ info["crop_boxes"] = crop_boxes
+ info["crop_lens"] = crop_lens
+ info["abs_boxes"] = abs_boxes
+ return depthmap, mask, info
+
+ def get_keypoints(self, img, Graphormer_model, mano, mesh_sampler, scale, crop_len):
+ global args
+ H, W = int(crop_len), int(crop_len)
+ Graphormer_model.eval()
+ mano.eval()
+ device = next(Graphormer_model.parameters()).device
+ with torch.no_grad():
+ img_tensor = self.transform(img)
+ #print(img_tensor)
+ batch_imgs = torch.unsqueeze(img_tensor, 0).to(device)
+
+ # forward-pass
+ pred_camera, pred_3d_joints, pred_vertices_sub, pred_vertices, hidden_states, att = Graphormer_model(batch_imgs, mano, mesh_sampler)
+
+ # obtain 3d joints, which are regressed from the full mesh
+ pred_3d_joints_from_mesh = mano.get_3d_joints(pred_vertices)
+ # obtain 2d joints, which are projected from 3d joints of mesh
+ #pred_2d_joints_from_mesh = orthographic_projection(pred_3d_joints_from_mesh.contiguous(), pred_camera.contiguous())
+ #pred_2d_coarse_vertices_from_mesh = orthographic_projection(pred_vertices_sub.contiguous(), pred_camera.contiguous())
+ pred_camera = pred_camera.cpu()
+ pred_vertices = pred_vertices.cpu()
+ #
+ res = crop_len
+ focal_length = 1000 * scale
+ camera_t = np.array([-pred_camera[1], -pred_camera[2], -2*focal_length/(res * pred_camera[0] +1e-9)])
+ pred_3d_joints_camera = pred_3d_joints_from_mesh.cpu()[0] - camera_t
+ z_3d_dist = pred_3d_joints_camera[:,2].clone()
+ pred_2d_joints_img_space = ((pred_3d_joints_camera/z_3d_dist[:,None]) * np.array((focal_length, focal_length, 1)))[:,:2] + np.array((W/2, H/2))
+
+ return pred_2d_joints_img_space
+
+
+ def eval_mpjpe(self, sample, info):
+ H, W, C = sample.shape
+ padded_image = np.zeros((H*2, W*2, 3))
+ padded_image[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)] = sample
+ crop_boxes = info["crop_boxes"]
+ hands = info["hands"]
+ groundtruth_2d_keypoints = info["groundtruth_2d_keypoints"]
+ crop_lens = info["crop_lens"]
+ pjpe = 0
+ for i in range(len(crop_boxes)):
+ crop_y_min, crop_y_max, crop_x_min, crop_x_max = crop_boxes[i]
+ cropped = padded_image[crop_y_min:crop_y_max+1, crop_x_min:crop_x_max+1]
+ hand = hands[i]
+ if hand == "left":
+ cropped = cv2.flip(cropped, 1)
+ crop_len = crop_lens[i]
+ scale = crop_len/224
+ if crop_len < 224:
+ graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_CUBIC)
+ else:
+ graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_AREA)
+ generated_keypoint = self.get_keypoints(graphormer_input.astype(np.uint8), self._model, self.mano_model, self.mesh_sampler, scale, crop_len)
+ #generated_keypoint = generated_keypoint * ((crop_y_max - crop_y_min + 1)/224)
+ pjpe += np.sum(np.sqrt(np.sum(((generated_keypoint - groundtruth_2d_keypoints[i]) ** 2).numpy(), axis=1)))
+ mpjpe = pjpe/(len(crop_boxes) * 21)
+ return mpjpe
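+
+
+# Hedged sketch: one way get_depth() and eval_mpjpe() fit together. The keypoints
+# recorded in `info` for the original image are compared against keypoints regressed
+# from a (possibly edited) image, averaged over the 21 hand joints, in pixels.
+# The function name and arguments below are illustrative, not upstream API.
+def _hand_keypoint_drift(pipe, original_rgb, edited_rgb, padding=30):
+    _, _, info = pipe.get_depth(original_rgb, padding)
+    if info is None:
+        return None
+    return pipe.eval_mpjpe(edited_rgb, info)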
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..387f6d3c7a3dc9867da68dc9f5d436d043b20372
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/__init__.py
@@ -0,0 +1,124 @@
+
+import torch
+import os
+from pathlib import Path
+
+CODE_SPACE=Path(os.path.dirname(os.path.abspath(__file__)))
+
+from custom_mmpkg.custom_mmcv.utils import Config, DictAction
+from custom_controlnet_aux.metric3d.mono.model.monodepth_model import get_configured_monodepth_model
+from custom_controlnet_aux.metric3d.mono.utils.running import load_ckpt
+from custom_controlnet_aux.metric3d.mono.utils.do_test import transform_test_data_scalecano, get_prediction
+import numpy as np
+from custom_controlnet_aux.metric3d.mono.utils.visualization import vis_surface_normal
+from einops import repeat
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, METRIC3D_MODEL_NAME
+import re
+import matplotlib.pyplot as plt
+
+def load_model(model_selection, model_path):
+ if model_selection == "vit-small":
+ cfg = Config.fromfile(CODE_SPACE / 'mono/configs/HourglassDecoder/vit.raft5.small.py')
+ elif model_selection == "vit-large":
+ cfg = Config.fromfile(CODE_SPACE / 'mono/configs/HourglassDecoder/vit.raft5.large.py')
+ elif model_selection == "vit-giant2":
+ cfg = Config.fromfile(CODE_SPACE / 'mono/configs/HourglassDecoder/vit.raft5.giant2.py')
+ else:
+ raise NotImplementedError(f"metric3d model: {model_selection}")
+ model = get_configured_monodepth_model(cfg)
+ model, _, _, _ = load_ckpt(model_path, model, strict_match=False)
+ model.eval()
+ return model, cfg
+
+def gray_to_colormap(img, cmap='rainbow'):
+ """
+ Transfer gray map to matplotlib colormap
+ """
+ assert img.ndim == 2
+
+ img[img<0] = 0
+ mask_invalid = img < 1e-10
+ img = img / (img.max() + 1e-8)
+ norm = plt.Normalize(vmin=0, vmax=1.1) # Use plt.Normalize instead of matplotlib.colors.Normalize
+ cmap_m = plt.get_cmap(cmap) # Access the colormap directly from plt
+ map = plt.cm.ScalarMappable(norm=norm, cmap=cmap_m)
+ colormap = (map.to_rgba(img)[:, :, :3] * 255).astype(np.uint8)
+ colormap[mask_invalid] = 0
+ return colormap
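+
+# Quick check (hedged, illustrative only): a horizontal depth ramp should map to a
+# smooth colour gradient, with the invalid (zero) entries forced to black.
+def _colormap_smoke_test():
+    ramp = np.tile(np.linspace(0.0, 5.0, 64, dtype=np.float32), (8, 1))
+    out = gray_to_colormap(ramp.copy(), 'rainbow')
+    assert out.shape == (8, 64, 3) and out.dtype == np.uint8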
+
+def predict_depth_normal(model, cfg, np_img, fx=1000.0, fy=1000.0, state_cache={}):
+ intrinsic = [fx, fy, np_img.shape[1]/2, np_img.shape[0]/2]
+ rgb_input, cam_models_stacks, pad, label_scale_factor = transform_test_data_scalecano(np_img, intrinsic, cfg.data_basic, device=next(model.parameters()).device)
+
+ with torch.no_grad():
+ pred_depth, confidence, output = get_prediction(
+ model = model,
+ input = rgb_input.unsqueeze(0),
+ cam_model = cam_models_stacks,
+ pad_info = pad,
+ scale_info = label_scale_factor,
+ gt_depth = None,
+ normalize_scale = cfg.data_basic.depth_range[1],
+ ori_shape=[np_img.shape[0], np_img.shape[1]],
+ )
+
+ pred_normal = output['normal_out_list'][0][:, :3, :, :]
+ H, W = pred_normal.shape[2:]
+ pred_normal = pred_normal[:, :, pad[0]:H-pad[1], pad[2]:W-pad[3]]
+ pred_depth = pred_depth[:, :, pad[0]:H-pad[1], pad[2]:W-pad[3] ]
+
+ pred_depth = pred_depth.squeeze().cpu().numpy()
+ pred_color = gray_to_colormap(pred_depth, 'Greys')
+
+ pred_normal = torch.nn.functional.interpolate(pred_normal, [np_img.shape[0], np_img.shape[1]], mode='bilinear').squeeze()
+ pred_normal = pred_normal.permute(1,2,0)
+ pred_color_normal = vis_surface_normal(pred_normal)
+ pred_normal = pred_normal.cpu().numpy()
+
+ # Storing depth and normal map in state for potential 3D reconstruction
+ state_cache['depth'] = pred_depth
+ state_cache['normal'] = pred_normal
+ state_cache['img'] = np_img
+ state_cache['intrinsic'] = intrinsic
+ state_cache['confidence'] = confidence
+
+ return pred_color, pred_color_normal, state_cache
+
+class Metric3DDetector:
+ def __init__(self, model, cfg):
+ self.model = model
+ self.cfg = cfg
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=METRIC3D_MODEL_NAME, filename="metric_depth_vit_small_800k.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+ backbone = re.findall(r"metric_depth_vit_(\w+)_", model_path)[0]
+ model, cfg = load_model(f'vit-{backbone}', model_path)
+ return cls(model, cfg)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, fx=1000, fy=1000, output_type=None, upscale_method="INTER_CUBIC", depth_and_normal=True, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+
+ depth_map, normal_map, _ = predict_depth_normal(self.model, self.cfg, input_image, fx=fx, fy=fy)
+ # ControlNet-style maps: invert the normals here; the depth colormap above is already near-white / far-black ('Greys')
+ depth_map, normal_map = depth_map, 255 - normal_map
+ depth_map, remove_pad = resize_image_with_pad(depth_map, detect_resolution, upscale_method)
+ normal_map, _ = resize_image_with_pad(normal_map, detect_resolution, upscale_method)
+ depth_map, normal_map = remove_pad(depth_map), remove_pad(normal_map)
+
+ if output_type == "pil":
+ depth_map = Image.fromarray(depth_map)
+ normal_map = Image.fromarray(normal_map)
+
+ if depth_and_normal:
+ return depth_map, normal_map
+ else:
+ return depth_map
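+
+
+# Usage sketch (hedged): fx/fy default to 1000 when the real camera intrinsics are
+# unknown; "photo.png" and the output filenames are illustrative placeholders.
+if __name__ == "__main__":
+    import cv2
+    detector = Metric3DDetector.from_pretrained().to("cpu")
+    bgr = cv2.imread("photo.png")
+    if bgr is not None:
+        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        depth, normal = detector(rgb, fx=1000, fy=1000, output_type="pil", depth_and_normal=True)
+        depth.save("metric3d_depth.png")
+        normal.save("metric3d_normal.png")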
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/convlarge.0.3_150.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/convlarge.0.3_150.py
new file mode 100644
index 0000000000000000000000000000000000000000..b06526e11c7b3930f352a17e09e0c37000f4b04b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/convlarge.0.3_150.py
@@ -0,0 +1,25 @@
+_base_=[
+ '../_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py',
+ '../_base_/datasets/_data_base_.py',
+ '../_base_/default_runtime.py',
+ ]
+
+model = dict(
+ backbone=dict(
+ pretrained=False,
+ )
+)
+
+# configs of the canonical space
+data_basic=dict(
+ canonical_space = dict(
+ img_size=(512, 960),
+ focal_length=1000.0,
+ ),
+ depth_range=(0, 1),
+ depth_normalize=(0.3, 150),
+ crop_size = (544, 1216),
+)
+
+batchsize_per_gpu = 2
+thread_per_gpu = 4
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/test_kitti_convlarge.0.3_150.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/test_kitti_convlarge.0.3_150.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25feb0e5980f8eed890a21e84d92c1a2f1f5241
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/test_kitti_convlarge.0.3_150.py
@@ -0,0 +1,25 @@
+_base_=[
+ '../_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py',
+ '../_base_/datasets/_data_base_.py',
+ '../_base_/default_runtime.py',
+ ]
+
+model = dict(
+ backbone=dict(
+ pretrained=False,
+ )
+)
+
+# configs of the canonical space
+data_basic=dict(
+ canonical_space = dict(
+ img_size=(512, 960),
+ focal_length=1000.0,
+ ),
+ depth_range=(0, 1),
+ depth_normalize=(0.3, 150),
+ crop_size = (512, 1088),
+)
+
+batchsize_per_gpu = 2
+thread_per_gpu = 4
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/test_nyu_convlarge.0.3_150.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/test_nyu_convlarge.0.3_150.py
new file mode 100644
index 0000000000000000000000000000000000000000..debe7213958a496046b86e6bd7cc00b115b6268e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/test_nyu_convlarge.0.3_150.py
@@ -0,0 +1,25 @@
+_base_=[
+ '../_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py',
+ '../_base_/datasets/_data_base_.py',
+ '../_base_/default_runtime.py',
+ ]
+
+model = dict(
+ backbone=dict(
+ pretrained=False,
+ )
+)
+
+# configs of the canonical space
+data_basic=dict(
+ canonical_space = dict(
+ img_size=(512, 960),
+ focal_length=1000.0,
+ ),
+ depth_range=(0, 1),
+ depth_normalize=(0.3, 150),
+ crop_size = (480, 1216),
+)
+
+batchsize_per_gpu = 2
+thread_per_gpu = 4
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.giant2.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.giant2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e85a1dc66cb99197d23d1318a348eb492e7c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.giant2.py
@@ -0,0 +1,32 @@
+_base_=[
+ '../_base_/models/encoder_decoder/dino_vit_giant2_reg.dpt_raft.py',
+ '../_base_/datasets/_data_base_.py',
+ '../_base_/default_runtime.py',
+ ]
+
+model=dict(
+ decode_head=dict(
+ type='RAFTDepthNormalDPT5',
+ iters=8,
+ n_downsample=2,
+ detach=False,
+ )
+)
+
+
+max_value = 200
+# configs of the canonical space
+data_basic=dict(
+ canonical_space = dict(
+ # img_size=(540, 960),
+ focal_length=1000.0,
+ ),
+ depth_range=(0, 1),
+ depth_normalize=(0.1, max_value),
+ crop_size = (616, 1064), # %28 = 0
+ clip_depth_range=(0.1, 200),
+ vit_size=(616,1064)
+)
+
+batchsize_per_gpu = 1
+thread_per_gpu = 1
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.large.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.large.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cb545b925dd1f484e5e8e5eb920a5ec8cec8b0e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.large.py
@@ -0,0 +1,32 @@
+_base_=[
+ '../_base_/models/encoder_decoder/dino_vit_large_reg.dpt_raft.py',
+ '../_base_/datasets/_data_base_.py',
+ '../_base_/default_runtime.py',
+ ]
+
+model=dict(
+ decode_head=dict(
+ type='RAFTDepthNormalDPT5',
+ iters=8,
+ n_downsample=2,
+ detach=False,
+ )
+)
+
+
+max_value = 200
+# configs of the canonical space
+data_basic=dict(
+ canonical_space = dict(
+ # img_size=(540, 960),
+ focal_length=1000.0,
+ ),
+ depth_range=(0, 1),
+ depth_normalize=(0.1, max_value),
+ crop_size = (616, 1064), # %28 = 0
+ clip_depth_range=(0.1, 200),
+ vit_size=(616,1064)
+)
+
+batchsize_per_gpu = 1
+thread_per_gpu = 1
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.small.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.small.py
new file mode 100644
index 0000000000000000000000000000000000000000..a55b412e19dd02ec88c72a1cba9dfe12652d4bf4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/HourglassDecoder/vit.raft5.small.py
@@ -0,0 +1,32 @@
+_base_=[
+ '../_base_/models/encoder_decoder/dino_vit_small_reg.dpt_raft.py',
+ '../_base_/datasets/_data_base_.py',
+ '../_base_/default_runtime.py',
+ ]
+
+model=dict(
+ decode_head=dict(
+ type='RAFTDepthNormalDPT5',
+ iters=4,
+ n_downsample=2,
+ detach=False,
+ )
+)
+
+
+max_value = 200
+# configs of the canonical space
+data_basic=dict(
+ canonical_space = dict(
+ # img_size=(540, 960),
+ focal_length=1000.0,
+ ),
+ depth_range=(0, 1),
+ depth_normalize=(0.1, max_value),
+ crop_size = (616, 1064), # %28 = 0
+ clip_depth_range=(0.1, 200),
+ vit_size=(616,1064)
+)
+
+batchsize_per_gpu = 1
+thread_per_gpu = 1
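Editor's note: all three ViT configs share `crop_size = (616, 1064)`; the `# %28 = 0` comment means both sides must be divisible by 28, which (assuming the 14-pixel DINOv2 patch size plus a further factor of 2 in the decoder) gives a 44 x 76 token grid. A small sketch for snapping an arbitrary resolution to that grid:

def snap_to_vit_crop(height, width, multiple=28):
    # Round down to the nearest multiple-of-28 resolution these configs expect.
    return (height // multiple) * multiple, (width // multiple) * multiple

assert snap_to_vit_crop(616, 1064) == (616, 1064)   # 22*28 x 38*28, already valid
print(616 // 14, 1064 // 14)                        # 44 x 76 patch tokens (assuming patch size 14)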
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3f5a12faa99758192ecc4ed3fc22c9249232e86
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/__init__.py
@@ -0,0 +1 @@
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/_data_base_.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/_data_base_.py
new file mode 100644
index 0000000000000000000000000000000000000000..69526356b7551287c48f30d77d0d7af1055d59a8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/_data_base_.py
@@ -0,0 +1,13 @@
+# canonical camera setting and basic data setting
+# we set it the same as the E300 camera (crop version)
+#
+data_basic=dict(
+ canonical_space = dict(
+ img_size=(540, 960),
+ focal_length=1196.0,
+ ),
+ depth_range=(0.9, 150),
+ depth_normalize=(0.006, 1.001),
+ crop_size = (512, 960),
+ clip_depth_range=(0.9, 150),
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/datasets/_data_base_.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/datasets/_data_base_.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ba18deaa90e363605c89510b3c3ef9d415b9fc6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/datasets/_data_base_.py
@@ -0,0 +1,12 @@
+# canonical camera setting and basic data setting
+#
+data_basic=dict(
+ canonical_space = dict(
+ img_size=(540, 960),
+ focal_length=1196.0,
+ ),
+ depth_range=(0.9, 150),
+ depth_normalize=(0.006, 1.001),
+ crop_size = (512, 960),
+ clip_depth_range=(0.9, 150),
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/default_runtime.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bca22e03c5b4b881a975d636d5ba383706e2db2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/default_runtime.py
@@ -0,0 +1,4 @@
+
+load_from = None
+cudnn_benchmark = True
+test_metrics = ['abs_rel', 'rmse', 'silog', 'delta1', 'delta2', 'delta3', 'rmse_log', 'log10', 'sq_rel']
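Editor's note: `test_metrics` lists the standard monocular-depth error measures. For reference, a sketch of two of them; the exact masking and clamping used by this repo's evaluator may differ:

import numpy as np

def abs_rel(pred, gt):
    mask = gt > 0                      # evaluate only where ground truth is valid
    return float(np.mean(np.abs(pred[mask] - gt[mask]) / gt[mask]))

def delta1(pred, gt, threshold=1.25):
    mask = gt > 0
    ratio = np.maximum(pred[mask] / gt[mask], gt[mask] / pred[mask])
    return float(np.mean(ratio < threshold))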
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/convnext_large.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/convnext_large.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8d7ff07b97df8d03155d76e17ed407d6fb92656
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/convnext_large.py
@@ -0,0 +1,16 @@
+#_base_ = ['./_model_base_.py',]
+
+#'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-large_3rdparty_in21k_20220301-e6e0ea0a.pth'
+model = dict(
+ #type='EncoderDecoderAuxi',
+ backbone=dict(
+ type='convnext_large',
+ pretrained=True,
+ in_22k=True,
+ out_indices=[0, 1, 2, 3],
+ drop_path_rate=0.4,
+ layer_scale_init_value=1.0,
+ checkpoint='data/pretrained_weight_repo/convnext/convnext_large_22k_1k_384.pth',
+ prefix='backbones.',
+ out_channels=[192, 384, 768, 1536]),
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_giant2_reg.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_giant2_reg.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f11d57f3fb98e5483ad197e67e075d059678528
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_giant2_reg.py
@@ -0,0 +1,7 @@
+model = dict(
+ backbone=dict(
+ type='vit_giant2_reg',
+ prefix='backbones.',
+ out_channels=[1536, 1536, 1536, 1536],
+ drop_path_rate = 0.0),
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_large.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_large.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb1711441b3542eb64eafb6a36012dd900734526
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_large.py
@@ -0,0 +1,7 @@
+model = dict(
+ backbone=dict(
+ type='vit_large',
+ prefix='backbones.',
+ out_channels=[1024, 1024, 1024, 1024],
+ drop_path_rate = 0.0),
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_large_reg.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_large_reg.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f007eec1f4165d6861531d1b036f09c9561deff
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_large_reg.py
@@ -0,0 +1,7 @@
+model = dict(
+ backbone=dict(
+ type='vit_large_reg',
+ prefix='backbones.',
+ out_channels=[1024, 1024, 1024, 1024],
+ drop_path_rate = 0.0),
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_small_reg.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_small_reg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1765cef976089d84099455efcde58afc43320880
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/backbones/dino_vit_small_reg.py
@@ -0,0 +1,7 @@
+model = dict(
+ backbone=dict(
+ type='vit_small_reg',
+ prefix='backbones.',
+ out_channels=[384, 384, 384, 384],
+ drop_path_rate = 0.0),
+ )
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..66d80d93a68607f4324474f2ec6fd73360e7d2b3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/convnext_large.hourglassdecoder.py
@@ -0,0 +1,10 @@
+# model settings
+_base_ = ['../backbones/convnext_large.py',]
+model = dict(
+ type='DensePredModel',
+ decode_head=dict(
+ type='HourglassDecoder',
+ in_channels=[192, 384, 768, 1536],
+ decoder_channel=[128, 128, 256, 512],
+ prefix='decode_heads.'),
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_giant2_reg.dpt_raft.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_giant2_reg.dpt_raft.py
new file mode 100644
index 0000000000000000000000000000000000000000..36e11a31312f9a921a43d7f46c8e9bcd6fa1139c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_giant2_reg.dpt_raft.py
@@ -0,0 +1,19 @@
+# model settings
+_base_ = ['../backbones/dino_vit_giant2_reg.py']
+model = dict(
+ type='DensePredModel',
+ decode_head=dict(
+ type='RAFTDepthDPT',
+ in_channels=[1536, 1536, 1536, 1536],
+ use_cls_token=True,
+ feature_channels = [384, 768, 1536, 1536], # [2/7, 1/7, 1/14, 1/14]
+ decoder_channels = [192, 384, 768, 1536, 1536], # [4/7, 2/7, 1/7, 1/14, 1/14]
+ up_scale = 7,
+ hidden_channels=[192, 192, 192, 192], # [x_4, x_8, x_16, x_32] [192, 384, 768, 1536]
+ n_gru_layers=3,
+ n_downsample=2,
+ iters=3,
+ slow_fast_gru=True,
+ num_register_tokens=4,
+ prefix='decode_heads.'),
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_large.dpt_raft.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_large.dpt_raft.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ea9d505baa7f3d4cae4b22ec97ed7d0fe63f0e1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_large.dpt_raft.py
@@ -0,0 +1,20 @@
+# model settings
+_base_ = ['../backbones/dino_vit_large.py']
+model = dict(
+ type='DensePredModel',
+ decode_head=dict(
+ type='RAFTDepthDPT',
+ in_channels=[1024, 1024, 1024, 1024],
+ use_cls_token=True,
+ feature_channels = [256, 512, 1024, 1024], # [2/7, 1/7, 1/14, 1/14]
+ decoder_channels = [128, 256, 512, 1024, 1024], # [4/7, 2/7, 1/7, 1/14, 1/14]
+ up_scale = 7,
+ hidden_channels=[128, 128, 128, 128], # [x_4, x_8, x_16, x_32] [192, 384, 768, 1536]
+ n_gru_layers=3,
+ n_downsample=2,
+ iters=12,
+ slow_fast_gru=True,
+ corr_radius=4,
+ corr_levels=4,
+ prefix='decode_heads.'),
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_large_reg.dpt_raft.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_large_reg.dpt_raft.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b2b28c89f783ef6177e9d928e04581838c3b56c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_large_reg.dpt_raft.py
@@ -0,0 +1,19 @@
+# model settings
+_base_ = ['../backbones/dino_vit_large_reg.py']
+model = dict(
+ type='DensePredModel',
+ decode_head=dict(
+ type='RAFTDepthDPT',
+ in_channels=[1024, 1024, 1024, 1024],
+ use_cls_token=True,
+ feature_channels = [256, 512, 1024, 1024], # [2/7, 1/7, 1/14, 1/14]
+ decoder_channels = [128, 256, 512, 1024, 1024], # [4/7, 2/7, 1/7, 1/14, 1/14]
+ up_scale = 7,
+ hidden_channels=[128, 128, 128, 128], # [x_4, x_8, x_16, x_32] [192, 384, 768, 1536]
+ n_gru_layers=3,
+ n_downsample=2,
+ iters=3,
+ slow_fast_gru=True,
+ num_register_tokens=4,
+ prefix='decode_heads.'),
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_small_reg.dpt_raft.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_small_reg.dpt_raft.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a12558d8122da93f7cdcad7d7872bb0ee7c8aa9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/configs/_base_/models/encoder_decoder/dino_vit_small_reg.dpt_raft.py
@@ -0,0 +1,19 @@
+# model settings
+_base_ = ['../backbones/dino_vit_small_reg.py']
+model = dict(
+ type='DensePredModel',
+ decode_head=dict(
+ type='RAFTDepthDPT',
+ in_channels=[384, 384, 384, 384],
+ use_cls_token=True,
+ feature_channels = [96, 192, 384, 768], # [2/7, 1/7, 1/14, 1/14]
+ decoder_channels = [48, 96, 192, 384, 384], # [-, 1/4, 1/7, 1/14, 1/14]
+ up_scale = 7,
+ hidden_channels=[48, 48, 48, 48], # [x_4, x_8, x_16, x_32] [1/4, 1/7, 1/14, -]
+ n_gru_layers=3,
+ n_downsample=2,
+ iters=3,
+ slow_fast_gru=True,
+ num_register_tokens=4,
+ prefix='decode_heads.'),
+)
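Editor's note: across the giant2/large/small RAFTDepthDPT variants above, the decoder's `in_channels` mirror the backbone's `out_channels` (1536, 1024 and 384 respectively), while `feature_channels`/`decoder_channels` scale down with model size. A small consistency check one could run on a merged config dict (key names follow the configs above; the config loader itself is not shown here):

def check_channels(cfg: dict) -> None:
    # Decoder input widths must match what the ViT backbone emits at every level.
    backbone_out = cfg['model']['backbone']['out_channels']
    decoder_in = cfg['model']['decode_head']['in_channels']
    assert backbone_out == decoder_in, f'backbone {backbone_out} != decoder {decoder_in}'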
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a12ca3f7ef521419cb67b193fd809d85fe27fca3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/__init__.py
@@ -0,0 +1,5 @@
+from .monodepth_model import DepthModel
+# from .__base_model__ import BaseDepthModel
+
+
+__all__ = ['DepthModel']  # BaseDepthModel's import is commented out above, so it cannot be re-exported
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ConvNeXt.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ConvNeXt.py
new file mode 100644
index 0000000000000000000000000000000000000000..215f81727a206ebfc90048d7836c6ee7faef7441
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ConvNeXt.py
@@ -0,0 +1,271 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from custom_timm.models.layers import trunc_normal_, DropPath
+from custom_timm.models.registry import register_model
+
+class Block(nn.Module):
+ r""" ConvNeXt Block. There are two equivalent implementations:
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+ We use (2) as we find it slightly faster in PyTorch
+
+ Args:
+ dim (int): Number of input channels.
+ drop_path (float): Stochastic depth rate. Default: 0.0
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """
+ def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
+ super().__init__()
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
+ self.norm = LayerNorm(dim, eps=1e-6)
+ self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
+ self.act = nn.GELU()
+ self.pwconv2 = nn.Linear(4 * dim, dim)
+ self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
+ requires_grad=True) if layer_scale_init_value > 0 else None
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+ def forward(self, x):
+ input = x
+ x = self.dwconv(x)
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
+ x = self.norm(x)
+ x = self.pwconv1(x)
+ x = self.act(x)
+ x = self.pwconv2(x)
+ if self.gamma is not None:
+ x = self.gamma * x
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
+
+ x = input + self.drop_path(x)
+ return x
+
+class ConvNeXt(nn.Module):
+ r""" ConvNeXt
+ A PyTorch impl of : `A ConvNet for the 2020s` -
+ https://arxiv.org/pdf/2201.03545.pdf
+ Args:
+ in_chans (int): Number of input image channels. Default: 3
+ num_classes (int): Number of classes for classification head. Default: 1000
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
+ dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
+ """
+ def __init__(self, in_chans=3, num_classes=1000,
+ depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0.,
+ layer_scale_init_value=1e-6, head_init_scale=1.,
+ **kwargs,):
+ super().__init__()
+
+ self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
+ stem = nn.Sequential(
+ nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
+ )
+ self.downsample_layers.append(stem)
+ for i in range(3):
+ downsample_layer = nn.Sequential(
+ LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
+ nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
+ )
+ self.downsample_layers.append(downsample_layer)
+
+ self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
+ dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+ cur = 0
+ for i in range(4):
+ stage = nn.Sequential(
+ *[Block(dim=dims[i], drop_path=dp_rates[cur + j],
+ layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
+ )
+ self.stages.append(stage)
+ cur += depths[i]
+
+ #self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer
+ #self.head = nn.Linear(dims[-1], num_classes)
+
+ self.apply(self._init_weights)
+ #self.head.weight.data.mul_(head_init_scale)
+ #self.head.bias.data.mul_(head_init_scale)
+
+ def _init_weights(self, m):
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
+ trunc_normal_(m.weight, std=.02)
+ nn.init.constant_(m.bias, 0)
+
+ def forward_features(self, x):
+ features = []
+ for i in range(4):
+ x = self.downsample_layers[i](x)
+ x = self.stages[i](x)
+ features.append(x)
+ return features # list of the four stage feature maps, each (N, C_i, H_i, W_i)
+
+ def forward(self, x):
+ #x = self.forward_features(x)
+ #x = self.head(x)
+ features = self.forward_features(x)
+ return features
+
+class LayerNorm(nn.Module):
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
+ with shape (batch_size, channels, height, width).
+ """
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
+ self.eps = eps
+ self.data_format = data_format
+ if self.data_format not in ["channels_last", "channels_first"]:
+ raise NotImplementedError
+ self.normalized_shape = (normalized_shape, )
+
+ def forward(self, x):
+ if self.data_format == "channels_last":
+ return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+ elif self.data_format == "channels_first":
+ u = x.mean(1, keepdim=True)
+ s = (x - u).pow(2).mean(1, keepdim=True)
+ x = (x - u) / torch.sqrt(s + self.eps)
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
+ return x
+
+
+model_urls = {
+ "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
+ "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
+ "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
+ "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
+ "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
+ "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
+ "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
+ "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
+ "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
+}
+
+def convnext_tiny(pretrained=True,in_22k=False, **kwargs):
+ model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
+ if pretrained:
+ checkpoint = torch.load(kwargs['checkpoint'], map_location="cpu")
+ #url = model_urls['convnext_tiny_22k'] if in_22k else model_urls['convnext_tiny_1k']
+ #checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
+ model_dict = model.state_dict()
+ pretrained_dict = {}
+ unmatched_pretrained_dict = {}
+ for k, v in checkpoint['model'].items():
+ if k in model_dict:
+ pretrained_dict[k] = v
+ else:
+ unmatched_pretrained_dict[k] = v
+ model_dict.update(pretrained_dict)
+ model.load_state_dict(model_dict)
+ print(
+ 'Successfully loaded %d pretrained params; %d params were unmatched.'
+ %(len(pretrained_dict.keys()), len(unmatched_pretrained_dict.keys())))
+ print('Unmatched pretrained params:', unmatched_pretrained_dict.keys())
+ return model
+
+def convnext_small(pretrained=True,in_22k=False, **kwargs):
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+ if pretrained:
+ checkpoint = torch.load(kwargs['checkpoint'], map_location="cpu")
+ #url = model_urls['convnext_small_22k'] if in_22k else model_urls['convnext_small_1k']
+ #checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+ model_dict = model.state_dict()
+ pretrained_dict = {}
+ unmatched_pretrained_dict = {}
+ for k, v in checkpoint['model'].items():
+ if k in model_dict:
+ pretrained_dict[k] = v
+ else:
+ unmatched_pretrained_dict[k] = v
+ model_dict.update(pretrained_dict)
+ model.load_state_dict(model_dict)
+ print(
+ 'Successfully loaded %d pretrained params; %d params were unmatched.'
+ %(len(pretrained_dict.keys()), len(unmatched_pretrained_dict.keys())))
+ print('Unmatched pretrained params:', unmatched_pretrained_dict.keys())
+ return model
+
+def convnext_base(pretrained=True, in_22k=False, **kwargs):
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+ if pretrained:
+ checkpoint = torch.load(kwargs['checkpoint'], map_location="cpu")
+ #url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k']
+ #checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+ model_dict = model.state_dict()
+ pretrained_dict = {}
+ unmatched_pretrained_dict = {}
+ for k, v in checkpoint['model'].items():
+ if k in model_dict:
+ pretrained_dict[k] = v
+ else:
+ unmatched_pretrained_dict[k] = v
+ model_dict.update(pretrained_dict)
+ model.load_state_dict(model_dict)
+ print(
+ 'Successfully loaded %d pretrained params; %d params were unmatched.'
+ %(len(pretrained_dict.keys()), len(unmatched_pretrained_dict.keys())))
+ print('Unmatched pretrained params:', unmatched_pretrained_dict.keys())
+ return model
+
+def convnext_large(pretrained=True, in_22k=False, **kwargs):
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+ if pretrained:
+ checkpoint = torch.load(kwargs['checkpoint'], map_location="cpu")
+ #url = model_urls['convnext_large_22k'] if in_22k else model_urls['convnext_large_1k']
+ #checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+ model_dict = model.state_dict()
+ pretrained_dict = {}
+ unmatched_pretrained_dict = {}
+ for k, v in checkpoint['model'].items():
+ if k in model_dict:
+ pretrained_dict[k] = v
+ else:
+ unmatched_pretrained_dict[k] = v
+ model_dict.update(pretrained_dict)
+ model.load_state_dict(model_dict)
+ print(
+ 'Successfully loaded %d pretrained params; %d params were unmatched.'
+ %(len(pretrained_dict.keys()), len(unmatched_pretrained_dict.keys())))
+ print('Unmatched pretrained params:', unmatched_pretrained_dict.keys())
+ return model
+
+def convnext_xlarge(pretrained=True, in_22k=False, **kwargs):
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
+ if pretrained:
+ assert in_22k, "only ImageNet-22K pre-trained ConvNeXt-XL is available; please set in_22k=True"
+ checkpoint = torch.load(kwargs['checkpoint'], map_location="cpu")
+ #url = model_urls['convnext_xlarge_22k']
+ #checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+ model_dict = model.state_dict()
+ pretrained_dict = {}
+ unmatched_pretrained_dict = {}
+ for k, v in checkpoint['model'].items():
+ if k in model_dict:
+ pretrained_dict[k] = v
+ else:
+ unmatched_pretrained_dict[k] = v
+ model_dict.update(pretrained_dict)
+ model.load_state_dict(model_dict)
+ print(
+ 'Successfully loaded %d pretrained params; %d params were unmatched.'
+ %(len(pretrained_dict.keys()), len(unmatched_pretrained_dict.keys())))
+ print('Unmatched pretrained params:', unmatched_pretrained_dict.keys())
+ return model
+
+if __name__ == '__main__':
+ import torch
+ model = convnext_base(pretrained=False).cuda() # pretrained=True would require passing a local `checkpoint=` path
+
+ rgb = torch.rand((2, 3, 256, 256)).cuda()
+ out = model(rgb)
+ print(len(out))
+ for i, ft in enumerate(out):
+ print(i, ft.shape)
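Editor's note: the smoke test above builds a randomly initialised ConvNeXt-Base and prints its feature pyramid. Given the stem's stride-4 patchify layer and the three stride-2 downsample layers, the expected shapes for the 2 x 3 x 256 x 256 input are:

# Expected output of the demo above (ConvNeXt-Base dims, effective strides 4/8/16/32):
for i, (channels, stride) in enumerate(zip([128, 256, 512, 1024], [4, 8, 16, 32])):
    print(i, (2, channels, 256 // stride, 256 // stride))
# 0 (2, 128, 64, 64)
# 1 (2, 256, 32, 32)
# 2 (2, 512, 16, 16)
# 3 (2, 1024, 8, 8)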
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ViT_DINO.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ViT_DINO.py
new file mode 100644
index 0000000000000000000000000000000000000000..909576875b8fe0c7b9c301338e277ad8d616b52f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ViT_DINO.py
@@ -0,0 +1,1489 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable, Optional, Dict, Any, List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F  # used by SwiGLUFFN.forward (F.silu)
+from torch import Tensor
+import torch.utils.checkpoint
+from torch.nn.init import trunc_normal_
+
+#from dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+
+logger = logging.getLogger("dinov2")
+
+class ConvBlock(nn.Module):
+ def __init__(self, channels):
+ super(ConvBlock, self).__init__()
+
+ self.act = nn.ReLU(inplace=True)
+ self.conv1 = nn.Conv2d(
+ channels,
+ channels,
+ kernel_size=3,
+ stride=1,
+ padding=1
+ )
+ self.norm1 = nn.BatchNorm2d(channels)
+ self.conv2 = nn.Conv2d(
+ channels,
+ channels,
+ kernel_size=3,
+ stride=1,
+ padding=1
+ )
+ self.norm2 = nn.BatchNorm2d(channels)
+
+ def forward(self, x):
+
+ out = self.norm1(x)
+ out = self.act(out)
+ out = self.conv1(out)
+ out = self.norm2(out)
+ out = self.act(out)
+ out = self.conv2(out)
+ return x + out
+
+def make_2tuple(x):
+ if isinstance(x, tuple):
+ assert len(x) == 2
+ return x
+
+ assert isinstance(x, int)
+ return (x, x)
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+ if drop_prob == 0.0 or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0:
+ random_tensor.div_(keep_prob)
+ output = x * random_tensor
+ return output
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+class LayerScale(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ init_values: Union[float, Tensor] = 1e-5,
+ inplace: bool = False,
+ ) -> None:
+ super().__init__()
+ self.inplace = inplace
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+ def forward(self, x: Tensor) -> Tensor:
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class PatchEmbed(nn.Module):
+ """
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+
+ Args:
+ img_size: Image size.
+ patch_size: Patch token size.
+ in_chans: Number of input image channels.
+ embed_dim: Number of linear projection output channels.
+ norm_layer: Normalization layer.
+ """
+
+ def __init__(
+ self,
+ img_size: Union[int, Tuple[int, int]] = 224,
+ patch_size: Union[int, Tuple[int, int]] = 16,
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ norm_layer: Optional[Callable] = None,
+ flatten_embedding: bool = True,
+ ) -> None:
+ super().__init__()
+
+ image_HW = make_2tuple(img_size)
+ patch_HW = make_2tuple(patch_size)
+ patch_grid_size = (
+ image_HW[0] // patch_HW[0],
+ image_HW[1] // patch_HW[1],
+ )
+
+ self.img_size = image_HW
+ self.patch_size = patch_HW
+ self.patches_resolution = patch_grid_size
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+
+ self.flatten_embedding = flatten_embedding
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+ def forward(self, x: Tensor) -> Tensor:
+ _, _, H, W = x.shape
+ patch_H, patch_W = self.patch_size
+
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
+
+ x = self.proj(x) # B C H W
+ H, W = x.size(2), x.size(3)
+ x = x.flatten(2).transpose(1, 2) # B HW C
+ x = self.norm(x)
+ if not self.flatten_embedding:
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
+ return x
+
+ def flops(self) -> float:
+ Ho, Wo = self.patches_resolution
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+ if self.norm is not None:
+ flops += Ho * Wo * self.embed_dim
+ return flops
+
+class Mlp(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x: Tensor) -> Tensor:
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class SwiGLUFFN(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+
+ def forward(self, x: Tensor) -> Tensor:
+ x12 = self.w12(x)
+ x1, x2 = x12.chunk(2, dim=-1)
+ hidden = F.silu(x1) * x2
+ return self.w3(hidden)
+
+
+try:
+ # also pull in the ops used by the memory-efficient / nested-tensor paths further down
+ from xformers.ops import SwiGLU, memory_efficient_attention, unbind, fmha, scaled_index_add, index_select_cat
+ #import numpy.bool
+ XFORMERS_AVAILABLE = True
+except ImportError:
+ SwiGLU = SwiGLUFFN
+ XFORMERS_AVAILABLE = False
+
+class SwiGLUFFNFused(SwiGLU):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+ super().__init__(
+ in_features=in_features,
+ hidden_features=hidden_features,
+ out_features=out_features,
+ bias=bias,
+ )
+
+
+# Force-disable the xFormers code paths so the plain PyTorch attention/FFN fallbacks above are always used.
+XFORMERS_AVAILABLE = False
+
+
+class Attention(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int = 8,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ attn_drop: float = 0.0,
+ proj_drop: float = 0.0,
+ window_size: int = 0,
+ ) -> None:
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim**-0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ #if not self.training:
+ #
+ # self.attn = ScaledDotProduct()
+ #self.attn = MultiHeadDispatch(dim_model=EMB, residual_dropout=DROPOUT, num_heads=HEADS, attention=attn)
+
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+ attn = q @ k.transpose(-2, -1)
+
+ if attn_bias is not None:
+ attn = attn + attn_bias[:, :, :N]
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class MemEffAttention(Attention):
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ if not XFORMERS_AVAILABLE:
+ #if True:
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
+ return super().forward(x, attn_bias)
+
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+
+ q, k, v = unbind(qkv, 2)
+ if attn_bias is not None:
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias[:, :, :N])
+ else:
+ x = memory_efficient_attention(q, k, v)
+ x = x.reshape([B, N, C])
+
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+XFORMERS_AVAILABLE = False # redundant; already forced off above
+
+class Block(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ ffn_bias: bool = True,
+ drop: float = 0.0,
+ attn_drop: float = 0.0,
+ init_values = None,
+ drop_path: float = 0.0,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+ attn_class: Callable[..., nn.Module] = Attention,
+ ffn_layer: Callable[..., nn.Module] = Mlp,
+ ) -> None:
+ super().__init__()
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
+ self.norm1 = norm_layer(dim)
+ self.attn = attn_class(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ )
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = ffn_layer(
+ in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop,
+ bias=ffn_bias,
+ )
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.sample_drop_ratio = drop_path
+
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ def attn_residual_func(x: Tensor, attn_bias) -> Tensor:
+ return self.ls1(self.attn(self.norm1(x), attn_bias))
+
+ def ffn_residual_func(x: Tensor) -> Tensor:
+ return self.ls2(self.mlp(self.norm2(x)))
+
+ if self.training and self.sample_drop_ratio > 0.1:
+ # the overhead is compensated only for a drop path rate larger than 0.1
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ attn_bias=attn_bias
+ )
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ )
+ elif self.training and self.sample_drop_ratio > 0.0:
+ x = x + self.drop_path1(attn_residual_func(x, attn_bias))
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
+ else:
+ x = x + attn_residual_func(x, attn_bias)
+ x = x + ffn_residual_func(x)
+ return x
+
+
+def drop_add_residual_stochastic_depth(
+ x: Tensor,
+ residual_func: Callable[[Tensor], Tensor],
+ sample_drop_ratio: float = 0.0, attn_bias=None
+) -> Tensor:
+ # 1) extract subset using permutation
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ x_subset = x[brange]
+
+ # 2) apply residual_func to get residual
+ residual = residual_func(x_subset, attn_bias)
+
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+
+ residual_scale_factor = b / sample_subset_size
+
+ # 3) add the residual
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ return x_plus_residual.view_as(x)
+
+
+def get_branges_scales(x, sample_drop_ratio=0.0):
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ residual_scale_factor = b / sample_subset_size
+ return brange, residual_scale_factor
+
+
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+ if scaling_vector is None:
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ else:
+ x_plus_residual = scaled_index_add(
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+ )
+ return x_plus_residual
+
+
+attn_bias_cache: Dict[Tuple, Any] = {}
+
+
+def get_attn_bias_and_cat(x_list, branges=None):
+ """
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
+ """
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+ if all_shapes not in attn_bias_cache.keys():
+ seqlens = []
+ for b, x in zip(batch_sizes, x_list):
+ for _ in range(b):
+ seqlens.append(x.shape[1])
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+ attn_bias._batch_sizes = batch_sizes
+ attn_bias_cache[all_shapes] = attn_bias
+
+ if branges is not None:
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
+ else:
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
+
+ return attn_bias_cache[all_shapes], cat_tensors
+
+
+def drop_add_residual_stochastic_depth_list(
+ x_list: List[Tensor],
+ residual_func: Callable[[Tensor, Any], Tensor],
+ sample_drop_ratio: float = 0.0,
+ scaling_vector=None,
+) -> Tensor:
+ # 1) generate random set of indices for dropping samples in the batch
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+ branges = [s[0] for s in branges_scales]
+ residual_scale_factors = [s[1] for s in branges_scales]
+
+ # 2) get attention bias and index+concat the tensors
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+
+ # 3) apply residual_func to get residual, and split the result
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
+
+ outputs = []
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
+ return outputs
+
+
+class NestedTensorBlock(Block):
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+ """
+ x_list contains a list of tensors to nest together and run
+ """
+ assert isinstance(self.attn, MemEffAttention)
+
+ if self.training and self.sample_drop_ratio > 0.0:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.mlp(self.norm2(x))
+
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
+ )
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
+ )
+ return x_list
+ else:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls2(self.mlp(self.norm2(x)))
+
+ attn_bias, x = get_attn_bias_and_cat(x_list)
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
+ x = x + ffn_residual_func(x)
+ return attn_bias.split(x)
+
+ def forward(self, x_or_x_list, attn_bias=None):
+ if isinstance(x_or_x_list, Tensor):
+ return super().forward(x_or_x_list, attn_bias)
+ elif isinstance(x_or_x_list, list):
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
+ return self.forward_nested(x_or_x_list)
+ else:
+ raise AssertionError
+
+
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+ if not depth_first and include_root:
+ fn(module=module, name=name)
+ for child_name, child_module in module.named_children():
+ child_name = ".".join((name, child_name)) if name else child_name
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+ if depth_first and include_root:
+ fn(module=module, name=name)
+ return module
+
+
+class BlockChunk(nn.ModuleList):
+ def forward(self, x, others=None):
+ for b in self:
+ if others is None:
+ x = b(x)
+ else:
+ x = b(x, others)
+ return x
+
+
+class DinoVisionTransformer(nn.Module):
+ def __init__(
+ self,
+ img_size=224,
+ patch_size=16,
+ in_chans=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ ffn_bias=True,
+ proj_bias=True,
+ drop_path_rate=0.0,
+ drop_path_uniform=False,
+ #init_values=None, # for layerscale: None or 0 => no layerscale
+ init_values=1e-5, # for layerscale: None or 0 => no layerscale
+ embed_layer=PatchEmbed,
+ act_layer=nn.GELU,
+ block_fn=NestedTensorBlock,
+ ffn_layer="mlp",
+ block_chunks=1,
+ window_size=37,
+ **kwargs
+ ):
+ """
+ Args:
+ img_size (int, tuple): input image size
+ patch_size (int, tuple): patch size
+ in_chans (int): number of input channels
+ embed_dim (int): embedding dimension
+ depth (int): depth of transformer
+ num_heads (int): number of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ proj_bias (bool): enable bias for proj in attn if True
+ ffn_bias (bool): enable bias for ffn if True
+ drop_path_rate (float): stochastic depth rate
+ drop_path_uniform (bool): apply uniform drop rate across blocks
+ weight_init (str): weight init scheme
+ init_values (float): layer-scale init values
+ embed_layer (nn.Module): patch embedding layer
+ act_layer (nn.Module): MLP activation layer
+ block_fn (nn.Module): transformer block class
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+ """
+ super().__init__()
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
+
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.num_tokens = 1
+ self.n_blocks = depth
+ self.num_heads = num_heads
+ self.patch_size = patch_size
+ self.window_size = window_size
+
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+
+ if drop_path_uniform is True:
+ dpr = [drop_path_rate] * depth
+ else:
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+
+ if ffn_layer == "mlp":
+ logger.info("using MLP layer as FFN")
+ ffn_layer = Mlp
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+ logger.info("using SwiGLU layer as FFN")
+ ffn_layer = SwiGLUFFNFused
+ elif ffn_layer == "identity":
+ logger.info("using Identity layer as FFN")
+
+ def f(*args, **kwargs):
+ return nn.Identity()
+
+ ffn_layer = f
+ else:
+ raise NotImplementedError
+
+ blocks_list = [
+ block_fn(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ ffn_bias=ffn_bias,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ act_layer=act_layer,
+ ffn_layer=ffn_layer,
+ init_values=init_values,
+ )
+ for i in range(depth)
+ ]
+ if block_chunks > 0:
+ self.chunked_blocks = True
+ chunked_blocks = []
+ chunksize = depth // block_chunks
+ for i in range(0, depth, chunksize):
+ # this is to keep the block index consistent if we chunk the block list
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+ else:
+ self.chunked_blocks = False
+ self.blocks = nn.ModuleList(blocks_list)
+
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Identity()
+
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+
+ self.init_weights()
+
+ def init_weights(self):
+ trunc_normal_(self.pos_embed, std=0.02)
+ nn.init.normal_(self.cls_token, std=1e-6)
+ named_apply(init_weights_vit_timm, self)
+
+ def interpolate_pos_encoding(self, x, w, h):
+ previous_dtype = x.dtype
+ npatch = x.shape[1] - 1
+ N = self.pos_embed.shape[1] - 1
+ if npatch == N and w == h:
+ return self.pos_embed
+ pos_embed = self.pos_embed.float()
+ class_pos_embed = pos_embed[:, 0]
+ patch_pos_embed = pos_embed[:, 1:]
+ dim = x.shape[-1]
+ w0 = w // self.patch_size
+ h0 = h // self.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+ w0, h0 = w0 + 0.1, h0 + 0.1
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
+ scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
+ mode="bicubic",
+ )
+
+ assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+
+ def prepare_tokens_with_masks(self, x, masks=None):
+ B, nc, w, h = x.shape
+ x = self.patch_embed(x)
+ if masks is not None:
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+ x = x + self.interpolate_pos_encoding(x, w, h)
+
+ return x
+
+ def forward_features_list(self, x_list, masks_list):
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+ for blk in self.blocks:
+ x = blk(x)
+
+ all_x = x
+ output = []
+ for x, masks in zip(all_x, masks_list):
+ x_norm = self.norm(x)
+ output.append(
+ {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_patchtokens": x_norm[:, 1:],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+ )
+ return output
+
+ def forward_features(self, x, masks=None):
+ if isinstance(x, list):
+ return self.forward_features_list(x, masks)
+
+ B, C, H, W = x.size()
+ pad_h = (self.patch_size - H % self.patch_size)
+ pad_w = (self.patch_size - W % self.patch_size)
+ if pad_h == self.patch_size:
+ pad_h = 0
+ if pad_w == self.patch_size:
+ pad_w = 0
+ #x = nn.functional.pad(x, (pad_h//2, pad_h-pad_h//2, pad_w//2, pad_w-pad_w//2))
+ if pad_h + pad_w > 0:
+ x = torch.nn.functional.interpolate(x, (H+pad_h, W+pad_w), mode='bilinear')
+
+ x = self.prepare_tokens_with_masks(x, masks)
+
+ features = []
+ for blk in self.blocks:
+ x = blk(x)
+ # for idx in range(len(self.blocks[0])):
+ # x = self.blocks[0][idx](x)
+ # if (idx + 1) % (len(self.blocks[0]) // 4) == 0:
+ # features.append(x)
+
+ #return [features, (B, (H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size, H, W)]
+
+ x_norm = self.norm(x)
+ # return {
+ # "x_norm_clstoken": x_norm[:, 0],
+ # "x_norm_patchtokens": x_norm[:, 1:],
+ # "x_prenorm": x,
+ # "masks": masks,
+ # }
+ features = []
+ features.append(x_norm)
+ features.append(x_norm)
+ features.append(x_norm)
+ features.append(x_norm)
+ return [features, (B, (H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size, H, W)]
+
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ # If n is an int, take the n last blocks. If it's a list, take them
+ output, total_block_len = [], len(self.blocks)
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def _get_intermediate_layers_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
+ # If n is an int, take the n last blocks. If it's a list, take them
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for block_chunk in self.blocks:
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ i += 1
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def get_intermediate_layers(
+ self,
+ x: torch.Tensor,
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
+ reshape: bool = False,
+ return_class_token: bool = False,
+ norm=True,
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+ if self.chunked_blocks:
+ outputs = self._get_intermediate_layers_chunked(x, n)
+ else:
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
+ if norm:
+ outputs = [self.norm(out) for out in outputs]
+ class_tokens = [out[:, 0] for out in outputs]
+ outputs = [out[:, 1:] for out in outputs]
+ if reshape:
+ B, _, w, h = x.shape
+ outputs = [
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+ for out in outputs
+ ]
+ if return_class_token:
+ return tuple(zip(outputs, class_tokens))
+ return tuple(outputs)
+
+ def forward(self, *args, is_training=False, **kwargs):
+ ret = self.forward_features(*args, **kwargs)
+ return ret
+ # if is_training:
+ # return ret
+ # else:
+ # return self.head(ret["x_norm_clstoken"])
+
+
+class PosConv(nn.Module):
+ # PEG from https://arxiv.org/abs/2102.10882
+ def __init__(self, in_chans, embed_dim=768, stride=1):
+ super(PosConv, self).__init__()
+ self.proj = nn.Sequential(
+ nn.Conv2d(in_chans, embed_dim, 37, stride, 18, bias=True, groups=embed_dim),
+ )
+ self.stride = stride
+
+ def forward(self, x, size):
+ B, N, C = x.shape
+ cnn_feat_token = x.transpose(1, 2).view(B, C, *size)
+ x = self.proj(cnn_feat_token)
+ if self.stride == 1:
+ x += cnn_feat_token
+ x = x.flatten(2).transpose(1, 2)
+ return x
+
+ #def no_weight_decay(self):
+ #return ['proj.%d.weight' % i for i in range(4)]
+
+class DinoWindowVisionTransformer(nn.Module):
+ def __init__(
+ self,
+ img_size=224,
+ patch_size=16,
+ in_chans=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ ffn_bias=True,
+ proj_bias=True,
+ drop_path_rate=0.0,
+ drop_path_uniform=False,
+ #init_values=None, # for layerscale: None or 0 => no layerscale
+ init_values=1e-5, # for layerscale: None or 0 => no layerscale
+ embed_layer=PatchEmbed,
+ act_layer=nn.GELU,
+ block_fn=NestedTensorBlock,
+ ffn_layer="mlp",
+ block_chunks=1,
+ window_size=7,
+ **kwargs
+ ):
+ """
+ Args:
+ img_size (int, tuple): input image size
+ patch_size (int, tuple): patch size
+ in_chans (int): number of input channels
+ embed_dim (int): embedding dimension
+ depth (int): depth of transformer
+ num_heads (int): number of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ proj_bias (bool): enable bias for proj in attn if True
+ ffn_bias (bool): enable bias for ffn if True
+ drop_path_rate (float): stochastic depth rate
+ drop_path_uniform (bool): apply uniform drop rate across blocks
+ weight_init (str): weight init scheme
+ init_values (float): layer-scale init values
+ embed_layer (nn.Module): patch embedding layer
+ act_layer (nn.Module): MLP activation layer
+ block_fn (nn.Module): transformer block class
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+ """
+ super().__init__()
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
+
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.num_tokens = 1
+ self.n_blocks = depth
+ self.num_heads = num_heads
+ self.patch_size = patch_size
+
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ #self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ #self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+
+ self.pos_conv = PosConv(self.embed_dim, self.embed_dim)
+
+ self.window_size = window_size
+ #self.conv_block = nn.ModuleList([ConvBlock(embed_dim) for i in range(4)])
+ #self.conv_block = nn.ModuleList([nn.Identity() for i in range(4)])
+
+ if drop_path_uniform is True:
+ dpr = [drop_path_rate] * depth
+ else:
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+
+ if ffn_layer == "mlp":
+ logger.info("using MLP layer as FFN")
+ ffn_layer = Mlp
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+ logger.info("using SwiGLU layer as FFN")
+ ffn_layer = SwiGLUFFNFused
+ elif ffn_layer == "identity":
+ logger.info("using Identity layer as FFN")
+
+ def f(*args, **kwargs):
+ return nn.Identity()
+
+ ffn_layer = f
+ else:
+ raise NotImplementedError
+
+ blocks_list = [
+ block_fn(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ ffn_bias=ffn_bias,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ act_layer=act_layer,
+ ffn_layer=ffn_layer,
+ init_values=init_values,
+ )
+ for i in range(depth)
+ ]
+ if block_chunks > 0:
+ self.chunked_blocks = True
+ chunked_blocks = []
+ chunksize = depth // block_chunks
+ for i in range(0, depth, chunksize):
+ # this is to keep the block index consistent if we chunk the block list
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+ else:
+ self.chunked_blocks = False
+ self.blocks = nn.ModuleList(blocks_list)
+
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Identity()
+
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+
+ self.nh = -1
+ self.nw = -1
+ try:
+ H = cfg.data_basic['crop_size'][0]
+ W = cfg.data_basic['crop_size'][1]
+ pad_h = (self.patch_size - H % self.patch_size)
+ pad_w = (self.patch_size - W % self.patch_size)
+ if pad_h == self.patch_size:
+ pad_h = 0
+ if pad_w == self.patch_size:
+ pad_w = 0
+ self.nh = (H + pad_h) // self.patch_size
+ self.nw = (W + pad_w) // self.patch_size
+ self.prepare_attn_bias((self.nh, self.nw))
+ except:
+ pass # `cfg` is not defined in this module, so the attention-bias pre-build above is skipped and nh/nw stay -1
+ self.init_weights()
+
+ self.total_step = 10000 # For PE -> GPE transfer
+ self.start_step = 2000
+ self.current_step = 20000
+
+ def init_weights(self):
+ #trunc_normal_(self.pos_embed, std=0.02)
+ #nn.init.normal_(self.cls_token, std=1e-6)
+ named_apply(init_weights_vit_timm, self)
+ for i in range(4):
+ try:
+ nn.init.constant_(self.conv_block[i].conv2.weight, 0.0)
+ except:
+ pass # self.conv_block is commented out in __init__, so this init is effectively a no-op
+
+ def interpolate_pos_encoding(self, x, w, h):
+ previous_dtype = x.dtype
+ #npatch = x.shape[1] - 1
+ #N = self.pos_embed.shape[1] - 1
+ npatch = x.shape[1]
+ N = self.pos_embed.shape[1]
+ if npatch == N and w == h:
+ return self.pos_embed
+ pos_embed = self.pos_embed.float()
+ #class_pos_embed = pos_embed[:, 0]
+ #patch_pos_embed = pos_embed[:, 1:]
+ patch_pos_embed = pos_embed
+ dim = x.shape[-1]
+ w0 = w // self.patch_size
+ h0 = h // self.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+ w0, h0 = w0 + 0.1, h0 + 0.1
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
+ scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
+ mode="bicubic",
+ )
+
+ assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return patch_pos_embed.to(previous_dtype)
+ #return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+
+ def window_partition(self, x: torch.Tensor, window_size: int, hw: Tuple[int, int], conv_feature=False) -> Tuple[torch.Tensor, Tuple[int, int]]:
+        """
+        Partition the token sequence into non-overlapping windows.
+        Args:
+            x (tensor): input tokens with [B, N, C], or [B, C, H, W] when conv_feature is True.
+            window_size (int): window size.
+            hw (Tuple): height and width (H, W) of the token grid.
+
+        Returns:
+            windows: windows after partition with [B * num_windows, window_size * window_size, C].
+        """
+ if conv_feature == False:
+ B, N, C = x.shape
+ H, W = hw[0], hw[1]
+
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size * window_size, C)
+ else:
+ B, C, H, W = x.shape
+
+ x = x.view(B, C, H // window_size, window_size, W // window_size, window_size)
+
+ windows = x.permute(0, 2, 4, 3, 5, 1).contiguous().view(-1, window_size * window_size, C)
+
+ #y = torch.cat((x_cls, windows), dim=1)
+ return windows #, (Hp, Wp)
+
+
+ def window_unpartition(self,
+ windows: torch.Tensor, window_size: int, hw: Tuple[int, int], conv_feature=False
+ ) -> torch.Tensor:
+        """
+        Reverse window partitioning back into the original token sequence.
+        Args:
+            windows (tensor): window tokens with [B * num_windows, window_size * window_size, C].
+            window_size (int): window size.
+            hw (Tuple): height and width (H, W) of the token grid.
+
+        Returns:
+            x: unpartitioned tokens with [B, H * W, C], or [B, C, H, W] when conv_feature is True.
+        """
+ H, W = hw
+
+ B = windows.shape[0] // (H * W // window_size // window_size)
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+
+ if conv_feature == False:
+            x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H * W, -1)  # grid is H x W (no padded Hp/Wp here)
+ else:
+ C = windows.shape[-1]
+ x = x.permute(0, 5, 1, 3, 2, 4).contiguous().view(B, C, H, W)
+
+ # if Hp > H or Wp > W:
+ # x = x[:, :H, :W, :].contiguous()
+ return x
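+
+    # Shape sketch (illustrative, not part of the original code): with a 74x74 token grid
+    # and window_size=37, window_partition maps (B, 74*74, C) -> (B*4, 37*37, C) and
+    # window_unpartition inverts that back to (B, 74*74, C).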
+
+ def prepare_tokens_with_masks(self, x, masks=None, step=-1):
+ B, nc, w, h = x.shape
+ x = self.patch_embed(x)
+ if masks is not None:
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+
+ #x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+ if step == -1:
+ step = self.current_step
+ else:
+ self.current_step = step
+
+ if step < self.start_step:
+ coef = 0.0
+ elif step < self.total_step:
+ coef = (step - self.start_step) / (self.total_step - self.start_step)
+ else:
+ coef = 1.0
+
+ x = x + (1 - coef) * self.interpolate_pos_encoding(x, w, h) + coef * self.pos_conv(x, (self.nh, self.nw))
+
+ return x
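+
+    # Note on the schedule above (illustrative): the blending coefficient ramps linearly from
+    # 0 to 1 between start_step and total_step, e.g. with start_step=2000 and total_step=10000
+    # a call at step 6000 gives coef = (6000 - 2000) / (10000 - 2000) = 0.5, an equal mix of the
+    # interpolated positional embedding and the conv-based one. The default current_step (20000)
+    # exceeds total_step, so without an explicit step the model uses coef = 1.0 (pure pos_conv).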
+
+ def prepare_attn_bias(self, shape):
+ window_size = self.window_size
+ if window_size <= 0:
+ return
+
+ import xformers.components.attention.attention_patterns as AP
+
+ nh, nw = shape
+ radius = (window_size-1)//2
+ mask_ori = AP.local_2d_pattern(nh, nw, distance = radius + 0.1, p=torch.inf).cuda()
+
+ pad = (8 - (nh * nw) % 8)
+ if pad == 8:
+ pad = 0
+ mask_pad = nn.functional.pad(mask_ori, (0, pad)).contiguous()
+ if pad > 0:
+ mask = mask_pad[:, :-pad].view(nh, nw, nh, nw)
+ else:
+ mask = mask_pad[:, :].view(nh, nw, nh, nw)
+
+ # angle
+ mask[:radius+1, :radius+1, :window_size, :window_size] = True
+ mask[:radius+1, -radius-1:, :window_size, -window_size:] = True
+ mask[-radius-1:, :radius+1, -window_size:, :window_size] = True
+ mask[-radius-1:, -radius-1:, -window_size:, -window_size:] = True
+
+ # edge
+ mask[radius+1:-radius-1, :radius+1, :, :] = mask[radius+1:-radius-1, radius:radius+1, :, :]
+ mask[radius+1:-radius-1, -radius-1:, :, :] = mask[radius+1:-radius-1, -radius-1:-radius, :, :]
+ mask[:radius+1, radius+1:-radius-1, :, :] = mask[radius:radius+1, radius+1:-radius-1, :, :]
+ mask[-radius-1:, radius+1:-radius-1, :, :] = mask[-radius-1:-radius, radius+1:-radius-1, :, :]
+
+ mask = mask.view(nh*nw, nh*nw)
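+        # torch.log on the boolean mask gives an additive attention bias: log(True) = 0 keeps a
+        # position, log(False) = -inf removes it after the softmax.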
+ bias_pad = torch.log(mask_pad)
+ #bias = bias_pad[:, :-pad]
+ self.register_buffer('attn_bias', bias_pad)
+
+ return bias_pad
+
+ def forward_features_list(self, x_list, masks_list):
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+ for blk in self.blocks:
+ x = blk(x)
+
+ all_x = x
+ output = []
+ for x, masks in zip(all_x, masks_list):
+ x_norm = self.norm(x)
+ output.append(
+ {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_patchtokens": x_norm[:, 1:],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+ )
+ return output
+
+ def forward_features(self, x, masks=None, **kwargs):
+ if isinstance(x, list):
+ return self.forward_features_list(x, masks)
+
+ B, C, H, W = x.size()
+ pad_h = (self.patch_size - H % self.patch_size)
+ pad_w = (self.patch_size - W % self.patch_size)
+ if pad_h == self.patch_size:
+ pad_h = 0
+ if pad_w == self.patch_size:
+ pad_w = 0
+ #x = nn.functional.pad(x, (pad_h//2, pad_h-pad_h//2, pad_w//2, pad_w-pad_w//2))
+ if pad_h + pad_w > 0:
+ x = torch.nn.functional.interpolate(x, (H+pad_h, W+pad_w), mode='bilinear')
+
+ nh = (H+pad_h)//self.patch_size
+ nw = (W+pad_w)//self.patch_size
+
+ if self.window_size > 0:
+ if nh == self.nh and nw == self.nw:
+ attn_bias = self.attn_bias
+ else:
+ attn_bias = self.prepare_attn_bias(((H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size))
+ self.nh = nh
+ self.nw = nw
+ attn_bias = attn_bias.unsqueeze(0).repeat(B * self.num_heads, 1, 1)
+ else:
+ attn_bias = None
+
+ x = self.prepare_tokens_with_masks(x, masks)
+ #x = self.patch_embed(x)
+
+ features = []
+ #x = self.window_partition(x, self.window_size, (H // self.patch_size, W // self.patch_size))
+ for blk in self.blocks:
+ x = blk(x, attn_bias)
+ #x = self.window_unpartition(x, self.window_size, (H // self.patch_size, W // self.patch_size))
+
+ # for idx in range(len(self.blocks[0])):
+ # x = self.blocks[0][idx](x, attn_bias)
+
+ # if (idx + 1) % (len(self.blocks[0]) // 4) == 0:
+ # x = self.window_unpartition(x, self.window_size, (H // self.patch_size, W // self.patch_size), conv_feature=True)
+ # x = self.conv_block[idx // (len(self.blocks[0]) // 4)](x)
+ # if idx + 1 != len(self.blocks[0]):
+ # x = self.window_partition(x, self.window_size, (H // self.patch_size, W // self.patch_size), conv_feature=True)
+ # else:
+ # b, c, h, w = x.size()
+ # x = x.permute(0, 2, 3, 1).contiguous().view(b, h, w, c)
+ #features.append(x)
+
+ #return [features, (B, (H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size, H, W)]
+
+ x_norm = self.norm(x)
+ # return {
+ # "x_norm_clstoken": x_norm[:, 0],
+ # "x_norm_patchtokens": x_norm[:, 1:],
+ # "x_prenorm": x,
+ # "masks": masks,
+ # }
+ features = []
+ features.append(x_norm)
+ features.append(x_norm)
+ features.append(x_norm)
+ features.append(x_norm)
+ return [features, (B, (H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size, H, W)]
+
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ # If n is an int, take the n last blocks. If it's a list, take them
+ output, total_block_len = [], len(self.blocks)
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def _get_intermediate_layers_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
+ # If n is an int, take the n last blocks. If it's a list, take them
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for block_chunk in self.blocks:
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ i += 1
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def get_intermediate_layers(
+ self,
+ x: torch.Tensor,
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
+ reshape: bool = False,
+ return_class_token: bool = False,
+ norm=True,
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+ if self.chunked_blocks:
+ outputs = self._get_intermediate_layers_chunked(x, n)
+ else:
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
+ if norm:
+ outputs = [self.norm(out) for out in outputs]
+ class_tokens = [out[:, 0] for out in outputs]
+ outputs = [out[:, 1:] for out in outputs]
+ if reshape:
+ B, _, w, h = x.shape
+ outputs = [
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+ for out in outputs
+ ]
+ if return_class_token:
+ return tuple(zip(outputs, class_tokens))
+ return tuple(outputs)
+
+ def forward(self, *args, is_training=False, **kwargs):
+ ret = self.forward_features(*args, **kwargs)
+ return ret
+ # if is_training:
+ # return ret
+ # else:
+ # return self.head(ret["x_norm_clstoken"])
+
+
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+ """ViT weight initialization, original timm impl (for reproducibility)"""
+ if isinstance(module, nn.Linear):
+ trunc_normal_(module.weight, std=0.02)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
+
+
+def vit_small(patch_size=14, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=384,
+ depth=12,
+ num_heads=6,
+ mlp_ratio=4,
+ block_fn=partial(NestedTensorBlock, attn_class=MemEffAttention),
+ **kwargs,
+ )
+ return model
+
+
+def vit_base(patch_size=14, **kwargs):
+ model = DinoWindowVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ block_fn=partial(NestedTensorBlock, attn_class=MemEffAttention),
+ **kwargs,
+ )
+ return model
+
+
+def vit_large(patch_size=14, checkpoint=None, **kwargs):
+ model = DinoVisionTransformer(
+ img_size = 518,
+ patch_size=patch_size,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ block_fn=partial(NestedTensorBlock, attn_class=MemEffAttention),
+ **kwargs,
+ )
+
+ if checkpoint is not None:
+ with open(checkpoint, "rb") as f:
+ state_dict = torch.load(f)
+ try:
+ model.load_state_dict(state_dict, strict=True)
+ except:
+ new_state_dict = {}
+ for key, value in state_dict.items():
+ if 'blocks' in key:
+ key_new = 'blocks.0' + key[len('blocks'):]
+ else:
+ key_new = key
+ new_state_dict[key_new] = value
+
+ model.load_state_dict(new_state_dict, strict=True)
+ #del model.norm
+ del model.mask_token
+ return model
+
+ # model = DinoWindowVisionTransformer(
+ # img_size = 518,
+ # patch_size=patch_size,
+ # embed_dim=1024,
+ # depth=24,
+ # num_heads=16,
+ # mlp_ratio=4,
+ # block_fn=partial(NestedTensorBlock, attn_class=MemEffAttention),
+ # window_size=37,
+ # **kwargs,
+ # )
+
+ # if checkpoint is not None:
+ # with open(checkpoint, "rb") as f:
+ # state_dict = torch.load(f)
+ # try:
+ # model.load_state_dict(state_dict, strict=True)
+ # except:
+ # new_state_dict = {}
+ # for key, value in state_dict.items():
+ # if 'blocks' in key:
+ # key_new = 'blocks.0' + key[len('blocks'):]
+ # else:
+ # key_new = key
+ # if 'pos_embed' in key:
+ # value = value[:, 1:, :]
+ # new_state_dict[key_new] = value
+
+ # model.load_state_dict(new_state_dict, strict=False)
+ # #del model.norm
+ # del model.mask_token
+ return model
+
+
+def vit_giant2(patch_size=16, **kwargs):
+ """
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+ """
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1536,
+ depth=40,
+ num_heads=24,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ **kwargs,
+ )
+ return model
+
+if __name__ == '__main__':
+ try:
+ from custom_mmpkg.custom_mmcv.utils import Config
+ except:
+ from mmengine import Config
+
+ #rgb = torch.rand((2, 3, 518, 518)).cuda()
+
+ #cfg.data_basic['crop_size']['0']
+ #cfg.data_basic['crop_size']['1']
+ cfg = Config.fromfile('mu.hu/monodepth/mono/configs/HourglassDecoder/pub12.convlarge.0.3_150.py')
+
+ #rgb = torch.arange(0, 2*3*1036*1036, 1).cuda().float().view(2, 3, 1036, 1036)
+ rgb = torch.zeros(1, 3, 1400, 1680).cuda()
+ model = vit_large(checkpoint="pretrained_weight_repo/vit/dinov2_vitl14_pretrain.pth", kwarg=cfg).cuda()
+
+ #import timm
+ #model2 = timm.models.vision_transformer.vit_large_patch14_dinov2().cuda()
+ #timm.models.load_checkpoint(model2, '/cpfs02/shared/public/yvan/pretrained_weight_repo/vit/dinov2_vitl14_pretrain.pth', filter_fn=timm.models.vision_transformer.checkpoint_filter_fn)
+
+ out1 = model(rgb)
+ #out2 = model2(rgb)
+ temp = 0
+
+
+
+# import time
+# window_size = 37
+# def prepare_window_masks(shape):
+# if window_size <= 0:
+# return None
+# import xformers.components.attention.attention_patterns as AP
+
+# B, nh, nw, _, _ = shape
+# radius = (window_size-1)//2
+# #time0 = time.time()
+# d = AP.local_nd_distance(nh, nw, distance = radius + 0.1, p=torch.inf).cuda()
+# #mask = AP.local_2d_pattern(nh, nw, distance = radius + 0.1, p=torch.inf).cuda()
+# # mask = mask.view(nh, nw, nh, nw)
+# # #time1 = time.time() - time0
+
+# # # angle
+# # mask[:radius+1, :radius+1, :window_size, :window_size] = True
+# # mask[:radius+1, -radius-1:, :window_size, -window_size:] = True
+# # mask[-radius-1:, :radius+1, -window_size:, :window_size] = True
+# # mask[-radius-1:, -radius-1:, -window_size:, -window_size:] = True
+# # time2 = time.time() - time0 - time1
+
+# # # edge
+# # mask[radius+1:-radius-1, :radius+1, :, :] = mask[radius+1:-radius-1, radius:radius+1, :, :]
+# # mask[radius+1:-radius-1, -radius-1:, :, :] = mask[radius+1:-radius-1, -radius-1:-radius, :, :]
+# # mask[:radius+1, radius+1:-radius-1, :, :] = mask[radius:radius+1, radius+1:-radius-1, :, :]
+# # mask[-radius-1:, radius+1:-radius-1, :, :] = mask[-radius-1:-radius, radius+1:-radius-1, :, :]
+# # time3 = time.time() - time0 - time2
+# # print(time1, time2, time3)
+
+# # return mask.view(nw*nw, nh*nw).unsqueeze(0).repeat(B, 1)
+
+# shape = (1, 55, 55, None, None)
+# mask = prepare_window_masks(shape)
+# # temp = 1
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ViT_DINO_reg.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ViT_DINO_reg.py
new file mode 100644
index 0000000000000000000000000000000000000000..001b2a141d203a1998579523a00b5323bf31a321
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/ViT_DINO_reg.py
@@ -0,0 +1,1303 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable, Optional, Dict, Any, List
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+import torch.utils.checkpoint
+from torch.nn.init import trunc_normal_
+import torch.nn.init
+import torch.nn.functional as F
+
+#from dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+
+logger = logging.getLogger("dinov2")
+
+# SSF finetuning originally by dongzelian
+def init_ssf_scale_shift(dim):
+ scale = nn.Parameter(torch.ones(dim))
+ shift = nn.Parameter(torch.zeros(dim))
+
+ nn.init.normal_(scale, mean=1, std=.02)
+ nn.init.normal_(shift, std=.02)
+
+ return scale, shift
+
+def ssf_ada(x, scale, shift):
+ assert scale.shape == shift.shape
+ if x.shape[-1] == scale.shape[0]:
+ return x * scale + shift
+ elif x.shape[1] == scale.shape[0]:
+ return x * scale.view(1, -1, 1, 1) + shift.view(1, -1, 1, 1)
+ else:
+ raise ValueError('the input tensor shape does not match the shape of the scale factor.')
+
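+# Illustrative sketch (not part of the original code): SSF adapts frozen features with a
+# learnable per-channel scale and shift. The helper below is hypothetical and never called;
+# it only demonstrates the two tensor layouts ssf_ada accepts.
+def _ssf_ada_demo():
+    scale, shift = init_ssf_scale_shift(dim=8)
+    tokens = torch.randn(2, 16, 8)       # (B, N, C): scaled along the last dimension
+    fmap = torch.randn(2, 8, 14, 14)     # (B, C, H, W): scaled along the channel dimension
+    return ssf_ada(tokens, scale, shift).shape, ssf_ada(fmap, scale, shift).shape
+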
+# LoRA finetuning originally by edwardjhu
+class LoRALayer():
+ def __init__(
+ self,
+ r: int,
+ lora_alpha: int,
+ lora_dropout: float,
+ merge_weights: bool,
+ ):
+ self.r = r
+ self.lora_alpha = lora_alpha
+ # Optional dropout
+ if lora_dropout > 0.:
+ self.lora_dropout = nn.Dropout(p=lora_dropout)
+ else:
+ self.lora_dropout = lambda x: x
+ # Mark the weight as unmerged
+ self.merged = False
+ self.merge_weights = merge_weights
+
+class LoRALinear(nn.Linear, LoRALayer):
+ # LoRA implemented in a dense layer
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ r: int = 0,
+ lora_alpha: int = 1,
+ lora_dropout: float = 0.,
+ fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
+ merge_weights: bool = True,
+ **kwargs
+ ):
+ nn.Linear.__init__(self, in_features, out_features, **kwargs)
+ LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
+ merge_weights=merge_weights)
+
+ self.fan_in_fan_out = fan_in_fan_out
+ # Actual trainable parameters
+ if r > 0:
+ self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
+ self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
+ self.scaling = self.lora_alpha / self.r
+ # Freezing the pre-trained weight matrix
+ self.weight.requires_grad = False
+ self.reset_parameters()
+ if fan_in_fan_out:
+ self.weight.data = self.weight.data.transpose(0, 1)
+
+ def reset_parameters(self):
+ #nn.Linear.reset_parameters(self)
+ if hasattr(self, 'lora_A'):
+ # initialize B the same way as the default for nn.Linear and A to zero
+ # this is different than what is described in the paper but should not affect performance
+ nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
+ nn.init.zeros_(self.lora_B)
+
+ # def train(self, mode: bool = True):
+ # def T(w):
+ # return w.transpose(0, 1) if self.fan_in_fan_out else w
+ # nn.Linear.train(self, mode)
+ # if mode:
+ # if self.merge_weights and self.merged:
+ # # Make sure that the weights are not merged
+ # if self.r > 0:
+ # self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
+ # self.merged = False
+ # else:
+ # if self.merge_weights and not self.merged:
+ # # Merge the weights and mark it
+ # if self.r > 0:
+ # self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
+ # self.merged = True
+
+ def forward(self, x: torch.Tensor):
+ def T(w):
+ return w.transpose(0, 1) if self.fan_in_fan_out else w
+ if self.r > 0 and not self.merged:
+ result = F.linear(x, T(self.weight), bias=self.bias)
+ result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling
+ return result
+ else:
+ return F.linear(x, T(self.weight), bias=self.bias)
+
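+
+# Illustrative sketch (not part of the original code): LoRALinear keeps the dense weight frozen
+# and adds a trainable low-rank update scaled by lora_alpha / r. The helper below is hypothetical
+# and never called; it only shows the expected shapes.
+def _lora_linear_demo():
+    layer = LoRALinear(in_features=64, out_features=64, r=8, lora_alpha=16)
+    x = torch.randn(2, 10, 64)
+    y = layer(x)      # frozen dense path + (dropout(x) @ A^T @ B^T) * scaling
+    return y.shape    # torch.Size([2, 10, 64])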
+
+
+def make_2tuple(x):
+ if isinstance(x, tuple):
+ assert len(x) == 2
+ return x
+
+ assert isinstance(x, int)
+ return (x, x)
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+ if drop_prob == 0.0 or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+ if keep_prob > 0.0:
+ random_tensor.div_(keep_prob)
+ output = x * random_tensor
+ return output
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
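+# Note (illustrative): with drop_prob = 0.1, during training each sample's residual branch is
+# zeroed with probability 0.1 and otherwise rescaled by 1 / 0.9 so its expectation is unchanged;
+# at inference drop_path is the identity.
+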
+class LayerScale(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ init_values: Union[float, Tensor] = 1e-5,
+ inplace: bool = False,
+ ) -> None:
+ super().__init__()
+ self.inplace = inplace
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+ def forward(self, x: Tensor) -> Tensor:
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class PatchEmbed(nn.Module):
+ """
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+
+ Args:
+ img_size: Image size.
+ patch_size: Patch token size.
+ in_chans: Number of input image channels.
+ embed_dim: Number of linear projection output channels.
+ norm_layer: Normalization layer.
+ """
+
+ def __init__(
+ self,
+ img_size: Union[int, Tuple[int, int]] = 224,
+ patch_size: Union[int, Tuple[int, int]] = 16,
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ norm_layer: Optional[Callable] = None,
+ flatten_embedding: bool = True,
+ tuning_mode: Optional[str] = None
+ ) -> None:
+ super().__init__()
+
+ image_HW = make_2tuple(img_size)
+ patch_HW = make_2tuple(patch_size)
+ patch_grid_size = (
+ image_HW[0] // patch_HW[0],
+ image_HW[1] // patch_HW[1],
+ )
+
+ self.img_size = image_HW
+ self.patch_size = patch_HW
+ self.patches_resolution = patch_grid_size
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+
+ self.flatten_embedding = flatten_embedding
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+ if tuning_mode != None:
+ self.tuning_mode = tuning_mode
+ if tuning_mode == 'ssf':
+ self.ssf_scale_1, self.ssf_shift_1 = init_ssf_scale_shift(embed_dim)
+ else:
+ pass
+ #raise NotImplementedError()
+ else:
+ self.tuning_mode = None
+
+ def forward(self, x: Tensor) -> Tensor:
+ _, _, H, W = x.shape
+ patch_H, patch_W = self.patch_size
+
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
+
+ x = self.proj(x) # B C H W
+ H, W = x.size(2), x.size(3)
+ x = x.flatten(2).transpose(1, 2) # B HW C
+ x = self.norm(x)
+ if self.tuning_mode == 'ssf':
+ x = ssf_ada(x, self.ssf_scale_1, self.ssf_shift_1)
+ if not self.flatten_embedding:
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
+ return x
+
+ def flops(self) -> float:
+ Ho, Wo = self.patches_resolution
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+ if self.norm is not None:
+ flops += Ho * Wo * self.embed_dim
+ return flops
+
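+# Illustrative sketch (not part of the original code): a 224x224 RGB image with a 16x16 patch
+# size yields 14*14 = 196 patch tokens of width embed_dim. The helper below is hypothetical and
+# never called.
+def _patch_embed_demo():
+    embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
+    x = torch.randn(1, 3, 224, 224)
+    return embed(x).shape  # torch.Size([1, 196, 768])
+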
+class Mlp(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ drop: float = 0.0,
+ bias: bool = True,
+ tuning_mode: Optional[int] = None
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+ self.drop = nn.Dropout(drop)
+
+ if tuning_mode != None:
+ self.tuning_mode = tuning_mode
+ if tuning_mode == 'ssf':
+ self.ssf_scale_1, self.ssf_shift_1 = init_ssf_scale_shift(hidden_features)
+ self.ssf_scale_2, self.ssf_shift_2 = init_ssf_scale_shift(out_features)
+ else:
+ pass
+ #raise NotImplementedError()
+ else:
+ self.tuning_mode = None
+
+ def forward(self, x: Tensor) -> Tensor:
+ x = self.fc1(x)
+ if self.tuning_mode == 'ssf':
+ x = ssf_ada(x, self.ssf_scale_1, self.ssf_shift_1)
+
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ if self.tuning_mode == 'ssf':
+ x = ssf_ada(x, self.ssf_scale_2, self.ssf_shift_2)
+
+ x = self.drop(x)
+ return x
+
+
+class SwiGLUFFN(nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ tuning_mode: Optional[int] = None
+ ) -> None:
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+
+ if tuning_mode != None:
+ self.tuning_mode = tuning_mode
+ if tuning_mode == 'ssf':
+ self.ssf_scale_1, self.ssf_shift_1 = init_ssf_scale_shift(2 * hidden_features)
+ self.ssf_scale_2, self.ssf_shift_2 = init_ssf_scale_shift(out_features)
+ else:
+ pass
+ #raise NotImplementedError()
+ else:
+ self.tuning_mode = None
+
+
+ def forward(self, x: Tensor) -> Tensor:
+ x12 = self.w12(x)
+ if self.tuning_mode == 'ssf':
+ x12 = ssf_ada(x12, self.ssf_scale_1, self.ssf_shift_1)
+
+ x1, x2 = x12.chunk(2, dim=-1)
+ hidden = F.silu(x1) * x2
+ out = self.w3(hidden)
+
+ if self.tuning_mode == 'ssf':
+            out = ssf_ada(out, self.ssf_scale_2, self.ssf_shift_2)
+
+ return out
+
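+# Illustrative sketch (not part of the original code): SwiGLU splits a doubled hidden projection
+# into a SiLU-gated half and a linear half before the output projection. Hypothetical helper,
+# never called.
+def _swiglu_ffn_demo():
+    ffn = SwiGLUFFN(in_features=32, hidden_features=64)
+    x = torch.randn(2, 5, 32)
+    return ffn(x).shape  # torch.Size([2, 5, 32])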
+
+try:
+    # fmha, memory_efficient_attention, unbind, scaled_index_add and index_select_cat are used
+    # by the memory-efficient attention / nested-tensor code paths further below
+    from xformers.ops import SwiGLU, fmha, memory_efficient_attention, unbind, scaled_index_add, index_select_cat
+ #import numpy.bool
+ XFORMERS_AVAILABLE = True
+except ImportError:
+ SwiGLU = SwiGLUFFN
+ XFORMERS_AVAILABLE = False
+
+class SwiGLUFFNFused(SwiGLU):
+ def __init__(
+ self,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ act_layer: Callable[..., nn.Module] = None,
+ drop: float = 0.0,
+ bias: bool = True,
+ ) -> None:
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+ super().__init__(
+ in_features=in_features,
+ hidden_features=hidden_features,
+ out_features=out_features,
+ bias=bias,
+ )
+
+
+XFORMERS_AVAILABLE = False
+
+
+class Attention(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int = 8,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ attn_drop: float = 0.0,
+ proj_drop: float = 0.0,
+ window_size: int = 0,
+ tuning_mode: Optional[int] = None
+ ) -> None:
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim**-0.5
+
+ if tuning_mode == 'lora':
+ self.tuning_mode = tuning_mode
+ self.qkv = LoRALinear(dim, dim * 3, bias=qkv_bias, r=8)
+ else:
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+
+ self.attn_drop = nn.Dropout(attn_drop)
+
+ if tuning_mode == 'lora':
+ self.tuning_mode = tuning_mode
+ self.proj = LoRALinear(dim, dim, bias=proj_bias, r=8)
+ else:
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ if tuning_mode != None:
+ self.tuning_mode = tuning_mode
+ if tuning_mode == 'ssf':
+ self.ssf_scale_1, self.ssf_shift_1 = init_ssf_scale_shift(dim * 3)
+ self.ssf_scale_2, self.ssf_shift_2 = init_ssf_scale_shift(dim)
+ else:
+ pass
+ #raise NotImplementedError()
+ else:
+ self.tuning_mode = None
+
+ #if not self.training:
+ #
+ # self.attn = ScaledDotProduct()
+ #self.attn = MultiHeadDispatch(dim_model=EMB, residual_dropout=DROPOUT, num_heads=HEADS, attention=attn)
+
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ B, N, C = x.shape
+ if self.tuning_mode == 'ssf':
+ qkv = ssf_ada(self.qkv(x), self.ssf_scale_1, self.ssf_shift_1).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ else:
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+ attn = q @ k.transpose(-2, -1)
+
+ if attn_bias is not None:
+ attn = attn + attn_bias[:, :, :N]
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+
+ if self.tuning_mode == 'ssf':
+ x = ssf_ada(x, self.ssf_scale_2, self.ssf_shift_2)
+
+ x = self.proj_drop(x)
+ return x
+
+
+class MemEffAttention(Attention):
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ if not XFORMERS_AVAILABLE:
+ #if True:
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
+ return super().forward(x, attn_bias)
+
+ B, N, C = x.shape
+ if self.tuning_mode == 'ssf':
+ qkv = ssf_ada(self.qkv(x), self.ssf_scale_1, self.ssf_shift_1).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+ else:
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+
+ q, k, v = unbind(qkv, 2)
+ if attn_bias is not None:
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias[:, :, :N])
+ else:
+ x = memory_efficient_attention(q, k, v)
+ x = x.reshape([B, N, C])
+
+ x = self.proj(x)
+ if self.tuning_mode == 'ssf':
+ x = ssf_ada(x, self.ssf_scale_2, self.ssf_shift_2)
+
+ x = self.proj_drop(x)
+ return x
+
+XFORMERS_AVAILABLE = False
+
+class Block(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ qkv_bias: bool = False,
+ proj_bias: bool = True,
+ ffn_bias: bool = True,
+ drop: float = 0.0,
+ attn_drop: float = 0.0,
+ init_values = None,
+ drop_path: float = 0.0,
+ act_layer: Callable[..., nn.Module] = nn.GELU,
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+ attn_class: Callable[..., nn.Module] = Attention,
+ ffn_layer: Callable[..., nn.Module] = Mlp,
+ tuning_mode: Optional[int] = None
+ ) -> None:
+ super().__init__()
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
+ self.norm1 = norm_layer(dim)
+ self.attn = attn_class(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ tuning_mode=tuning_mode
+ )
+
+ if tuning_mode != None:
+ self.tuning_mode = tuning_mode
+ if tuning_mode == 'ssf':
+ self.ssf_scale_1, self.ssf_shift_1 = init_ssf_scale_shift(dim)
+ self.ssf_scale_2, self.ssf_shift_2 = init_ssf_scale_shift(dim)
+ else:
+ pass
+ #raise NotImplementedError()
+ else:
+ self.tuning_mode = None
+
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = ffn_layer(
+ in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop,
+ bias=ffn_bias,
+ )
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.sample_drop_ratio = drop_path
+
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+ def attn_residual_func(x: Tensor, attn_bias) -> Tensor:
+ if self.tuning_mode == 'ssf':
+ return self.ls1(self.attn(ssf_ada(self.norm1(x), self.ssf_scale_1, self.ssf_shift_1), attn_bias))
+ else:
+ return self.ls1(self.attn(self.norm1(x), attn_bias))
+
+        # accept (and ignore) attn_bias so drop_add_residual_stochastic_depth can also call this
+        def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ if self.tuning_mode == 'ssf':
+ return self.ls2(self.mlp(ssf_ada(self.norm2(x), self.ssf_scale_2, self.ssf_shift_2)))
+ else:
+ return self.ls2(self.mlp(self.norm2(x)))
+
+ if self.training and self.sample_drop_ratio > 0.1:
+ # the overhead is compensated only for a drop path rate larger than 0.1
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ attn_bias=attn_bias
+ )
+ x = drop_add_residual_stochastic_depth(
+ x,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ )
+ elif self.training and self.sample_drop_ratio > 0.0:
+ x = x + self.drop_path1(attn_residual_func(x, attn_bias))
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
+ else:
+ x = x + attn_residual_func(x, attn_bias)
+ x = x + ffn_residual_func(x)
+ return x
+
+
+def drop_add_residual_stochastic_depth(
+ x: Tensor,
+ residual_func: Callable[[Tensor], Tensor],
+ sample_drop_ratio: float = 0.0, attn_bias=None
+) -> Tensor:
+ # 1) extract subset using permutation
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ x_subset = x[brange]
+
+ # 2) apply residual_func to get residual
+ residual = residual_func(x_subset, attn_bias)
+
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+
+ residual_scale_factor = b / sample_subset_size
+
+ # 3) add the residual
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ return x_plus_residual.view_as(x)
+
+
+def get_branges_scales(x, sample_drop_ratio=0.0):
+ b, n, d = x.shape
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+ residual_scale_factor = b / sample_subset_size
+ return brange, residual_scale_factor
+
+
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+ if scaling_vector is None:
+ x_flat = x.flatten(1)
+ residual = residual.flatten(1)
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+ else:
+ x_plus_residual = scaled_index_add(
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+ )
+ return x_plus_residual
+
+
+attn_bias_cache: Dict[Tuple, Any] = {}
+
+
+def get_attn_bias_and_cat(x_list, branges=None):
+ """
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
+ """
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+ if all_shapes not in attn_bias_cache.keys():
+ seqlens = []
+ for b, x in zip(batch_sizes, x_list):
+ for _ in range(b):
+ seqlens.append(x.shape[1])
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+ attn_bias._batch_sizes = batch_sizes
+ attn_bias_cache[all_shapes] = attn_bias
+
+ if branges is not None:
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
+ else:
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
+
+ return attn_bias_cache[all_shapes], cat_tensors
+
+
+def drop_add_residual_stochastic_depth_list(
+ x_list: List[Tensor],
+ residual_func: Callable[[Tensor, Any], Tensor],
+ sample_drop_ratio: float = 0.0,
+ scaling_vector=None,
+) -> Tensor:
+ # 1) generate random set of indices for dropping samples in the batch
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+ branges = [s[0] for s in branges_scales]
+ residual_scale_factors = [s[1] for s in branges_scales]
+
+ # 2) get attention bias and index+concat the tensors
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+
+ # 3) apply residual_func to get residual, and split the result
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
+
+ outputs = []
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
+ return outputs
+
+
+class NestedTensorBlock(Block):
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+ """
+ x_list contains a list of tensors to nest together and run
+ """
+ assert isinstance(self.attn, MemEffAttention)
+
+ if self.training and self.sample_drop_ratio > 0.0:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.mlp(self.norm2(x))
+
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=attn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
+ )
+ x_list = drop_add_residual_stochastic_depth_list(
+ x_list,
+ residual_func=ffn_residual_func,
+ sample_drop_ratio=self.sample_drop_ratio,
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
+ )
+ return x_list
+ else:
+
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+ return self.ls2(self.mlp(self.norm2(x)))
+
+ attn_bias, x = get_attn_bias_and_cat(x_list)
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
+ x = x + ffn_residual_func(x)
+ return attn_bias.split(x)
+
+ def forward(self, x_or_x_list, attn_bias=None):
+ if isinstance(x_or_x_list, Tensor):
+ return super().forward(x_or_x_list, attn_bias)
+ elif isinstance(x_or_x_list, list):
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
+ return self.forward_nested(x_or_x_list)
+ else:
+ raise AssertionError
+
+
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+ if not depth_first and include_root:
+ fn(module=module, name=name)
+ for child_name, child_module in module.named_children():
+ child_name = ".".join((name, child_name)) if name else child_name
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+ if depth_first and include_root:
+ fn(module=module, name=name)
+ return module
+
+
+class BlockChunk(nn.ModuleList):
+ def forward(self, x, others=None):
+ for b in self:
+ if others == None:
+ x = b(x)
+ else:
+ x = b(x, others)
+ return x
+
+
+class DinoVisionTransformer(nn.Module):
+ def __init__(
+ self,
+ img_size=518,
+ patch_size=16,
+ in_chans=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ ffn_bias=True,
+ proj_bias=True,
+ drop_path_rate=0.0,
+ drop_path_uniform=False,
+ init_values=1e-5, # for layerscale: None or 0 => no layerscale
+ embed_layer=PatchEmbed,
+ act_layer=nn.GELU,
+ block_fn=Block,
+ ffn_layer="mlp",
+ block_chunks=1,
+ num_register_tokens=0,
+ interpolate_antialias=False,
+ interpolate_offset=0.1,
+ multi_output=False,
+ tuning_mode=None,
+ **kwargs
+ ):
+ """
+ Args:
+ img_size (int, tuple): input image size
+ patch_size (int, tuple): patch size
+ in_chans (int): number of input channels
+ embed_dim (int): embedding dimension
+ depth (int): depth of transformer
+ num_heads (int): number of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ proj_bias (bool): enable bias for proj in attn if True
+ ffn_bias (bool): enable bias for ffn if True
+ drop_path_rate (float): stochastic depth rate
+ drop_path_uniform (bool): apply uniform drop rate across blocks
+ weight_init (str): weight init scheme
+ init_values (float): layer-scale init values
+ embed_layer (nn.Module): patch embedding layer
+ act_layer (nn.Module): MLP activation layer
+ block_fn (nn.Module): transformer block class
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+ """
+ super().__init__()
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
+
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.num_tokens = 1
+ self.n_blocks = depth
+ self.num_heads = num_heads
+ self.patch_size = patch_size
+ self.num_register_tokens = num_register_tokens
+ self.interpolate_antialias = interpolate_antialias
+ self.interpolate_offset = interpolate_offset
+
+ if tuning_mode != None:
+ self.tuning_mode = tuning_mode
+ if tuning_mode == 'ssf':
+ self.ssf_scale_1, self.ssf_shift_1 = init_ssf_scale_shift(embed_dim)
+ else:
+ pass
+ #raise NotImplementedError()
+ else:
+ self.tuning_mode = None
+ tuning_mode_list = [tuning_mode] * depth
+
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, tuning_mode=tuning_mode)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+ self.multi_output = multi_output
+ assert num_register_tokens >= 0
+ self.register_tokens = (
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
+ )
+
+ if drop_path_uniform is True:
+ dpr = [drop_path_rate] * depth
+ else:
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+
+ if ffn_layer == "mlp":
+ logger.info("using MLP layer as FFN")
+ ffn_layer = Mlp
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+ logger.info("using SwiGLU layer as FFN")
+ ffn_layer = SwiGLUFFNFused
+ elif ffn_layer == "identity":
+ logger.info("using Identity layer as FFN")
+
+ def f(*args, **kwargs):
+ return nn.Identity()
+
+ ffn_layer = f
+ else:
+ raise NotImplementedError
+
+ blocks_list = [
+ block_fn(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ proj_bias=proj_bias,
+ ffn_bias=ffn_bias,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ act_layer=act_layer,
+ ffn_layer=ffn_layer,
+ init_values=init_values,
+ tuning_mode=tuning_mode_list[i]
+ )
+ for i in range(depth)
+ ]
+ if block_chunks > 0:
+ self.chunked_blocks = True
+ chunked_blocks = []
+ chunksize = depth // block_chunks
+ for i in range(0, depth, chunksize):
+ # this is to keep the block index consistent if we chunk the block list
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+ else:
+ self.chunked_blocks = False
+ self.blocks = nn.ModuleList(blocks_list)
+
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Identity()
+
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+
+ self.init_weights()
+
+ def init_weights(self):
+ trunc_normal_(self.pos_embed, std=0.02)
+ nn.init.normal_(self.cls_token, std=1e-6)
+ if self.register_tokens is not None:
+ nn.init.normal_(self.register_tokens, std=1e-6)
+ named_apply(init_weights_vit_timm, self)
+
+ def interpolate_pos_encoding(self, x, w, h):
+ previous_dtype = x.dtype
+ npatch = x.shape[1] - 1
+ N = self.pos_embed.shape[1] - 1
+ if npatch == N and w == h:
+ return self.pos_embed
+ pos_embed = self.pos_embed.float()
+ class_pos_embed = pos_embed[:, 0]
+ patch_pos_embed = pos_embed[:, 1:]
+ dim = x.shape[-1]
+ w0 = w // self.patch_size
+ h0 = h // self.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
+
+ sqrt_N = math.sqrt(N)
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
+ scale_factor=(sx, sy),
+ mode="bicubic",
+ antialias=self.interpolate_antialias,
+ )
+
+ assert int(w0) == patch_pos_embed.shape[-2]
+ assert int(h0) == patch_pos_embed.shape[-1]
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
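+
+    # Worked example (illustrative): the default 518x518 pretraining resolution with patch_size 14
+    # gives a 37x37 positional grid (N = 1369). For a 616x1064 input, w0 x h0 is 44x76, so the
+    # 37x37 grid is bicubically resized to 44x76, re-flattened and concatenated with the
+    # class-token embedding.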
+
+ def prepare_tokens_with_masks(self, x, masks=None):
+ B, nc, w, h = x.shape
+ x = self.patch_embed(x)
+ if masks is not None:
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+ x = x + self.interpolate_pos_encoding(x, w, h)
+
+ if self.register_tokens is not None:
+ x = torch.cat(
+ (
+ x[:, :1],
+ self.register_tokens.expand(x.shape[0], -1, -1),
+ x[:, 1:],
+ ),
+ dim=1,
+ )
+
+ return x
+
+ def forward_features_list(self, x_list, masks_list):
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+ for blk in self.blocks:
+ x = blk(x)
+
+ all_x = x
+ output = []
+ for x, masks in zip(all_x, masks_list):
+ x_norm = self.norm(x)
+ output.append(
+ {
+ "x_norm_clstoken": x_norm[:, 0],
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+ "x_prenorm": x,
+ "masks": masks,
+ }
+ )
+ return output
+
+ def forward_features(self, x, masks=None):
+ if isinstance(x, list):
+ return self.forward_features_list(x, masks)
+
+ B, C, H, W = x.size()
+ pad_h = (self.patch_size - H % self.patch_size)
+ pad_w = (self.patch_size - W % self.patch_size)
+ if pad_h == self.patch_size:
+ pad_h = 0
+ if pad_w == self.patch_size:
+ pad_w = 0
+ #x = nn.functional.pad(x, (pad_h//2, pad_h-pad_h//2, pad_w//2, pad_w-pad_w//2))
+ if pad_h + pad_w > 0:
+ x = torch.nn.functional.interpolate(x, (H+pad_h, W+pad_w), mode='bilinear')
+
+ x = self.prepare_tokens_with_masks(x, masks)
+
+ #for blk in self.blocks:
+ #x = blk(x)
+
+ #x_norm = self.norm(x)
+ #if self.tuning_mode == 'ssf':
+ #x_norm = ssf_ada(x_norm, self.ssf_scale_1, self.ssf_shift_1)
+
+ # return {
+ # "x_norm_clstoken": x_norm[:, 0],
+ # "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+ # "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+ # "x_prenorm": x,
+ # "masks": masks,
+ # }
+ # features = []
+ # features.append(x_norm)
+ # features.append(x_norm)
+ # features.append(x_norm)
+ # features.append(x_norm)
+ # return [features, (B, (H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size, H, W, self.num_register_tokens)]
+
+ if self.multi_output == False:
+ for blk in self.blocks:
+ x = blk(x)
+ x_norm = self.norm(x)
+ if self.tuning_mode == 'ssf':
+ x_norm = ssf_ada(x_norm, self.ssf_scale_1, self.ssf_shift_1)
+
+ features = []
+ features.append(x_norm)
+ features.append(x_norm)
+ features.append(x_norm)
+ features.append(x_norm)
+ return [features, (B, (H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size, H, W, self.num_register_tokens)]
+ else:
+ features = []
+ for blk in self.blocks:
+ for idx, sub_blk in enumerate(blk):
+ x = sub_blk(x)
+ if (idx + 1) % (len(blk) // 4) == 0:
+ features.append(x)
+
+ return [features, (B, (H+pad_h)//self.patch_size, (W+pad_w)//self.patch_size, H, W, self.num_register_tokens)]
+
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ # If n is an int, take the n last blocks. If it's a list, take them
+ output, total_block_len = [], len(self.blocks)
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def _get_intermediate_layers_chunked(self, x, n=1):
+ x = self.prepare_tokens_with_masks(x)
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
+ # If n is an int, take the n last blocks. If it's a list, take them
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+ for block_chunk in self.blocks:
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
+ x = blk(x)
+ if i in blocks_to_take:
+ output.append(x)
+ i += 1
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+ return output
+
+ def get_intermediate_layers(
+ self,
+ x: torch.Tensor,
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
+ reshape: bool = False,
+ return_class_token: bool = False,
+ norm=True,
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+ if self.chunked_blocks:
+ outputs = self._get_intermediate_layers_chunked(x, n)
+ else:
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
+ if norm:
+ outputs = [self.norm(out) for out in outputs]
+ class_tokens = [out[:, 0] for out in outputs]
+ outputs = [out[:, 1:] for out in outputs]
+ if reshape:
+ B, _, w, h = x.shape
+ outputs = [
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+ for out in outputs
+ ]
+ if return_class_token:
+ return tuple(zip(outputs, class_tokens))
+ return tuple(outputs)
+
+ def forward(self, *args, is_training=False, **kwargs):
+ ret = self.forward_features(*args, **kwargs)
+ return ret
+ # if is_training:
+ # return ret
+ # else:
+ # return self.head(ret["x_norm_clstoken"])
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+ """ViT weight initialization, original timm impl (for reproducibility)"""
+ if isinstance(module, nn.Linear):
+ trunc_normal_(module.weight, std=0.02)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
+
+
+def load_ckpt_dino(checkpoint, model):
+ if checkpoint is not None:
+ try:
+ with open(checkpoint, "rb") as f:
+ state_dict = torch.load(f)
+ except:
+ print('NO pretrained imagenet ckpt available! Check your path!')
+ del model.mask_token
+ return
+
+ try:
+ model.load_state_dict(state_dict, strict=True)
+ except:
+ new_state_dict = {}
+ for key, value in state_dict.items():
+ if 'blocks' in key:
+ key_new = 'blocks.0' + key[len('blocks'):]
+ else:
+ key_new = key
+ new_state_dict[key_new] = value
+
+ model.load_state_dict(new_state_dict, strict=True)
+ del model.mask_token
+ return
+ else:
+ return
+
+
+def vit_small(patch_size=14, num_register_tokens=0, checkpoint=None, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=384,
+ depth=12,
+ num_heads=6,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+
+ load_ckpt_dino(checkpoint, model)
+
+ return model
+
+
+def vit_base(patch_size=14, num_register_tokens=0, checkpoint=None, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+ return model
+
+
+def vit_large(patch_size=14, num_register_tokens=0, checkpoint=None, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+
+ if checkpoint is not None:
+ with open(checkpoint, "rb") as f:
+ state_dict = torch.load(f)
+ try:
+ model.load_state_dict(state_dict, strict=True)
+ except:
+ new_state_dict = {}
+ for key, value in state_dict.items():
+ if 'blocks' in key:
+ key_new = 'blocks.0' + key[len('blocks'):]
+ else:
+ key_new = key
+ new_state_dict[key_new] = value
+
+ model.load_state_dict(new_state_dict, strict=True)
+ del model.mask_token
+ return model
+
+
+def vit_giant2(patch_size=14, num_register_tokens=0, checkpoint=None, **kwargs):
+ """
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+ """
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1536,
+ depth=40,
+ num_heads=24,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ ffn_layer='swiglu',
+ **kwargs,
+ )
+ return model
+
+
+
+def vit_small_reg(patch_size=14, num_register_tokens=4, checkpoint=None, tuning_mode=None, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=384,
+ depth=12,
+ num_heads=6,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ tuning_mode=tuning_mode,
+ **kwargs,
+ )
+
+ load_ckpt_dino(checkpoint, model)
+
+ return model
+
+
+def vit_base_reg(patch_size=14, num_register_tokens=4, checkpoint=None, **kwargs):
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ **kwargs,
+ )
+
+ load_ckpt_dino(checkpoint, model)
+
+ return model
+
+
+def vit_large_reg(patch_size=14, num_register_tokens=4, checkpoint=None, tuning_mode=None, **kwargs):
+ model = DinoVisionTransformer(
+ img_size = 518,
+ patch_size=patch_size,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ tuning_mode=tuning_mode,
+ **kwargs,
+ )
+
+ load_ckpt_dino(checkpoint, model)
+
+ return model
+
+
+def vit_giant2_reg(patch_size=14, num_register_tokens=4, checkpoint=None, tuning_mode=None, **kwargs):
+ """
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+ """
+ model = DinoVisionTransformer(
+ patch_size=patch_size,
+ embed_dim=1536,
+ depth=40,
+ num_heads=24,
+ mlp_ratio=4,
+ block_fn=partial(Block, attn_class=MemEffAttention),
+ num_register_tokens=num_register_tokens,
+ ffn_layer='swiglu',
+ tuning_mode=tuning_mode,
+ multi_output=True,
+ **kwargs,
+ )
+
+ load_ckpt_dino(checkpoint, model)
+
+ return model
+
+if __name__ == '__main__':
+ try:
+ from custom_mmpkg.custom_mmcv.utils import Config
+ except:
+ from mmengine import Config
+
+ #rgb = torch.rand((2, 3, 518, 518)).cuda()
+
+ #cfg.data_basic['crop_size']['0']
+ #cfg.data_basic['crop_size']['1']
+ cfg = Config.fromfile('/mu.hu/projects/monodepth_vit/mono/configs/RAFTDecoder/vit.raft5.large.kitti.py')
+
+ #rgb = torch.arange(0, 2*3*1036*1036, 1).cuda().float().view(2, 3, 1036, 1036)
+ rgb = torch.zeros(1, 3, 616, 1064).cuda()
+ cfg['tuning_mode'] = 'ssf'
+ #model = vit_large_reg(checkpoint="/cpfs02/shared/public/groups/local_map/yvan/pretrained_weight_repo/vit/dinov2_vitl14_reg4_pretrain.pth", kwarg=cfg).cuda()
+ model = vit_large_reg(tuning_mode='ssf').cuda()
+
+ #import timm
+ #model2 = timm.models.vision_transformer.vit_large_patch14_dinov2().cuda()
+ #timm.models.load_checkpoint(model2, '/cpfs02/shared/public/yvan/pretrained_weight_repo/vit/dinov2_vitl14_pretrain.pth', filter_fn=timm.models.vision_transformer.checkpoint_filter_fn)
+
+ out1 = model(rgb)
+ #out2 = model2(rgb)
+ temp = 0
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb8dd710a7b257f3ce3067010accf9f970aee9c4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/backbones/__init__.py
@@ -0,0 +1,11 @@
+from .ConvNeXt import convnext_xlarge
+from .ConvNeXt import convnext_small
+from .ConvNeXt import convnext_base
+from .ConvNeXt import convnext_large
+from .ConvNeXt import convnext_tiny
+from .ViT_DINO import vit_large
+from .ViT_DINO_reg import vit_small_reg, vit_large_reg, vit_giant2_reg
+
+__all__ = [
+ 'convnext_xlarge', 'convnext_small', 'convnext_base', 'convnext_large', 'convnext_tiny', 'vit_small_reg', 'vit_large_reg', 'vit_giant2_reg'
+]
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/HourGlassDecoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/HourGlassDecoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c7550b776a4e850cbf54aeea825eb5d0129ee06
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/HourGlassDecoder.py
@@ -0,0 +1,274 @@
+import torch
+import torch.nn as nn
+import numpy as np
+import math
+import torch.nn.functional as F
+
+def compute_depth_expectation(prob, depth_values):
+ depth_values = depth_values.view(*depth_values.shape, 1, 1)
+ depth = torch.sum(prob * depth_values, 1)
+ return depth
+
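+# Illustrative sketch (not part of the original code): the expected depth is the
+# probability-weighted sum over a set of depth hypotheses. Hypothetical helper, never called.
+def _depth_expectation_demo():
+    prob = torch.softmax(torch.randn(1, 128, 32, 40), dim=1)     # (B, D, H, W) probability volume
+    depth_values = torch.linspace(0.5, 80.0, 128).unsqueeze(0)   # (B, D) depth hypotheses
+    return compute_depth_expectation(prob, depth_values).shape   # torch.Size([1, 32, 40])
+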
+class ConvBlock(nn.Module):
+ def __init__(self, in_channels, out_channels, kernel_size=3):
+ super(ConvBlock, self).__init__()
+
+ if kernel_size == 3:
+ self.conv = nn.Sequential(
+ nn.ReflectionPad2d(1),
+ nn.Conv2d(in_channels, out_channels, 3, padding=0, stride=1),
+ )
+ elif kernel_size == 1:
+ self.conv = nn.Conv2d(int(in_channels), int(out_channels), 1, padding=0, stride=1)
+
+ self.nonlin = nn.ELU(inplace=True)
+
+ def forward(self, x):
+ out = self.conv(x)
+ out = self.nonlin(out)
+ return out
+
+
+class ConvBlock_double(nn.Module):
+ def __init__(self, in_channels, out_channels, kernel_size=3):
+ super(ConvBlock_double, self).__init__()
+
+ if kernel_size == 3:
+ self.conv = nn.Sequential(
+ nn.ReflectionPad2d(1),
+ nn.Conv2d(in_channels, out_channels, 3, padding=0, stride=1),
+ )
+ elif kernel_size == 1:
+ self.conv = nn.Conv2d(int(in_channels), int(out_channels), 1, padding=0, stride=1)
+
+ self.nonlin = nn.ELU(inplace=True)
+ self.conv_2 = nn.Conv2d(out_channels, out_channels, 1, padding=0, stride=1)
+        self.nonlin_2 = nn.ELU(inplace=True)
+
+ def forward(self, x):
+ out = self.conv(x)
+ out = self.nonlin(out)
+ out = self.conv_2(out)
+ out = self.nonlin_2(out)
+ return out
+
+class DecoderFeature(nn.Module):
+ def __init__(self, feat_channels, num_ch_dec=[64, 64, 128, 256]):
+ super(DecoderFeature, self).__init__()
+ self.num_ch_dec = num_ch_dec
+ self.feat_channels = feat_channels
+
+ self.upconv_3_0 = ConvBlock(self.feat_channels[3], self.num_ch_dec[3], kernel_size=1)
+ self.upconv_3_1 = ConvBlock_double(
+ self.feat_channels[2] + self.num_ch_dec[3],
+ self.num_ch_dec[3],
+ kernel_size=1)
+
+ self.upconv_2_0 = ConvBlock(self.num_ch_dec[3], self.num_ch_dec[2], kernel_size=3)
+ self.upconv_2_1 = ConvBlock_double(
+ self.feat_channels[1] + self.num_ch_dec[2],
+ self.num_ch_dec[2],
+ kernel_size=3)
+
+ self.upconv_1_0 = ConvBlock(self.num_ch_dec[2], self.num_ch_dec[1], kernel_size=3)
+ self.upconv_1_1 = ConvBlock_double(
+ self.feat_channels[0] + self.num_ch_dec[1],
+ self.num_ch_dec[1],
+ kernel_size=3)
+ self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+
+ def forward(self, ref_feature):
+ x = ref_feature[3]
+
+ x = self.upconv_3_0(x)
+ x = torch.cat((self.upsample(x), ref_feature[2]), 1)
+ x = self.upconv_3_1(x)
+
+ x = self.upconv_2_0(x)
+ x = torch.cat((self.upsample(x), ref_feature[1]), 1)
+ x = self.upconv_2_1(x)
+
+ x = self.upconv_1_0(x)
+ x = torch.cat((self.upsample(x), ref_feature[0]), 1)
+ x = self.upconv_1_1(x)
+ return x
+
+
+class UNet(nn.Module):
+ def __init__(self, inp_ch=32, output_chal=1, down_sample_times=3, channel_mode='v0'):
+ super(UNet, self).__init__()
+ basic_block = ConvBnReLU
+ num_depth = 128
+
+ self.conv0 = basic_block(inp_ch, num_depth)
+ if channel_mode == 'v0':
+ channels = [num_depth, num_depth//2, num_depth//4, num_depth//8, num_depth // 8]
+ elif channel_mode == 'v1':
+ channels = [num_depth, num_depth, num_depth, num_depth, num_depth, num_depth]
+ self.down_sample_times = down_sample_times
+ for i in range(down_sample_times):
+ setattr(
+ self, 'conv_%d' % i,
+ nn.Sequential(
+ basic_block(channels[i], channels[i+1], stride=2),
+ basic_block(channels[i+1], channels[i+1])
+ )
+ )
+ for i in range(down_sample_times-1,-1,-1):
+ setattr(self, 'deconv_%d' % i,
+ nn.Sequential(
+ nn.ConvTranspose2d(
+ channels[i+1],
+ channels[i],
+ kernel_size=3,
+ padding=1,
+ output_padding=1,
+ stride=2,
+ bias=False),
+ nn.BatchNorm2d(channels[i]),
+ nn.ReLU(inplace=True)
+ )
+ )
+ self.prob = nn.Conv2d(num_depth, output_chal, 1, stride=1, padding=0)
+
+ def forward(self, x):
+ features = {}
+ conv0 = self.conv0(x)
+ x = conv0
+ features[0] = conv0
+ for i in range(self.down_sample_times):
+ x = getattr(self, 'conv_%d' % i)(x)
+ features[i+1] = x
+ for i in range(self.down_sample_times-1,-1,-1):
+ x = features[i] + getattr(self, 'deconv_%d' % i)(x)
+ x = self.prob(x)
+ return x
+
+class ConvBnReLU(nn.Module):
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, pad=1):
+ super(ConvBnReLU, self).__init__()
+ self.conv = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=pad,
+ bias=False
+ )
+ self.bn = nn.BatchNorm2d(out_channels)
+
+ def forward(self, x):
+ return F.relu(self.bn(self.conv(x)), inplace=True)
+
+
+class HourglassDecoder(nn.Module):
+ def __init__(self, cfg):
+ super(HourglassDecoder, self).__init__()
+ self.inchannels = cfg.model.decode_head.in_channels # [256, 512, 1024, 2048]
+ self.decoder_channels = cfg.model.decode_head.decoder_channel # [64, 64, 128, 256]
+ self.min_val = cfg.data_basic.depth_normalize[0]
+ self.max_val = cfg.data_basic.depth_normalize[1]
+
+ self.num_ch_dec = self.decoder_channels # [64, 64, 128, 256]
+ self.num_depth_regressor_anchor = 512
+ self.feat_channels = self.inchannels
+ unet_in_channel = self.num_ch_dec[1]
+ unet_out_channel = 256
+
+ self.decoder_mono = DecoderFeature(self.feat_channels, self.num_ch_dec)
+ self.conv_out_2 = UNet(inp_ch=unet_in_channel,
+ output_chal=unet_out_channel + 1,
+ down_sample_times=3,
+ channel_mode='v0',
+ )
+
+ self.depth_regressor_2 = nn.Sequential(
+ nn.Conv2d(unet_out_channel,
+ self.num_depth_regressor_anchor,
+ kernel_size=3,
+ padding=1,
+ ),
+ nn.BatchNorm2d(self.num_depth_regressor_anchor),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(
+ self.num_depth_regressor_anchor,
+ self.num_depth_regressor_anchor,
+ kernel_size=1,
+ )
+ )
+ self.residual_channel = 16
+ self.conv_up_2 = nn.Sequential(
+ nn.Conv2d(1 + 2 + unet_out_channel, self.residual_channel, 3, padding=1),
+ nn.BatchNorm2d(self.residual_channel),
+ nn.ReLU(),
+ nn.Conv2d(self.residual_channel, self.residual_channel, 3, padding=1),
+ nn.Upsample(scale_factor=4),
+ nn.Conv2d(self.residual_channel, self.residual_channel, 3, padding=1),
+ nn.ReLU(),
+ nn.Conv2d(self.residual_channel, 1, 1, padding=0),
+ )
+
+ def get_bins(self, bins_num):
+ depth_bins_vec = torch.linspace(math.log(self.min_val), math.log(self.max_val), bins_num, device='cuda')
+ depth_bins_vec = torch.exp(depth_bins_vec)
+ return depth_bins_vec
+
+ def register_depth_expectation_anchor(self, bins_num, B):
+ depth_bins_vec = self.get_bins(bins_num)
+ depth_bins_vec = depth_bins_vec.unsqueeze(0).repeat(B, 1)
+ self.register_buffer('depth_expectation_anchor', depth_bins_vec, persistent=False)
+
+ def upsample(self, x, scale_factor=2):
+ return F.interpolate(x, scale_factor=scale_factor, mode='nearest')
+
+ def regress_depth_2(self, feature_map_d):
+ prob = self.depth_regressor_2(feature_map_d).softmax(dim=1)
+ B = prob.shape[0]
+ if "depth_expectation_anchor" not in self._buffers:
+ self.register_depth_expectation_anchor(self.num_depth_regressor_anchor, B)
+ d = compute_depth_expectation(
+ prob,
+ self.depth_expectation_anchor[:B, ...]
+ ).unsqueeze(1)
+ return d
+
+ def create_mesh_grid(self, height, width, batch, device="cuda", set_buffer=True):
+ y, x = torch.meshgrid([torch.arange(0, height, dtype=torch.float32, device=device),
+ torch.arange(0, width, dtype=torch.float32, device=device)], indexing='ij')
+ meshgrid = torch.stack((x, y))
+ meshgrid = meshgrid.unsqueeze(0).repeat(batch, 1, 1, 1)
+ return meshgrid
+
+ def forward(self, features_mono, **kwargs):
+        '''
+        features_mono: encoder features of the reference view,
+            [ref_f1, ref_f2, ref_f3, ref_f4], where ref_f4 is the coarsest level.
+        '''
+ outputs = {}
+ # get encoder feature of the reference view
+ ref_feat = features_mono
+
+ feature_map_mono = self.decoder_mono(ref_feat)
+ feature_map_mono_pred = self.conv_out_2(feature_map_mono)
+ confidence_map_2 = feature_map_mono_pred[:, -1:, :, :]
+ feature_map_d_2 = feature_map_mono_pred[:, :-1, :, :]
+
+ depth_pred_2 = self.regress_depth_2(feature_map_d_2)
+
+ B, _, H, W = depth_pred_2.shape
+
+ meshgrid = self.create_mesh_grid(H, W, B)
+
+ depth_pred_mono = self.upsample(depth_pred_2, scale_factor=4) + 1e-1 * \
+ self.conv_up_2(
+ torch.cat((depth_pred_2, meshgrid[:B, ...], feature_map_d_2), 1)
+ )
+ confidence_map_mono = self.upsample(confidence_map_2, scale_factor=4)
+
+ outputs=dict(
+ prediction=depth_pred_mono,
+ confidence=confidence_map_mono,
+ pred_logit=None,
+ )
+ return outputs
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py
new file mode 100644
index 0000000000000000000000000000000000000000..d34062ed323d7508bd9f362ab96bfac065d77424
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py
@@ -0,0 +1,1031 @@
+import torch
+import torch.nn as nn
+import numpy as np
+import math
+import torch.nn.functional as F
+
+# LORA finetuning originally by edwardjhu
+class LoRALayer():
+ def __init__(
+ self,
+ r: int,
+ lora_alpha: int,
+ lora_dropout: float,
+ merge_weights: bool,
+ ):
+ self.r = r
+ self.lora_alpha = lora_alpha
+ # Optional dropout
+ if lora_dropout > 0.:
+ self.lora_dropout = nn.Dropout(p=lora_dropout)
+ else:
+ self.lora_dropout = lambda x: x
+ # Mark the weight as unmerged
+ self.merged = False
+ self.merge_weights = merge_weights
+
+class LoRALinear(nn.Linear, LoRALayer):
+ # LoRA implemented in a dense layer
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ r: int = 0,
+ lora_alpha: int = 1,
+ lora_dropout: float = 0.,
+ fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
+ merge_weights: bool = True,
+ **kwargs
+ ):
+ nn.Linear.__init__(self, in_features, out_features, **kwargs)
+ LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
+ merge_weights=merge_weights)
+
+ self.fan_in_fan_out = fan_in_fan_out
+ # Actual trainable parameters
+ if r > 0:
+ self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
+ self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
+ self.scaling = self.lora_alpha / self.r
+ # Freezing the pre-trained weight matrix
+ self.weight.requires_grad = False
+ self.reset_parameters()
+ if fan_in_fan_out:
+ self.weight.data = self.weight.data.transpose(0, 1)
+
+ def reset_parameters(self):
+ #nn.Linear.reset_parameters(self)
+ if hasattr(self, 'lora_A'):
+            # initialize A the same way as the default for nn.Linear and B to zero
+ # this is different than what is described in the paper but should not affect performance
+ nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
+ nn.init.zeros_(self.lora_B)
+
+ # def train(self, mode: bool = True):
+ # def T(w):
+ # return w.transpose(0, 1) if self.fan_in_fan_out else w
+ # nn.Linear.train(self, mode)
+ # if mode:
+ # if self.merge_weights and self.merged:
+ # # Make sure that the weights are not merged
+ # if self.r > 0:
+ # self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
+ # self.merged = False
+ # else:
+ # if self.merge_weights and not self.merged:
+ # # Merge the weights and mark it
+ # if self.r > 0:
+ # self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
+ # self.merged = True
+
+ def forward(self, x: torch.Tensor):
+ def T(w):
+ return w.transpose(0, 1) if self.fan_in_fan_out else w
+ if self.r > 0 and not self.merged:
+ result = F.linear(x, T(self.weight), bias=self.bias)
+ result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling
+ return result
+ else:
+ return F.linear(x, T(self.weight), bias=self.bias)
+
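+# NOTE: a minimal usage sketch for LoRALinear (sizes are illustrative, not taken
+# from the surrounding model):
+#   lin = LoRALinear(1024, 1024, r=8, lora_alpha=8)
+#   y = lin(torch.randn(2, 16, 1024))
+#   # y == F.linear(x, W, b) + (x @ lora_A.T @ lora_B.T) * (lora_alpha / r)
+# The pre-trained weight matrix is frozen (requires_grad=False); only the
+# low-rank factors lora_A / lora_B (and the bias, if any) are trained. Since
+# lora_B starts at zero, the layer initially behaves exactly like nn.Linear.
+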
+class ConvLoRA(nn.Conv2d, LoRALayer):
+ def __init__(self, in_channels, out_channels, kernel_size, r=0, lora_alpha=1, lora_dropout=0., merge_weights=True, **kwargs):
+ #self.conv = conv_module(in_channels, out_channels, kernel_size, **kwargs)
+ nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size, **kwargs)
+ LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=merge_weights)
+ assert isinstance(kernel_size, int)
+
+ # Actual trainable parameters
+ if r > 0:
+ self.lora_A = nn.Parameter(
+ self.weight.new_zeros((r * kernel_size, in_channels * kernel_size))
+ )
+ self.lora_B = nn.Parameter(
+ self.weight.new_zeros((out_channels//self.groups*kernel_size, r*kernel_size))
+ )
+ self.scaling = self.lora_alpha / self.r
+ # Freezing the pre-trained weight matrix
+ self.weight.requires_grad = False
+ self.reset_parameters()
+ self.merged = False
+
+ def reset_parameters(self):
+ #self.conv.reset_parameters()
+ if hasattr(self, 'lora_A'):
+ # initialize A the same way as the default for nn.Linear and B to zero
+ nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
+ nn.init.zeros_(self.lora_B)
+
+ # def train(self, mode=True):
+ # super(ConvLoRA, self).train(mode)
+ # if mode:
+ # if self.merge_weights and self.merged:
+ # if self.r > 0:
+ # # Make sure that the weights are not merged
+ # self.conv.weight.data -= (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling
+ # self.merged = False
+ # else:
+ # if self.merge_weights and not self.merged:
+ # if self.r > 0:
+ # # Merge the weights and mark it
+ # self.conv.weight.data += (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling
+ # self.merged = True
+
+ def forward(self, x):
+ if self.r > 0 and not self.merged:
+ # return self.conv._conv_forward(
+ # x,
+ # self.conv.weight + (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling,
+ # self.conv.bias
+ # )
+ weight = self.weight + (self.lora_B @ self.lora_A).view(self.weight.shape) * self.scaling
+ bias = self.bias
+
+ return F.conv2d(x, weight, bias=bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups)
+ else:
+ return F.conv2d(x, self.weight, bias=self.bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups)
+
+class ConvTransposeLoRA(nn.ConvTranspose2d, LoRALayer):
+ def __init__(self, in_channels, out_channels, kernel_size, r=0, lora_alpha=1, lora_dropout=0., merge_weights=True, **kwargs):
+ #self.conv = conv_module(in_channels, out_channels, kernel_size, **kwargs)
+ nn.ConvTranspose2d.__init__(self, in_channels, out_channels, kernel_size, **kwargs)
+ LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=merge_weights)
+ assert isinstance(kernel_size, int)
+
+ # Actual trainable parameters
+ if r > 0:
+ self.lora_A = nn.Parameter(
+ self.weight.new_zeros((r * kernel_size, in_channels * kernel_size))
+ )
+ self.lora_B = nn.Parameter(
+ self.weight.new_zeros((out_channels//self.groups*kernel_size, r*kernel_size))
+ )
+ self.scaling = self.lora_alpha / self.r
+ # Freezing the pre-trained weight matrix
+ self.weight.requires_grad = False
+ self.reset_parameters()
+ self.merged = False
+
+ def reset_parameters(self):
+ #self.conv.reset_parameters()
+ if hasattr(self, 'lora_A'):
+ # initialize A the same way as the default for nn.Linear and B to zero
+ nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
+ nn.init.zeros_(self.lora_B)
+
+ # def train(self, mode=True):
+ # super(ConvTransposeLoRA, self).train(mode)
+ # if mode:
+ # if self.merge_weights and self.merged:
+ # if self.r > 0:
+ # # Make sure that the weights are not merged
+ # self.conv.weight.data -= (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling
+ # self.merged = False
+ # else:
+ # if self.merge_weights and not self.merged:
+ # if self.r > 0:
+ # # Merge the weights and mark it
+ # self.conv.weight.data += (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling
+ # self.merged = True
+
+ def forward(self, x):
+ if self.r > 0 and not self.merged:
+ weight = self.weight + (self.lora_B @ self.lora_A).view(self.weight.shape) * self.scaling
+ bias = self.bias
+ return F.conv_transpose2d(x, weight,
+ bias=bias, stride=self.stride, padding=self.padding, output_padding=self.output_padding,
+ groups=self.groups, dilation=self.dilation)
+ else:
+ return F.conv_transpose2d(x, self.weight,
+ bias=self.bias, stride=self.stride, padding=self.padding, output_padding=self.output_padding,
+ groups=self.groups, dilation=self.dilation)
+ #return self.conv(x)
+
+class Conv2dLoRA(ConvLoRA):
+ def __init__(self, *args, **kwargs):
+ super(Conv2dLoRA, self).__init__(*args, **kwargs)
+
+class ConvTranspose2dLoRA(ConvTransposeLoRA):
+ def __init__(self, *args, **kwargs):
+ super(ConvTranspose2dLoRA, self).__init__(*args, **kwargs)
+
+
+def compute_depth_expectation(prob, depth_values):
+ depth_values = depth_values.view(*depth_values.shape, 1, 1)
+ depth = torch.sum(prob * depth_values, 1)
+ return depth
+
+def interpolate_float32(x, size=None, scale_factor=None, mode='nearest', align_corners=None):
+ return F.interpolate(x.float(), size=size, scale_factor=scale_factor, mode=mode, align_corners=align_corners)
+
+# def upflow8(flow, mode='bilinear'):
+# new_size = (8 * flow.shape[2], 8 * flow.shape[3])
+# return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)
+
+def upflow4(flow, mode='bilinear'):
+ new_size = (4 * flow.shape[2], 4 * flow.shape[3])
+ return F.interpolate(flow, size=new_size, mode=mode, align_corners=True)
+
+def coords_grid(batch, ht, wd):
+ # coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
+ coords = (torch.zeros((ht, wd)), torch.zeros((ht, wd)), torch.zeros((ht, wd)), torch.zeros((ht, wd)), torch.zeros((ht, wd)), torch.zeros((ht, wd)))
+ coords = torch.stack(coords[::-1], dim=0).float()
+ return coords[None].repeat(batch, 1, 1, 1)
+
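+# NOTE: despite its name, this variant of coords_grid returns an all-zero
+# (B, 6, H, W) tensor (the original RAFT meshgrid is commented out above), so
+# coords1 - coords0 starts at zero and the "flow" state is simply the
+# accumulated 6-channel depth/confidence/normal update.
+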
+def norm_normalize(norm_out):
+ min_kappa = 0.01
+ norm_x, norm_y, norm_z, kappa = torch.split(norm_out, 1, dim=1)
+ norm = torch.sqrt(norm_x ** 2.0 + norm_y ** 2.0 + norm_z ** 2.0) + 1e-10
+ kappa = F.elu(kappa) + 1.0 + min_kappa
+ final_out = torch.cat([norm_x / norm, norm_y / norm, norm_z / norm, kappa], dim=1)
+ return final_out
+
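+# NOTE: norm_normalize maps the raw 4-channel normal head output to a unit
+# normal plus a positive concentration value: (nx, ny, nz) are divided by their
+# L2 norm, and kappa = ELU(kappa_raw) + 1 + 0.01 > 0 acts as a per-pixel
+# confidence. Illustrative example: out = norm_normalize(torch.randn(2, 4, 64, 64))
+# gives out[:, :3] with (approximately) unit norm and out[:, 3:] strictly positive.
+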
+# uncertainty-guided sampling (only used during training)
+@torch.no_grad()
+def sample_points(init_normal, gt_norm_mask, sampling_ratio, beta):
+ device = init_normal.device
+ B, _, H, W = init_normal.shape
+ N = int(sampling_ratio * H * W)
+ beta = beta
+
+ # uncertainty map
+ uncertainty_map = -1 * init_normal[:, -1, :, :] # B, H, W
+
+ # gt_invalid_mask (B, H, W)
+ if gt_norm_mask is not None:
+ gt_invalid_mask = F.interpolate(gt_norm_mask.float(), size=[H, W], mode='nearest')
+ gt_invalid_mask = gt_invalid_mask[:, 0, :, :] < 0.5
+ uncertainty_map[gt_invalid_mask] = -1e4
+
+ # (B, H*W)
+ _, idx = uncertainty_map.view(B, -1).sort(1, descending=True)
+
+ # importance sampling
+ if int(beta * N) > 0:
+ importance = idx[:, :int(beta * N)] # B, beta*N
+
+ # remaining
+ remaining = idx[:, int(beta * N):] # B, H*W - beta*N
+
+ # coverage
+ num_coverage = N - int(beta * N)
+
+ if num_coverage <= 0:
+ samples = importance
+ else:
+ coverage_list = []
+ for i in range(B):
+ idx_c = torch.randperm(remaining.size()[1]) # shuffles "H*W - beta*N"
+ coverage_list.append(remaining[i, :][idx_c[:num_coverage]].view(1, -1)) # 1, N-beta*N
+ coverage = torch.cat(coverage_list, dim=0) # B, N-beta*N
+ samples = torch.cat((importance, coverage), dim=1) # B, N
+
+ else:
+ # remaining
+ remaining = idx[:, :] # B, H*W
+
+ # coverage
+ num_coverage = N
+
+ coverage_list = []
+ for i in range(B):
+ idx_c = torch.randperm(remaining.size()[1]) # shuffles "H*W - beta*N"
+ coverage_list.append(remaining[i, :][idx_c[:num_coverage]].view(1, -1)) # 1, N-beta*N
+ coverage = torch.cat(coverage_list, dim=0) # B, N-beta*N
+ samples = coverage
+
+ # point coordinates
+ rows_int = samples // W # 0 for first row, H-1 for last row
+ rows_float = rows_int / float(H-1) # 0 to 1.0
+ rows_float = (rows_float * 2.0) - 1.0 # -1.0 to 1.0
+
+ cols_int = samples % W # 0 for first column, W-1 for last column
+ cols_float = cols_int / float(W-1) # 0 to 1.0
+ cols_float = (cols_float * 2.0) - 1.0 # -1.0 to 1.0
+
+ point_coords = torch.zeros(B, 1, N, 2)
+ point_coords[:, 0, :, 0] = cols_float # x coord
+ point_coords[:, 0, :, 1] = rows_float # y coord
+ point_coords = point_coords.to(device)
+ return point_coords, rows_int, cols_int
+
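+# NOTE: sample_points mixes importance sampling with uniform coverage. Of the
+# N = sampling_ratio * H * W sampled pixels, the beta * N most uncertain ones
+# (largest -kappa, i.e. lowest predicted confidence) are kept directly and the
+# rest are drawn at random from the remaining locations. The returned
+# point_coords are normalised to [-1, 1], the convention expected by F.grid_sample.
+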
+class FlowHead(nn.Module):
+ def __init__(self, input_dim=128, hidden_dim=256, output_dim_depth=2, output_dim_norm=4, tuning_mode=None):
+ super(FlowHead, self).__init__()
+ self.conv1d = Conv2dLoRA(input_dim, hidden_dim // 2, 3, padding=1, r = 8 if tuning_mode == 'lora' else 0)
+ self.conv2d = Conv2dLoRA(hidden_dim // 2, output_dim_depth, 3, padding=1, r = 8 if tuning_mode == 'lora' else 0)
+
+ self.conv1n = Conv2dLoRA(input_dim, hidden_dim // 2, 3, padding=1, r = 8 if tuning_mode == 'lora' else 0)
+ self.conv2n = Conv2dLoRA(hidden_dim // 2, output_dim_norm, 3, padding=1, r = 8 if tuning_mode == 'lora' else 0)
+ self.relu = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ depth = self.conv2d(self.relu(self.conv1d(x)))
+ normal = self.conv2n(self.relu(self.conv1n(x)))
+ return torch.cat((depth, normal), dim=1)
+
+
+class ConvGRU(nn.Module):
+ def __init__(self, hidden_dim, input_dim, kernel_size=3, tuning_mode=None):
+ super(ConvGRU, self).__init__()
+ self.convz = Conv2dLoRA(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2, r = 8 if tuning_mode == 'lora' else 0)
+ self.convr = Conv2dLoRA(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2, r = 8 if tuning_mode == 'lora' else 0)
+ self.convq = Conv2dLoRA(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2, r = 8 if tuning_mode == 'lora' else 0)
+
+ def forward(self, h, cz, cr, cq, *x_list):
+ x = torch.cat(x_list, dim=1)
+ hx = torch.cat([h, x], dim=1)
+
+ z = torch.sigmoid((self.convz(hx) + cz))
+ r = torch.sigmoid((self.convr(hx) + cr))
+ q = torch.tanh((self.convq(torch.cat([r*h, x], dim=1)) + cq))
+
+ # z = torch.sigmoid((self.convz(hx) + cz).float())
+ # r = torch.sigmoid((self.convr(hx) + cr).float())
+ # q = torch.tanh((self.convq(torch.cat([r*h, x], dim=1)) + cq).float())
+
+ h = (1-z) * h + z * q
+ return h
+
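+# NOTE: ConvGRU above is the standard convolutional GRU update, with the
+# context features cz/cr/cq acting as per-gate biases:
+#   z = sigmoid(convz([h, x]) + cz)    # update gate
+#   r = sigmoid(convr([h, x]) + cr)    # reset gate
+#   q = tanh(convq([r*h, x]) + cq)     # candidate state
+#   h = (1 - z) * h + z * q            # new hidden state
+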
+def pool2x(x):
+ return F.avg_pool2d(x, 3, stride=2, padding=1)
+
+def pool4x(x):
+ return F.avg_pool2d(x, 5, stride=4, padding=1)
+
+def interp(x, dest):
+ interp_args = {'mode': 'bilinear', 'align_corners': True}
+ return interpolate_float32(x, dest.shape[2:], **interp_args)
+
+class BasicMultiUpdateBlock(nn.Module):
+ def __init__(self, args, hidden_dims=[], out_dims=2, tuning_mode=None):
+ super().__init__()
+ self.args = args
+ self.n_gru_layers = args.model.decode_head.n_gru_layers # 3
+ self.n_downsample = args.model.decode_head.n_downsample # 3, resolution of the disparity field (1/2^K)
+
+ # self.encoder = BasicMotionEncoder(args)
+ # encoder_output_dim = 128 # if there is corr volume
+ encoder_output_dim = 6 # no corr volume
+
+ self.gru08 = ConvGRU(hidden_dims[2], encoder_output_dim + hidden_dims[1] * (self.n_gru_layers > 1), tuning_mode=tuning_mode)
+ self.gru16 = ConvGRU(hidden_dims[1], hidden_dims[0] * (self.n_gru_layers == 3) + hidden_dims[2], tuning_mode=tuning_mode)
+ self.gru32 = ConvGRU(hidden_dims[0], hidden_dims[1], tuning_mode=tuning_mode)
+ self.flow_head = FlowHead(hidden_dims[2], hidden_dim=2*hidden_dims[2], tuning_mode=tuning_mode)
+ factor = 2**self.n_downsample
+
+ self.mask = nn.Sequential(
+ Conv2dLoRA(hidden_dims[2], hidden_dims[2], 3, padding=1, r = 8 if tuning_mode == 'lora' else 0),
+ nn.ReLU(inplace=True),
+ Conv2dLoRA(hidden_dims[2], (factor**2)*9, 1, padding=0, r = 8 if tuning_mode == 'lora' else 0))
+
+ def forward(self, net, inp, corr=None, flow=None, iter08=True, iter16=True, iter32=True, update=True):
+
+ if iter32:
+ net[2] = self.gru32(net[2], *(inp[2]), pool2x(net[1]))
+ if iter16:
+ if self.n_gru_layers > 2:
+ net[1] = self.gru16(net[1], *(inp[1]), interp(pool2x(net[0]), net[1]), interp(net[2], net[1]))
+ else:
+ net[1] = self.gru16(net[1], *(inp[1]), interp(pool2x(net[0]), net[1]))
+ if iter08:
+ if corr is not None:
+ motion_features = self.encoder(flow, corr)
+ else:
+ motion_features = flow
+ if self.n_gru_layers > 1:
+ net[0] = self.gru08(net[0], *(inp[0]), motion_features, interp(net[1], net[0]))
+ else:
+ net[0] = self.gru08(net[0], *(inp[0]), motion_features)
+
+ if not update:
+ return net
+
+ delta_flow = self.flow_head(net[0])
+
+        # scale mask to balance gradients
+ mask = .25 * self.mask(net[0])
+ return net, mask, delta_flow
+
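+# NOTE: BasicMultiUpdateBlock is a RAFT-Stereo style multi-level GRU. The three
+# ConvGRUs (gru32 -> gru16 -> gru08, coarse to fine) exchange information via
+# pooling and interpolation; the finest level predicts a 6-channel delta
+# (1 depth + 1 confidence + 4 normal channels) through FlowHead, together with
+# the convex-upsampling `mask` that upsample_flow consumes.
+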
+class LayerNorm2d(nn.LayerNorm):
+ def __init__(self, dim):
+ super(LayerNorm2d, self).__init__(dim)
+
+ def forward(self, x):
+ x = x.permute(0, 2, 3, 1).contiguous()
+ x = super(LayerNorm2d, self).forward(x)
+ x = x.permute(0, 3, 1, 2).contiguous()
+ return x
+
+class ResidualBlock(nn.Module):
+ def __init__(self, in_planes, planes, norm_fn='group', stride=1, tuning_mode=None):
+ super(ResidualBlock, self).__init__()
+
+ self.conv1 = Conv2dLoRA(in_planes, planes, kernel_size=3, padding=1, stride=stride, r = 8 if tuning_mode == 'lora' else 0)
+ self.conv2 = Conv2dLoRA(planes, planes, kernel_size=3, padding=1, r = 8 if tuning_mode == 'lora' else 0)
+ self.relu = nn.ReLU(inplace=True)
+
+ num_groups = planes // 8
+
+ if norm_fn == 'group':
+ self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
+ self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
+ if not (stride == 1 and in_planes == planes):
+ self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
+
+ elif norm_fn == 'batch':
+ self.norm1 = nn.BatchNorm2d(planes)
+ self.norm2 = nn.BatchNorm2d(planes)
+ if not (stride == 1 and in_planes == planes):
+ self.norm3 = nn.BatchNorm2d(planes)
+
+ elif norm_fn == 'instance':
+ self.norm1 = nn.InstanceNorm2d(planes)
+ self.norm2 = nn.InstanceNorm2d(planes)
+ if not (stride == 1 and in_planes == planes):
+ self.norm3 = nn.InstanceNorm2d(planes)
+
+ elif norm_fn == 'layer':
+ self.norm1 = LayerNorm2d(planes)
+ self.norm2 = LayerNorm2d(planes)
+ if not (stride == 1 and in_planes == planes):
+ self.norm3 = LayerNorm2d(planes)
+
+ elif norm_fn == 'none':
+ self.norm1 = nn.Sequential()
+ self.norm2 = nn.Sequential()
+ if not (stride == 1 and in_planes == planes):
+ self.norm3 = nn.Sequential()
+
+ if stride == 1 and in_planes == planes:
+ self.downsample = None
+
+ else:
+ self.downsample = nn.Sequential(
+ Conv2dLoRA(in_planes, planes, kernel_size=1, stride=stride, r = 8 if tuning_mode == 'lora' else 0), self.norm3)
+
+ def forward(self, x):
+ y = x
+ y = self.conv1(y)
+ y = self.norm1(y)
+ y = self.relu(y)
+ y = self.conv2(y)
+ y = self.norm2(y)
+ y = self.relu(y)
+
+ if self.downsample is not None:
+ x = self.downsample(x)
+
+ return self.relu(x+y)
+
+
+class ContextFeatureEncoder(nn.Module):
+ '''
+ Encoder features are used to:
+ 1. initialize the hidden state of the update operator
+ 2. and also injected into the GRU during each iteration of the update operator
+ '''
+ def __init__(self, in_dim, output_dim, tuning_mode=None):
+ '''
+        in_dim = [x4, x8, x16, x32] channel widths of the encoder features
+        output_dim = [hidden_dims, context_dims], each given per level as [x4, x8, x16, x32]
+ '''
+ super().__init__()
+
+ output_list = []
+ for dim in output_dim:
+ conv_out = nn.Sequential(
+ ResidualBlock(in_dim[0], dim[0], 'layer', stride=1, tuning_mode=tuning_mode),
+ Conv2dLoRA(dim[0], dim[0], 3, padding=1, r = 8 if tuning_mode == 'lora' else 0))
+ output_list.append(conv_out)
+
+ self.outputs04 = nn.ModuleList(output_list)
+
+ output_list = []
+ for dim in output_dim:
+ conv_out = nn.Sequential(
+ ResidualBlock(in_dim[1], dim[1], 'layer', stride=1, tuning_mode=tuning_mode),
+ Conv2dLoRA(dim[1], dim[1], 3, padding=1, r = 8 if tuning_mode == 'lora' else 0))
+ output_list.append(conv_out)
+
+ self.outputs08 = nn.ModuleList(output_list)
+
+ output_list = []
+ for dim in output_dim:
+ conv_out = nn.Sequential(
+ ResidualBlock(in_dim[2], dim[2], 'layer', stride=1, tuning_mode=tuning_mode),
+ Conv2dLoRA(dim[2], dim[2], 3, padding=1, r = 8 if tuning_mode == 'lora' else 0))
+ output_list.append(conv_out)
+
+ self.outputs16 = nn.ModuleList(output_list)
+
+ # output_list = []
+ # for dim in output_dim:
+ # conv_out = Conv2dLoRA(in_dim[3], dim[3], 3, padding=1)
+ # output_list.append(conv_out)
+
+ # self.outputs32 = nn.ModuleList(output_list)
+
+ def forward(self, encoder_features):
+ x_4, x_8, x_16, x_32 = encoder_features
+
+ outputs04 = [f(x_4) for f in self.outputs04]
+ outputs08 = [f(x_8) for f in self.outputs08]
+        outputs16 = [f(x_16) for f in self.outputs16]
+ # outputs32 = [f(x_32) for f in self.outputs32]
+
+ return (outputs04, outputs08, outputs16)
+
+class ConvBlock(nn.Module):
+ # reimplementation of DPT
+ def __init__(self, channels, tuning_mode=None):
+ super(ConvBlock, self).__init__()
+
+ self.act = nn.ReLU(inplace=True)
+ self.conv1 = Conv2dLoRA(
+ channels,
+ channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ r = 8 if tuning_mode == 'lora' else 0
+ )
+ self.conv2 = Conv2dLoRA(
+ channels,
+ channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ r = 8 if tuning_mode == 'lora' else 0
+ )
+
+ def forward(self, x):
+ out = self.act(x)
+ out = self.conv1(out)
+ out = self.act(out)
+ out = self.conv2(out)
+ return x + out
+
+class FuseBlock(nn.Module):
+ # reimplementation of DPT
+ def __init__(self, in_channels, out_channels, fuse=True, upsample=True, scale_factor=2, tuning_mode=None):
+ super(FuseBlock, self).__init__()
+
+ self.fuse = fuse
+ self.scale_factor = scale_factor
+ self.way_trunk = ConvBlock(in_channels, tuning_mode=tuning_mode)
+ if self.fuse:
+ self.way_branch = ConvBlock(in_channels, tuning_mode=tuning_mode)
+
+ self.out_conv = Conv2dLoRA(
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ r = 8 if tuning_mode == 'lora' else 0
+ )
+ self.upsample = upsample
+
+ def forward(self, x1, x2=None):
+ if x2 is not None:
+ x2 = self.way_branch(x2)
+ x1 = x1 + x2
+
+ out = self.way_trunk(x1)
+
+ if self.upsample:
+ out = interpolate_float32(
+ out, scale_factor=self.scale_factor, mode="bilinear", align_corners=True
+ )
+ out = self.out_conv(out)
+ return out
+
+class Readout(nn.Module):
+ # From DPT
+ def __init__(self, in_features, use_cls_token=True, num_register_tokens=0, tuning_mode=None):
+ super(Readout, self).__init__()
+ self.use_cls_token = use_cls_token
+ if self.use_cls_token == True:
+ self.project_patch = LoRALinear(in_features, in_features, r = 8 if tuning_mode == 'lora' else 0)
+ self.project_learn = LoRALinear((1 + num_register_tokens) * in_features, in_features, bias=False, r = 8 if tuning_mode == 'lora' else 0)
+ self.act = nn.GELU()
+ else:
+ self.project = nn.Identity()
+
+ def forward(self, x):
+
+ if self.use_cls_token == True:
+ x_patch = self.project_patch(x[0])
+ x_learn = self.project_learn(x[1])
+ x_learn = x_learn.expand_as(x_patch).contiguous()
+ features = x_patch + x_learn
+ return self.act(features)
+ else:
+ return self.project(x)
+
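+# NOTE: Readout is the DPT-style token "readout": the cls/register tokens and
+# the patch tokens are each projected to the embedding width, summed and passed
+# through GELU. Token2Feature below then permutes the result to an image-like
+# (B, C, H, W) map and rescales it to the target pyramid level.
+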
+class Token2Feature(nn.Module):
+ # From DPT
+ def __init__(self, vit_channel, feature_channel, scale_factor, use_cls_token=True, num_register_tokens=0, tuning_mode=None):
+ super(Token2Feature, self).__init__()
+ self.scale_factor = scale_factor
+ self.readoper = Readout(in_features=vit_channel, use_cls_token=use_cls_token, num_register_tokens=num_register_tokens, tuning_mode=tuning_mode)
+ if scale_factor > 1 and isinstance(scale_factor, int):
+ self.sample = ConvTranspose2dLoRA(r = 8 if tuning_mode == 'lora' else 0,
+ in_channels=vit_channel,
+ out_channels=feature_channel,
+ kernel_size=scale_factor,
+ stride=scale_factor,
+ padding=0,
+ )
+
+ elif scale_factor > 1:
+ self.sample = nn.Sequential(
+ # Upsample2(upscale=scale_factor),
+ # nn.Upsample(scale_factor=scale_factor),
+ Conv2dLoRA(r = 8 if tuning_mode == 'lora' else 0,
+ in_channels=vit_channel,
+ out_channels=feature_channel,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ),
+ )
+
+
+ elif scale_factor < 1:
+ scale_factor = int(1.0 / scale_factor)
+ self.sample = Conv2dLoRA(r = 8 if tuning_mode == 'lora' else 0,
+ in_channels=vit_channel,
+ out_channels=feature_channel,
+ kernel_size=scale_factor+1,
+ stride=scale_factor,
+ padding=1,
+ )
+
+ else:
+ self.sample = nn.Identity()
+
+ def forward(self, x):
+ x = self.readoper(x)
+ #if use_cls_token == True:
+ x = x.permute(0, 3, 1, 2).contiguous()
+ if isinstance(self.scale_factor, float):
+ x = interpolate_float32(x.float(), scale_factor=self.scale_factor, mode='nearest')
+ x = self.sample(x)
+ return x
+
+class EncoderFeature(nn.Module):
+ def __init__(self, vit_channel, num_ch_dec=[256, 512, 1024, 1024], use_cls_token=True, num_register_tokens=0, tuning_mode=None):
+ super(EncoderFeature, self).__init__()
+ self.vit_channel = vit_channel
+ self.num_ch_dec = num_ch_dec
+
+ self.read_3 = Token2Feature(self.vit_channel, self.num_ch_dec[3], scale_factor=1, use_cls_token=use_cls_token, num_register_tokens=num_register_tokens, tuning_mode=tuning_mode)
+ self.read_2 = Token2Feature(self.vit_channel, self.num_ch_dec[2], scale_factor=1, use_cls_token=use_cls_token, num_register_tokens=num_register_tokens, tuning_mode=tuning_mode)
+ self.read_1 = Token2Feature(self.vit_channel, self.num_ch_dec[1], scale_factor=2, use_cls_token=use_cls_token, num_register_tokens=num_register_tokens, tuning_mode=tuning_mode)
+ self.read_0 = Token2Feature(self.vit_channel, self.num_ch_dec[0], scale_factor=7/2, use_cls_token=use_cls_token, num_register_tokens=num_register_tokens, tuning_mode=tuning_mode)
+
+ def forward(self, ref_feature):
+ x = self.read_3(ref_feature[3]) # 1/14
+ x2 = self.read_2(ref_feature[2]) # 1/14
+ x1 = self.read_1(ref_feature[1]) # 1/7
+ x0 = self.read_0(ref_feature[0]) # 1/4
+
+ return x, x2, x1, x0
+
+class DecoderFeature(nn.Module):
+ def __init__(self, vit_channel, num_ch_dec=[128, 256, 512, 1024, 1024], use_cls_token=True, tuning_mode=None):
+ super(DecoderFeature, self).__init__()
+ self.vit_channel = vit_channel
+ self.num_ch_dec = num_ch_dec
+
+ self.upconv_3 = FuseBlock(
+ self.num_ch_dec[4],
+ self.num_ch_dec[3],
+ fuse=False, upsample=False, tuning_mode=tuning_mode)
+
+ self.upconv_2 = FuseBlock(
+ self.num_ch_dec[3],
+ self.num_ch_dec[2],
+ tuning_mode=tuning_mode)
+
+ self.upconv_1 = FuseBlock(
+ self.num_ch_dec[2],
+ self.num_ch_dec[1] + 2,
+ scale_factor=7/4,
+ tuning_mode=tuning_mode)
+
+ # self.upconv_0 = FuseBlock(
+ # self.num_ch_dec[1],
+ # self.num_ch_dec[0] + 1,
+ # )
+
+ def forward(self, ref_feature):
+ x, x2, x1, x0 = ref_feature # 1/14 1/14 1/7 1/4
+
+ x = self.upconv_3(x) # 1/14
+ x = self.upconv_2(x, x2) # 1/7
+ x = self.upconv_1(x, x1) # 1/4
+ # x = self.upconv_0(x, x0) # 4/7
+ return x
+
+class RAFTDepthNormalDPT5(nn.Module):
+ def __init__(self, cfg):
+ super().__init__()
+ self.in_channels = cfg.model.decode_head.in_channels # [1024, 1024, 1024, 1024]
+ self.feature_channels = cfg.model.decode_head.feature_channels # [256, 512, 1024, 1024] [2/7, 1/7, 1/14, 1/14]
+ self.decoder_channels = cfg.model.decode_head.decoder_channels # [128, 256, 512, 1024, 1024] [-, 1/4, 1/7, 1/14, 1/14]
+ self.use_cls_token = cfg.model.decode_head.use_cls_token
+ self.up_scale = cfg.model.decode_head.up_scale
+ self.num_register_tokens = cfg.model.decode_head.num_register_tokens
+ self.min_val = cfg.data_basic.depth_normalize[0]
+ self.max_val = cfg.data_basic.depth_normalize[1]
+        self.regress_scale = 100.0
+
+ try:
+ tuning_mode = cfg.model.decode_head.tuning_mode
+        except (AttributeError, KeyError):
+ tuning_mode = None
+ self.tuning_mode = tuning_mode
+
+ self.hidden_dims = self.context_dims = cfg.model.decode_head.hidden_channels # [128, 128, 128, 128]
+ self.n_gru_layers = cfg.model.decode_head.n_gru_layers # 3
+ self.n_downsample = cfg.model.decode_head.n_downsample # 3, resolution of the disparity field (1/2^K)
+ self.iters = cfg.model.decode_head.iters # 22
+ self.slow_fast_gru = cfg.model.decode_head.slow_fast_gru # True
+
+ self.num_depth_regressor_anchor = 256 # 512
+ self.used_res_channel = self.decoder_channels[1] # now, use 2/7 res
+ self.token2feature = EncoderFeature(self.in_channels[0], self.feature_channels, self.use_cls_token, self.num_register_tokens, tuning_mode=tuning_mode)
+ self.decoder_mono = DecoderFeature(self.in_channels, self.decoder_channels, tuning_mode=tuning_mode)
+ self.depth_regressor = nn.Sequential(
+ Conv2dLoRA(self.used_res_channel,
+ self.num_depth_regressor_anchor,
+ kernel_size=3,
+ padding=1, r = 8 if tuning_mode == 'lora' else 0),
+ # nn.BatchNorm2d(self.num_depth_regressor_anchor),
+ nn.ReLU(inplace=True),
+ Conv2dLoRA(self.num_depth_regressor_anchor,
+ self.num_depth_regressor_anchor,
+ kernel_size=1, r = 8 if tuning_mode == 'lora' else 0),
+ )
+ self.normal_predictor = nn.Sequential(
+ Conv2dLoRA(self.used_res_channel,
+ 128,
+ kernel_size=3,
+ padding=1, r = 8 if tuning_mode == 'lora' else 0,),
+ # nn.BatchNorm2d(128),
+ nn.ReLU(inplace=True),
+ Conv2dLoRA(128, 128, kernel_size=1, r = 8 if tuning_mode == 'lora' else 0), nn.ReLU(inplace=True),
+ Conv2dLoRA(128, 128, kernel_size=1, r = 8 if tuning_mode == 'lora' else 0), nn.ReLU(inplace=True),
+ Conv2dLoRA(128, 3, kernel_size=1, r = 8 if tuning_mode == 'lora' else 0),
+ )
+
+ self.context_feature_encoder = ContextFeatureEncoder(self.feature_channels, [self.hidden_dims, self.context_dims], tuning_mode=tuning_mode)
+ self.context_zqr_convs = nn.ModuleList([Conv2dLoRA(self.context_dims[i], self.hidden_dims[i]*3, 3, padding=3//2, r = 8 if tuning_mode == 'lora' else 0) for i in range(self.n_gru_layers)])
+ self.update_block = BasicMultiUpdateBlock(cfg, hidden_dims=self.hidden_dims, out_dims=6, tuning_mode=tuning_mode)
+
+ self.relu = nn.ReLU(inplace=True)
+
+ def get_bins(self, bins_num):
+ depth_bins_vec = torch.linspace(math.log(self.min_val), math.log(self.max_val), bins_num, device=next(self.parameters()).device)
+ depth_bins_vec = torch.exp(depth_bins_vec)
+ return depth_bins_vec
+
+ def register_depth_expectation_anchor(self, bins_num, B):
+ depth_bins_vec = self.get_bins(bins_num)
+ depth_bins_vec = depth_bins_vec.unsqueeze(0).repeat(B, 1)
+ self.register_buffer('depth_expectation_anchor', depth_bins_vec, persistent=False)
+
+ def clamp(self, x):
+ y = self.relu(x - self.min_val) + self.min_val
+ y = self.max_val - self.relu(self.max_val - y)
+ return y
+
+ def regress_depth(self, feature_map_d):
+ prob_feature = self.depth_regressor(feature_map_d)
+ prob = prob_feature.softmax(dim=1)
+ #prob = prob_feature.float().softmax(dim=1)
+
+ ## Error logging
+ if torch.isnan(prob).any():
+ print('prob_feat_nan!!!')
+ if torch.isinf(prob).any():
+ print('prob_feat_inf!!!')
+
+ # h = prob[0,:,0,0].cpu().numpy().reshape(-1)
+ # import matplotlib.pyplot as plt
+ # plt.bar(range(len(h)), h)
+ B = prob.shape[0]
+ if "depth_expectation_anchor" not in self._buffers:
+ self.register_depth_expectation_anchor(self.num_depth_regressor_anchor, B)
+ d = compute_depth_expectation(
+ prob,
+ self.depth_expectation_anchor[:B, ...]).unsqueeze(1)
+
+ ## Error logging
+        if torch.isnan(d).any():
+            print('d_nan!!!')
+        if torch.isinf(d).any():
+            print('d_inf!!!')
+
+        return (self.clamp(d) - self.max_val) / self.regress_scale, prob_feature
+
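+    # NOTE: regress_depth returns the metric depth remapped to
+    # (clamp(d) - max_val) / regress_scale so the GRU state stays in a small,
+    # well-conditioned range (regress_scale is 100 here); forward() applies the
+    # inverse mapping, x * regress_scale + max_val followed by clamp, before a
+    # prediction is appended to the output list.
+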
+ def pred_normal(self, feature_map, confidence):
+ normal_out = self.normal_predictor(feature_map)
+
+ ## Error logging
+ if torch.isnan(normal_out).any():
+ print('norm_nan!!!')
+ if torch.isinf(normal_out).any():
+ print('norm_feat_inf!!!')
+
+ return norm_normalize(torch.cat([normal_out, confidence], dim=1))
+ #return norm_normalize(torch.cat([normal_out, confidence], dim=1).float())
+
+ def create_mesh_grid(self, height, width, batch, device="cuda", set_buffer=True):
+ y, x = torch.meshgrid([torch.arange(0, height, dtype=torch.float32, device=device),
+ torch.arange(0, width, dtype=torch.float32, device=device)], indexing='ij')
+ meshgrid = torch.stack((x, y))
+ meshgrid = meshgrid.unsqueeze(0).repeat(batch, 1, 1, 1)
+ #self.register_buffer('meshgrid', meshgrid, persistent=False)
+ return meshgrid
+
+ def upsample_flow(self, flow, mask):
+ """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """
+ N, D, H, W = flow.shape
+ factor = 2 ** self.n_downsample
+ mask = mask.view(N, 1, 9, factor, factor, H, W)
+ mask = torch.softmax(mask, dim=2)
+ #mask = torch.softmax(mask.float(), dim=2)
+
+ #up_flow = F.unfold(factor * flow, [3,3], padding=1)
+ up_flow = F.unfold(flow, [3,3], padding=1)
+ up_flow = up_flow.view(N, D, 9, 1, 1, H, W)
+
+ up_flow = torch.sum(mask * up_flow, dim=2)
+ up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
+ return up_flow.reshape(N, D, factor*H, factor*W)
+
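+    # NOTE: upsample_flow is RAFT-style convex upsampling. Each fine-resolution
+    # value is a convex combination (softmax weights) of its 3x3 coarse-grid
+    # neighbourhood:
+    #   flow: (N, D, H, W), mask: (N, 9 * factor**2, H, W) -> (N, D, factor*H, factor*W)
+    # with factor = 2 ** n_downsample.
+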
+ def initialize_flow(self, img):
+ """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0"""
+ N, _, H, W = img.shape
+
+ coords0 = coords_grid(N, H, W).to(img.device)
+ coords1 = coords_grid(N, H, W).to(img.device)
+
+ return coords0, coords1
+
+ def upsample(self, x, scale_factor=2):
+        """Upsample input tensor by scale_factor * up_scale / 8 (nearest neighbour)."""
+ return interpolate_float32(x, scale_factor=scale_factor*self.up_scale/8, mode="nearest")
+
+ def forward(self, vit_features, **kwargs):
+ ## read vit token to multi-scale features
+ B, H, W, _, _, num_register_tokens = vit_features[1]
+ vit_features = vit_features[0]
+
+ ## Error logging
+ if torch.isnan(vit_features[0]).any():
+ print('vit_feature_nan!!!')
+ if torch.isinf(vit_features[0]).any():
+ print('vit_feature_inf!!!')
+
+ if self.use_cls_token == True:
+ vit_features = [[ft[:, 1+num_register_tokens:, :].view(B, H, W, self.in_channels[0]), \
+ ft[:, 0:1+num_register_tokens, :].view(B, 1, 1, self.in_channels[0] * (1+num_register_tokens))] for ft in vit_features]
+ else:
+ vit_features = [ft.view(B, H, W, self.in_channels[0]) for ft in vit_features]
+ encoder_features = self.token2feature(vit_features) # 1/14, 1/14, 1/7, 1/4
+
+ ## Error logging
+ for en_ft in encoder_features:
+ if torch.isnan(en_ft).any():
+ print('decoder_feature_nan!!!')
+ print(en_ft.shape)
+ if torch.isinf(en_ft).any():
+ print('decoder_feature_inf!!!')
+ print(en_ft.shape)
+
+ ## decode features to init-depth (and confidence)
+ ref_feat= self.decoder_mono(encoder_features) # now, 1/4 for depth
+
+ ## Error logging
+ if torch.isnan(ref_feat).any():
+ print('ref_feat_nan!!!')
+ if torch.isinf(ref_feat).any():
+ print('ref_feat_inf!!!')
+
+ feature_map = ref_feat[:, :-2, :, :] # feature map share of depth and normal prediction
+ depth_confidence_map = ref_feat[:, -2:-1, :, :]
+ normal_confidence_map = ref_feat[:, -1:, :, :]
+ depth_pred, binmap = self.regress_depth(feature_map) # regress bin for depth
+ normal_pred = self.pred_normal(feature_map, normal_confidence_map) # mlp for normal
+
+ depth_init = torch.cat((depth_pred, depth_confidence_map, normal_pred), dim=1) # (N, 1+1+4, H, W)
+
+ ## encoder features to context-feature for init-hidden-state and contex-features
+ cnet_list = self.context_feature_encoder(encoder_features[::-1])
+ net_list = [torch.tanh(x[0]) for x in cnet_list] # x_4, x_8, x_16 of hidden state
+ inp_list = [torch.relu(x[1]) for x in cnet_list] # x_4, x_8, x_16 context features
+
+ # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning
+ inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)]
+
+ coords0, coords1 = self.initialize_flow(net_list[0])
+ if depth_init is not None:
+ coords1 = coords1 + depth_init
+
+ if self.training:
+ low_resolution_init = [self.clamp(depth_init[:,:1] * self.regress_scale + self.max_val), depth_init[:,1:2], norm_normalize(depth_init[:,2:].clone())]
+ init_depth = upflow4(depth_init)
+ flow_predictions = [self.clamp(init_depth[:,:1] * self.regress_scale + self.max_val)]
+ conf_predictions = [init_depth[:,1:2]]
+ normal_outs = [norm_normalize(init_depth[:,2:].clone())]
+
+ else:
+ flow_predictions = []
+ conf_predictions = []
+ samples_pred_list = []
+ coord_list = []
+ normal_outs = []
+ low_resolution_init = []
+
+ for itr in range(self.iters):
+ # coords1 = coords1.detach()
+ flow = coords1 - coords0
+ if self.n_gru_layers == 3 and self.slow_fast_gru: # Update low-res GRU
+ net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False)
+ if self.n_gru_layers >= 2 and self.slow_fast_gru:# Update low-res GRU and mid-res GRU
+ net_list = self.update_block(net_list, inp_list, iter32=self.n_gru_layers==3, iter16=True, iter08=False, update=False)
+ net_list, up_mask, delta_flow = self.update_block(net_list, inp_list, None, flow, iter32=self.n_gru_layers==3, iter16=self.n_gru_layers>=2)
+
+ # F(t+1) = F(t) + \Delta(t)
+ coords1 = coords1 + delta_flow
+
+ # We do not need to upsample or output intermediate results in test_mode
+ #if (not self.training) and itr < self.iters-1:
+ #continue
+
+ # upsample predictions
+ if up_mask is None:
+ flow_up = self.upsample(coords1-coords0, 4)
+ else:
+ flow_up = self.upsample_flow(coords1 - coords0, up_mask)
+ # flow_up = self.upsample(coords1-coords0, 4)
+
+ flow_predictions.append(self.clamp(flow_up[:,:1] * self.regress_scale + self.max_val))
+ conf_predictions.append(flow_up[:,1:2])
+ normal_outs.append(norm_normalize(flow_up[:,2:].clone()))
+
+ outputs=dict(
+ prediction=flow_predictions[-1],
+ predictions_list=flow_predictions,
+ confidence=conf_predictions[-1],
+ confidence_list=conf_predictions,
+ pred_logit=None,
+ # samples_pred_list=samples_pred_list,
+ # coord_list=coord_list,
+ prediction_normal=normal_outs[-1],
+ normal_out_list=normal_outs,
+ low_resolution_init=low_resolution_init,
+ )
+
+ return outputs
+
+
+if __name__ == "__main__":
+ try:
+ from custom_mmpkg.custom_mmcv.utils import Config
+    except ImportError:
+ from mmengine import Config
+ cfg = Config.fromfile('/cpfs01/shared/public/users/mu.hu/monodepth/mono/configs/RAFTDecoder/vit.raft.full2t.py')
+ cfg.model.decode_head.in_channels = [384, 384, 384, 384]
+ cfg.model.decode_head.feature_channels = [96, 192, 384, 768]
+ cfg.model.decode_head.decoder_channels = [48, 96, 192, 384, 384]
+ cfg.model.decode_head.hidden_channels = [48, 48, 48, 48, 48]
+ cfg.model.decode_head.up_scale = 7
+
+ # cfg.model.decode_head.use_cls_token = True
+ # vit_feature = [[torch.rand((2, 20, 60, 384)).cuda(), torch.rand(2, 384).cuda()], \
+ # [torch.rand((2, 20, 60, 384)).cuda(), torch.rand(2, 384).cuda()], \
+ # [torch.rand((2, 20, 60, 384)).cuda(), torch.rand(2, 384).cuda()], \
+ # [torch.rand((2, 20, 60, 384)).cuda(), torch.rand(2, 384).cuda()]]
+
+ cfg.model.decode_head.use_cls_token = True
+ cfg.model.decode_head.num_register_tokens = 4
+ vit_feature = [[torch.rand((2, (74 * 74) + 5, 384)).cuda(),\
+ torch.rand((2, (74 * 74) + 5, 384)).cuda(), \
+ torch.rand((2, (74 * 74) + 5, 384)).cuda(), \
+ torch.rand((2, (74 * 74) + 5, 384)).cuda()], (2, 74, 74, 1036, 1036, 4)]
+
+ decoder = RAFTDepthNormalDPT5(cfg).cuda()
+ output = decoder(vit_feature)
+ temp = 1
+
+
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e696288864e9b4487b28ccce1e749cea9811491
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/decode_heads/__init__.py
@@ -0,0 +1,4 @@
+from .HourGlassDecoder import HourglassDecoder
+from .RAFTDepthNormalDPTDecoder5 import RAFTDepthNormalDPT5
+
+__all__=['HourglassDecoder', 'RAFTDepthNormalDPT5']
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/__base_model__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/__base_model__.py
new file mode 100644
index 0000000000000000000000000000000000000000..302f969be25305a0766bba4adb31a73d3b22b41b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/__base_model__.py
@@ -0,0 +1,20 @@
+import torch
+import torch.nn as nn
+from custom_controlnet_aux.metric3d.mono.utils.comm import get_func
+
+
+class BaseDepthModel(nn.Module):
+ def __init__(self, cfg, **kwargs) -> None:
+ super(BaseDepthModel, self).__init__()
+ model_type = cfg.model.type
+ self.depth_model = get_func('custom_controlnet_aux.metric3d.mono.model.model_pipelines.' + model_type)(cfg)
+
+ def forward(self, data):
+ output = self.depth_model(**data)
+
+ return output['prediction'], output['confidence'], output
+
+ def inference(self, data):
+ with torch.no_grad():
+ pred_depth, confidence, _ = self.forward(data)
+ return pred_depth, confidence
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..722280c2f26d8cb06247469346dff7cd18234ee2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/__init__.py
@@ -0,0 +1,6 @@
+
+from .dense_pipeline import DensePredModel
+from .__base_model__ import BaseDepthModel
+__all__ = [
+ 'DensePredModel', 'BaseDepthModel',
+]
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/dense_pipeline.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/dense_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..62bc5582b4da080c2bb0a8992af85c0fba75082a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/model_pipelines/dense_pipeline.py
@@ -0,0 +1,16 @@
+import torch
+import torch.nn as nn
+from custom_controlnet_aux.metric3d.mono.utils.comm import get_func
+
+class DensePredModel(nn.Module):
+ def __init__(self, cfg) -> None:
+ super(DensePredModel, self).__init__()
+
+ self.encoder = get_func('custom_controlnet_aux.metric3d.mono.model.' + cfg.model.backbone.prefix + cfg.model.backbone.type)(**cfg.model.backbone)
+ self.decoder = get_func('custom_controlnet_aux.metric3d.mono.model.' + cfg.model.decode_head.prefix + cfg.model.decode_head.type)(cfg)
+
+ def forward(self, input, **kwargs):
+ # [f_32, f_16, f_8, f_4]
+ features = self.encoder(input)
+ out = self.decoder(features, **kwargs)
+ return out
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/monodepth_model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/monodepth_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0592970c5fcfc3cac8446b9e833478a0dcbda35
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/model/monodepth_model.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+from .model_pipelines.__base_model__ import BaseDepthModel
+
+class DepthModel(BaseDepthModel):
+    def __init__(self, cfg, **kwargs):
+ super(DepthModel, self).__init__(cfg)
+ model_type = cfg.model.type
+
+ def inference(self, data):
+ with torch.no_grad():
+ pred_depth, confidence, output_dict = self.forward(data)
+ return pred_depth, confidence, output_dict
+
+def get_monodepth_model(
+ cfg : dict,
+ **kwargs
+ ) -> nn.Module:
+ # config depth model
+ model = DepthModel(cfg, **kwargs)
+ #model.init_weights(load_imagenet_model, imagenet_ckpt_fpath)
+ assert isinstance(model, nn.Module)
+ return model
+
+def get_configured_monodepth_model(
+ cfg: dict,
+ ) -> nn.Module:
+    """
+    Args:
+        cfg: config object for the network (e.g. built via Config.fromfile).
+    Returns:
+        model: the configured depth model (an nn.Module).
+    """
+ model = get_monodepth_model(cfg)
+ return model
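+
+# NOTE: a minimal usage sketch, assuming cfg was built with Config.fromfile(...)
+# and cfg.model.type selects DensePredModel as in the configs used here
+# (checkpoint loading is handled separately, e.g. via load_ckpt):
+#   model = get_configured_monodepth_model(cfg).cuda().eval()
+#   pred_depth, confidence, out = model.inference(dict(input=rgb))  # rgb: (B, 3, H, W)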
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/tools/test_scale_cano.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/tools/test_scale_cano.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ef7d69fc0ed61c3e640fd1dc10abe40b54122d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/tools/test_scale_cano.py
@@ -0,0 +1,161 @@
+import os
+import os.path as osp
+import cv2
+import time
+import sys
+CODE_SPACE=os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(CODE_SPACE)
+import argparse
+import custom_mmpkg.custom_mmcv as mmcv
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+try:
+ from custom_mmpkg.custom_mmcv.utils import Config, DictAction
+except ImportError:
+ from mmengine import Config, DictAction
+from datetime import timedelta
+import random
+import numpy as np
+from custom_controlnet_aux.metric3d.mono.utils.logger import setup_logger
+import glob
+from custom_controlnet_aux.metric3d.mono.utils.comm import init_env
+from custom_controlnet_aux.metric3d.mono.model.monodepth_model import get_configured_monodepth_model
+from custom_controlnet_aux.metric3d.mono.utils.running import load_ckpt
+from custom_controlnet_aux.metric3d.mono.utils.do_test import do_scalecano_test_with_custom_data
+from custom_controlnet_aux.metric3d.mono.utils.mldb import load_data_info, reset_ckpt_path
+from custom_controlnet_aux.metric3d.mono.utils.custom_data import load_from_annos, load_data
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Test a monocular depth model')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--show-dir', help='the dir to save logs and visualization results')
+ parser.add_argument('--load-from', help='the checkpoint file to load weights from')
+ parser.add_argument('--node_rank', type=int, default=0)
+ parser.add_argument('--nnodes', type=int, default=1, help='number of nodes')
+ parser.add_argument('--options', nargs='+', action=DictAction, help='custom options')
+ parser.add_argument('--launcher', choices=['None', 'pytorch', 'slurm', 'mpi', 'ror'], default='slurm', help='job launcher')
+ parser.add_argument('--test_data_path', default='None', type=str, help='the path of test data')
+ parser.add_argument('--batch_size', default=1, type=int, help='the batch size for inference')
+ args = parser.parse_args()
+ return args
+
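+# NOTE: a hypothetical single-GPU invocation (all paths below are placeholders,
+# not files shipped with this repo):
+#   python mono/tools/test_scale_cano.py mono/configs/<decoder>/<config>.py \
+#       --load-from <checkpoint>.pth --test_data_path <image_dir> --launcher None
+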
+def main(args):
+ os.chdir(CODE_SPACE)
+ cfg = Config.fromfile(args.config)
+
+ if args.options is not None:
+ cfg.merge_from_dict(args.options)
+
+ # show_dir is determined in this priority: CLI > segment in file > filename
+ if args.show_dir is not None:
+ # update configs according to CLI args if args.show_dir is not None
+ cfg.show_dir = args.show_dir
+ else:
+        # use config filename + timestamp as default show_dir if args.show_dir is None
+ cfg.show_dir = osp.join('./show_dirs',
+ osp.splitext(osp.basename(args.config))[0],
+ args.timestamp)
+
+ # ckpt path
+ if args.load_from is None:
+ raise RuntimeError('Please set model path!')
+ cfg.load_from = args.load_from
+ cfg.batch_size = args.batch_size
+
+ # load data info
+ data_info = {}
+ load_data_info('data_info', data_info=data_info)
+ cfg.mldb_info = data_info
+ # update check point info
+ reset_ckpt_path(cfg.model, data_info)
+
+ # create show dir
+ os.makedirs(osp.abspath(cfg.show_dir), exist_ok=True)
+
+ # init the logger before other steps
+ cfg.log_file = osp.join(cfg.show_dir, f'{args.timestamp}.log')
+ logger = setup_logger(cfg.log_file)
+
+ # log some basic info
+ logger.info(f'Config:\n{cfg.pretty_text}')
+
+    # init distributed env first, since logger depends on the dist info
+ if args.launcher == 'None':
+ cfg.distributed = False
+ else:
+ cfg.distributed = True
+ init_env(args.launcher, cfg)
+ logger.info(f'Distributed training: {cfg.distributed}')
+
+ # dump config
+ cfg.dump(osp.join(cfg.show_dir, osp.basename(args.config)))
+ test_data_path = args.test_data_path
+ if not os.path.isabs(test_data_path):
+ test_data_path = osp.join(CODE_SPACE, test_data_path)
+
+ if 'json' in test_data_path:
+ test_data = load_from_annos(test_data_path)
+ else:
+ test_data = load_data(args.test_data_path)
+
+ if not cfg.distributed:
+ main_worker(0, cfg, args.launcher, test_data)
+ else:
+ # distributed training
+ if args.launcher == 'ror':
+ local_rank = cfg.dist_params.local_rank
+ main_worker(local_rank, cfg, args.launcher, test_data)
+ else:
+ mp.spawn(main_worker, nprocs=cfg.dist_params.num_gpus_per_node, args=(cfg, args.launcher, test_data))
+
+def main_worker(local_rank: int, cfg: dict, launcher: str, test_data: list):
+ if cfg.distributed:
+ cfg.dist_params.global_rank = cfg.dist_params.node_rank * cfg.dist_params.num_gpus_per_node + local_rank
+ cfg.dist_params.local_rank = local_rank
+
+ if launcher == 'ror':
+ init_torch_process_group(use_hvd=False)
+ else:
+ torch.cuda.set_device(local_rank)
+ default_timeout = timedelta(minutes=30)
+ dist.init_process_group(
+ backend=cfg.dist_params.backend,
+ init_method=cfg.dist_params.dist_url,
+ world_size=cfg.dist_params.world_size,
+ rank=cfg.dist_params.global_rank,
+ timeout=default_timeout)
+
+ logger = setup_logger(cfg.log_file)
+ # build model
+ model = get_configured_monodepth_model(cfg, )
+
+ # config distributed training
+ if cfg.distributed:
+ model = torch.nn.parallel.DistributedDataParallel(model.cuda(),
+ device_ids=[local_rank],
+ output_device=local_rank,
+ find_unused_parameters=True)
+ else:
+ model = torch.nn.DataParallel(model).cuda()
+
+ # load ckpt
+ model, _, _, _ = load_ckpt(cfg.load_from, model, strict_match=False)
+ model.eval()
+
+ do_scalecano_test_with_custom_data(
+ model,
+ cfg,
+ test_data,
+ logger,
+ cfg.distributed,
+ local_rank,
+ cfg.batch_size,
+ )
+
+if __name__ == '__main__':
+ args = parse_args()
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+ args.timestamp = timestamp
+ main(args)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3f5a12faa99758192ecc4ed3fc22c9249232e86
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/__init__.py
@@ -0,0 +1 @@
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/avg_meter.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/avg_meter.py
new file mode 100644
index 0000000000000000000000000000000000000000..23c35fb78a26821e4ef1224559749dae7b586849
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/avg_meter.py
@@ -0,0 +1,475 @@
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+
+
+class AverageMeter(object):
+ """Computes and stores the average and current value"""
+ def __init__(self) -> None:
+ self.reset()
+
+ def reset(self) -> None:
+ self.val = np.longdouble(0.0)
+ self.avg = np.longdouble(0.0)
+ self.sum = np.longdouble(0.0)
+ self.count = np.longdouble(0.0)
+
+ def update(self, val, n: float = 1) -> None:
+ self.val = val
+ self.sum += val
+ self.count += n
+ self.avg = self.sum / (self.count + 1e-6)
+
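+# Illustrative usage of AverageMeter above (kept as a comment so importing the module stays side-effect free):
+#   meter = AverageMeter()
+#   meter.update(0.5)          # val=0.5, sum=0.5, count=1
+#   meter.update(1.5)          # val=1.5, sum=2.0, count=2
+#   meter.avg                  # ~1.0, i.e. sum / (count + 1e-6)
+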
+class MetricAverageMeter(AverageMeter):
+ """
+    An AverageMeter designed for tracking depth and surface-normal evaluation metrics.
+ """
+ def __init__(self, metrics: list) -> None:
+ """ Initialize object. """
+ # average meters for metrics
+ self.abs_rel = AverageMeter()
+ self.rmse = AverageMeter()
+ self.silog = AverageMeter()
+ self.delta1 = AverageMeter()
+ self.delta2 = AverageMeter()
+ self.delta3 = AverageMeter()
+
+ self.metrics = metrics
+
+ self.consistency = AverageMeter()
+ self.log10 = AverageMeter()
+ self.rmse_log = AverageMeter()
+ self.sq_rel = AverageMeter()
+
+ # normal
+ self.normal_mean = AverageMeter()
+ self.normal_rmse = AverageMeter()
+ self.normal_a1 = AverageMeter()
+ self.normal_a2 = AverageMeter()
+
+ self.normal_median = AverageMeter()
+ self.normal_a3 = AverageMeter()
+ self.normal_a4 = AverageMeter()
+ self.normal_a5 = AverageMeter()
+
+
+ def update_metrics_cpu(self,
+ pred: torch.Tensor,
+ target: torch.Tensor,
+ mask: torch.Tensor,):
+ """
+ Update metrics on cpu
+ """
+
+ assert pred.shape == target.shape
+
+ if len(pred.shape) == 3:
+ pred = pred[:, None, :, :]
+ target = target[:, None, :, :]
+ mask = mask[:, None, :, :]
+ elif len(pred.shape) == 2:
+ pred = pred[None, None, :, :]
+ target = target[None, None, :, :]
+ mask = mask[None, None, :, :]
+
+
+ # Absolute relative error
+ abs_rel_sum, valid_pics = get_absrel_err(pred, target, mask)
+ abs_rel_sum = abs_rel_sum.numpy()
+ valid_pics = valid_pics.numpy()
+ self.abs_rel.update(abs_rel_sum, valid_pics)
+
+ # squared relative error
+ sqrel_sum, _ = get_sqrel_err(pred, target, mask)
+ sqrel_sum = sqrel_sum.numpy()
+ self.sq_rel.update(sqrel_sum, valid_pics)
+
+ # root mean squared error
+ rmse_sum, _ = get_rmse_err(pred, target, mask)
+ rmse_sum = rmse_sum.numpy()
+ self.rmse.update(rmse_sum, valid_pics)
+
+ # log root mean squared error
+ log_rmse_sum, _ = get_rmse_log_err(pred, target, mask)
+ log_rmse_sum = log_rmse_sum.numpy()
+        self.rmse_log.update(log_rmse_sum, valid_pics)
+
+ # log10 error
+ log10_sum, _ = get_log10_err(pred, target, mask)
+ log10_sum = log10_sum.numpy()
+        self.log10.update(log10_sum, valid_pics)
+
+ # scale-invariant root mean squared error in log space
+ silog_sum, _ = get_silog_err(pred, target, mask)
+ silog_sum = silog_sum.numpy()
+ self.silog.update(silog_sum, valid_pics)
+
+ # ratio error, delta1, ....
+        delta1_sum, delta2_sum, delta3_sum, _ = get_ratio_err(pred, target, mask)
+ delta1_sum = delta1_sum.numpy()
+ delta2_sum = delta2_sum.numpy()
+ delta3_sum = delta3_sum.numpy()
+
+ self.delta1.update(delta1_sum, valid_pics)
+        self.delta2.update(delta2_sum, valid_pics)
+        self.delta3.update(delta3_sum, valid_pics)
+
+
+ def update_metrics_gpu(
+ self,
+ pred: torch.Tensor,
+ target: torch.Tensor,
+ mask: torch.Tensor,
+ is_distributed: bool,
+ pred_next: torch.tensor = None,
+ pose_f1_to_f2: torch.tensor = None,
+ intrinsic: torch.tensor = None):
+ """
+ Update metric on GPU. It supports distributed processing. If multiple machines are employed, please
+ set 'is_distributed' as True.
+ """
+ assert pred.shape == target.shape
+
+ if len(pred.shape) == 3:
+ pred = pred[:, None, :, :]
+ target = target[:, None, :, :]
+ mask = mask[:, None, :, :]
+ elif len(pred.shape) == 2:
+ pred = pred[None, None, :, :]
+ target = target[None, None, :, :]
+ mask = mask[None, None, :, :]
+
+
+ # Absolute relative error
+ abs_rel_sum, valid_pics = get_absrel_err(pred, target, mask)
+ if is_distributed:
+ dist.all_reduce(abs_rel_sum), dist.all_reduce(valid_pics)
+ abs_rel_sum = abs_rel_sum.cpu().numpy()
+ valid_pics = int(valid_pics)
+ self.abs_rel.update(abs_rel_sum, valid_pics)
+
+ # root mean squared error
+ rmse_sum, _ = get_rmse_err(pred, target, mask)
+ if is_distributed:
+ dist.all_reduce(rmse_sum)
+ rmse_sum = rmse_sum.cpu().numpy()
+ self.rmse.update(rmse_sum, valid_pics)
+
+ # log root mean squared error
+ log_rmse_sum, _ = get_rmse_log_err(pred, target, mask)
+ if is_distributed:
+ dist.all_reduce(log_rmse_sum)
+ log_rmse_sum = log_rmse_sum.cpu().numpy()
+ self.rmse_log.update(log_rmse_sum, valid_pics)
+
+ # log10 error
+ log10_sum, _ = get_log10_err(pred, target, mask)
+ if is_distributed:
+ dist.all_reduce(log10_sum)
+ log10_sum = log10_sum.cpu().numpy()
+ self.log10.update(log10_sum, valid_pics)
+
+ # scale-invariant root mean squared error in log space
+ silog_sum, _ = get_silog_err(pred, target, mask)
+ if is_distributed:
+ dist.all_reduce(silog_sum)
+ silog_sum = silog_sum.cpu().numpy()
+ self.silog.update(silog_sum, valid_pics)
+
+ # ratio error, delta1, ....
+ delta1_sum, delta2_sum, delta3_sum, _ = get_ratio_err(pred, target, mask)
+ if is_distributed:
+ dist.all_reduce(delta1_sum), dist.all_reduce(delta2_sum), dist.all_reduce(delta3_sum)
+ delta1_sum = delta1_sum.cpu().numpy()
+ delta2_sum = delta2_sum.cpu().numpy()
+ delta3_sum = delta3_sum.cpu().numpy()
+
+ self.delta1.update(delta1_sum, valid_pics)
+ self.delta2.update(delta2_sum, valid_pics)
+ self.delta3.update(delta3_sum, valid_pics)
+
+ # video consistency error
+ # consistency_rel_sum, valid_warps = get_video_consistency_err(pred, pred_next, pose_f1_to_f2, intrinsic)
+ # if is_distributed:
+ # dist.all_reduce(consistency_rel_sum), dist.all_reduce(valid_warps)
+ # consistency_rel_sum = consistency_rel_sum.cpu().numpy()
+ # valid_warps = int(valid_warps)
+ # self.consistency.update(consistency_rel_sum, valid_warps)
+
+ ## for surface normal
+ def update_normal_metrics_gpu(
+ self,
+ pred: torch.Tensor, # (B, 3, H, W)
+ target: torch.Tensor, # (B, 3, H, W)
+ mask: torch.Tensor, # (B, 1, H, W)
+ is_distributed: bool,
+ ):
+ """
+ Update metric on GPU. It supports distributed processing. If multiple machines are employed, please
+ set 'is_distributed' as True.
+ """
+ assert pred.shape == target.shape
+
+ valid_pics = torch.sum(mask, dtype=torch.float32) + 1e-6
+
+ if valid_pics < 10:
+ return
+
+ mean_error = rmse_error = a1_error = a2_error = dist_node_cnt = valid_pics
+ normal_error = torch.cosine_similarity(pred, target, dim=1)
+ normal_error = torch.clamp(normal_error, min=-1.0, max=1.0)
+ angle_error = torch.acos(normal_error) * 180.0 / torch.pi
+ angle_error = angle_error[:, None, :, :]
+ angle_error = angle_error[mask]
+ # Calculation error
+ mean_error = angle_error.sum() / valid_pics
+ rmse_error = torch.sqrt( torch.sum(torch.square(angle_error)) / valid_pics )
+ median_error = angle_error.median()
+ a1_error = 100.0 * (torch.sum(angle_error < 5) / valid_pics)
+ a2_error = 100.0 * (torch.sum(angle_error < 7.5) / valid_pics)
+
+ a3_error = 100.0 * (torch.sum(angle_error < 11.25) / valid_pics)
+ a4_error = 100.0 * (torch.sum(angle_error < 22.5) / valid_pics)
+ a5_error = 100.0 * (torch.sum(angle_error < 30) / valid_pics)
+
+ # if valid_pics > 1e-5:
+ # If the current node gets data with valid normal
+ dist_node_cnt = (valid_pics - 1e-6) / valid_pics
+
+ if is_distributed:
+ dist.all_reduce(dist_node_cnt)
+ dist.all_reduce(mean_error)
+ dist.all_reduce(rmse_error)
+ dist.all_reduce(a1_error)
+ dist.all_reduce(a2_error)
+
+ dist.all_reduce(a3_error)
+ dist.all_reduce(a4_error)
+ dist.all_reduce(a5_error)
+
+ dist_node_cnt = dist_node_cnt.cpu().numpy()
+ self.normal_mean.update(mean_error.cpu().numpy(), dist_node_cnt)
+ self.normal_rmse.update(rmse_error.cpu().numpy(), dist_node_cnt)
+ self.normal_a1.update(a1_error.cpu().numpy(), dist_node_cnt)
+ self.normal_a2.update(a2_error.cpu().numpy(), dist_node_cnt)
+
+ self.normal_median.update(median_error.cpu().numpy(), dist_node_cnt)
+ self.normal_a3.update(a3_error.cpu().numpy(), dist_node_cnt)
+ self.normal_a4.update(a4_error.cpu().numpy(), dist_node_cnt)
+ self.normal_a5.update(a5_error.cpu().numpy(), dist_node_cnt)
+
+
+    def get_metrics(self,):
+        """
+        Return the running average of each tracked metric as a dict.
+        """
+ metrics_dict = {}
+ for metric in self.metrics:
+ metrics_dict[metric] = self.__getattribute__(metric).avg
+ return metrics_dict
+
+
+def get_absrel_err(pred: torch.tensor,
+ target: torch.tensor,
+ mask: torch.tensor,
+ ):
+    """
+    Computes the absolute relative error.
+    Takes preprocessed depths (no nans, infs and non-positive values).
+    pred, target, and mask should be in the shape of [b, c, h, w].
+    """
+
+    assert len(pred.shape) == 4 and len(target.shape) == 4
+ b, c, h, w = pred.shape
+ mask = mask.to(torch.float)
+ t_m = target * mask
+ p_m = pred * mask
+
+ # Mean Absolute Relative Error
+ rel = torch.abs(t_m - p_m) / (t_m + 1e-10) # compute errors
+ abs_rel_sum = torch.sum(rel.reshape((b, c, -1)), dim=2) # [b, c]
+ num = torch.sum(mask.reshape((b, c, -1)), dim=2) # [b, c]
+ abs_err = abs_rel_sum / (num + 1e-10)
+ valid_pics = torch.sum(num > 0)
+ return torch.sum(abs_err), valid_pics
+
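+# Worked example for get_absrel_err above (a comment sketch; the other metric helpers below
+# follow the same [b, c, h, w] convention):
+#   pred   = torch.full((1, 1, 2, 2), 2.5)
+#   target = torch.full((1, 1, 2, 2), 2.0)
+#   mask   = torch.ones((1, 1, 2, 2))
+#   get_absrel_err(pred, target, mask)
+#   # -> (tensor(0.25), tensor(1)): |2.0 - 2.5| / 2.0 = 0.25, averaged over the 4 valid pixels,
+#   #    summed over the batch, plus the count of images that contain valid pixels.
+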
+def get_sqrel_err(pred: torch.tensor,
+ target: torch.tensor,
+ mask: torch.tensor,
+ ):
+ """
+ Computes squared relative error.
+    Takes preprocessed depths (no nans, infs and non-positive values).
+    pred, target, and mask should be in the shape of [b, c, h, w].
+    """
+
+    assert len(pred.shape) == 4 and len(target.shape) == 4
+ b, c, h, w = pred.shape
+ mask = mask.to(torch.float)
+ t_m = target * mask
+ p_m = pred * mask
+
+ # squared Relative Error
+ sq_rel = torch.abs(t_m - p_m) ** 2 / (t_m + 1e-10) # compute errors
+ sq_rel_sum = torch.sum(sq_rel.reshape((b, c, -1)), dim=2) # [b, c]
+ num = torch.sum(mask.reshape((b, c, -1)), dim=2) # [b, c]
+ sqrel_err = sq_rel_sum / (num + 1e-10)
+ valid_pics = torch.sum(num > 0)
+ return torch.sum(sqrel_err), valid_pics
+
+def get_log10_err(pred: torch.tensor,
+ target: torch.tensor,
+ mask: torch.tensor,
+ ):
+ """
+ Computes log10 error.
+    Takes preprocessed depths (no nans, infs and non-positive values).
+    pred, target, and mask should be in the shape of [b, c, h, w].
+    """
+
+    assert len(pred.shape) == 4 and len(target.shape) == 4
+ b, c, h, w = pred.shape
+ mask = mask.to(torch.float)
+ t_m = target * mask
+ p_m = pred * mask
+
+ diff_log = (torch.log10(p_m+1e-10) - torch.log10(t_m+1e-10)) * mask
+ log10_diff = torch.abs(diff_log)
+ log10_sum = torch.sum(log10_diff.reshape((b, c, -1)), dim=2) # [b, c]
+ num = torch.sum(mask.reshape((b, c, -1)), dim=2) # [b, c]
+ log10_err = log10_sum / (num + 1e-10)
+ valid_pics = torch.sum(num > 0)
+ return torch.sum(log10_err), valid_pics
+
+def get_rmse_err(pred: torch.tensor,
+ target: torch.tensor,
+ mask: torch.tensor,
+ ):
+ """
+    Computes the root mean squared error (RMSE).
+    Takes preprocessed depths (no nans, infs and non-positive values).
+    pred, target, and mask should be in the shape of [b, c, h, w].
+    """
+
+    assert len(pred.shape) == 4 and len(target.shape) == 4
+ b, c, h, w = pred.shape
+ mask = mask.to(torch.float)
+ t_m = target * mask
+ p_m = pred * mask
+
+ square = (t_m - p_m) ** 2
+ rmse_sum = torch.sum(square.reshape((b, c, -1)), dim=2) # [b, c]
+ num = torch.sum(mask.reshape((b, c, -1)), dim=2) # [b, c]
+ rmse = torch.sqrt(rmse_sum / (num + 1e-10))
+ valid_pics = torch.sum(num > 0)
+ return torch.sum(rmse), valid_pics
+
+def get_rmse_log_err(pred: torch.tensor,
+ target: torch.tensor,
+ mask: torch.tensor,
+ ):
+ """
+    Computes the RMSE in log space.
+    Takes preprocessed depths (no nans, infs and non-positive values).
+    pred, target, and mask should be in the shape of [b, c, h, w].
+    """
+
+    assert len(pred.shape) == 4 and len(target.shape) == 4
+ b, c, h, w = pred.shape
+ mask = mask.to(torch.float)
+ t_m = target * mask
+ p_m = pred * mask
+
+ diff_log = (torch.log10(p_m+1e-10) - torch.log10(t_m+1e-10)) * mask
+ square = diff_log ** 2
+ rmse_log_sum = torch.sum(square.reshape((b, c, -1)), dim=2) # [b, c]
+ num = torch.sum(mask.reshape((b, c, -1)), dim=2) # [b, c]
+ rmse_log = torch.sqrt(rmse_log_sum / (num + 1e-10))
+ valid_pics = torch.sum(num > 0)
+ return torch.sum(rmse_log), valid_pics
+
+def get_silog_err(pred: torch.tensor,
+ target: torch.tensor,
+ mask: torch.tensor,
+ ):
+ """
+    Computes the scale-invariant log (silog) error.
+    Takes preprocessed depths (no nans, infs and non-positive values).
+    pred, target, and mask should be in the shape of [b, c, h, w].
+    """
+
+    assert len(pred.shape) == 4 and len(target.shape) == 4
+ b, c, h, w = pred.shape
+ mask = mask.to(torch.float)
+ t_m = target * mask
+ p_m = pred * mask
+
+ diff_log = (torch.log10(p_m+1e-10) - torch.log10(t_m+1e-10)) * mask
+ diff_log_sum = torch.sum(diff_log.reshape((b, c, -1)), dim=2) # [b, c]
+ diff_log_square = diff_log ** 2
+ diff_log_square_sum = torch.sum(diff_log_square.reshape((b, c, -1)), dim=2) # [b, c]
+ num = torch.sum(mask.reshape((b, c, -1)), dim=2) # [b, c]
+ silog = torch.sqrt(diff_log_square_sum / (num + 1e-10) - (diff_log_sum / (num + 1e-10)) ** 2)
+ valid_pics = torch.sum(num > 0)
+ return torch.sum(silog), valid_pics
+
+def get_ratio_err(pred: torch.tensor,
+ target: torch.tensor,
+ mask: torch.tensor,
+ ):
+ """
+ Computes the percentage of pixels for which the ratio of the two depth maps is less than a given threshold.
+    Takes preprocessed depths (no nans, infs and non-positive values).
+    pred, target, and mask should be in the shape of [b, c, h, w].
+    """
+    assert len(pred.shape) == 4 and len(target.shape) == 4
+ b, c, h, w = pred.shape
+ mask = mask.to(torch.float)
+ t_m = target * mask
+ p_m = pred
+
+ gt_pred = t_m / (p_m + 1e-10)
+ pred_gt = p_m / (t_m + 1e-10)
+ gt_pred = gt_pred.reshape((b, c, -1))
+ pred_gt = pred_gt.reshape((b, c, -1))
+ gt_pred_gt = torch.cat((gt_pred, pred_gt), axis=1)
+ ratio_max = torch.amax(gt_pred_gt, axis=1)
+
+ delta_1_sum = torch.sum((ratio_max < 1.25), dim=1) # [b, ]
+ delta_2_sum = torch.sum((ratio_max < 1.25 ** 2), dim=1) # [b, ]
+ delta_3_sum = torch.sum((ratio_max < 1.25 ** 3), dim=1) # [b, ]
+ num = torch.sum(mask.reshape((b, -1)), dim=1) # [b, ]
+
+ delta_1 = delta_1_sum / (num + 1e-10)
+ delta_2 = delta_2_sum / (num + 1e-10)
+ delta_3 = delta_3_sum / (num + 1e-10)
+ valid_pics = torch.sum(num > 0)
+
+ return torch.sum(delta_1), torch.sum(delta_2), torch.sum(delta_3), valid_pics
+
+
+if __name__ == '__main__':
+ cfg = ['abs_rel', 'delta1']
+ dam = MetricAverageMeter(cfg)
+
+ pred_depth = np.random.random([2, 480, 640])
+ gt_depth = np.random.random([2, 480, 640]) - 0.5
+ intrinsic = [[100, 100, 200, 200], [200, 200, 300, 300]]
+
+ pred = torch.from_numpy(pred_depth).cuda()
+ gt = torch.from_numpy(gt_depth).cuda()
+
+ mask = gt > 0
+ dam.update_metrics_gpu(pred, gt, mask, False)
+ eval_error = dam.get_metrics()
+ print(eval_error)
+
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/comm.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/comm.py
new file mode 100644
index 0000000000000000000000000000000000000000..81a077807742446a1e387bf5ed4ebfdeedf16399
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/comm.py
@@ -0,0 +1,322 @@
+import importlib
+import torch
+import torch.distributed as dist
+from .avg_meter import AverageMeter
+from collections import defaultdict, OrderedDict
+import os
+import socket
+from custom_mmpkg.custom_mmcv.utils import collect_env as collect_base_env
+try:
+ from custom_mmpkg.custom_mmcv.utils import get_git_hash
+except ImportError:
+ from mmengine.utils import get_git_hash
+#import mono.mmseg as mmseg
+# import mmseg
+import time
+import datetime
+import logging
+
+
+def main_process() -> bool:
+ return get_rank() == 0
+ #return not cfg.distributed or \
+ # (cfg.distributed and cfg.local_rank == 0)
+
+def get_world_size() -> int:
+ if not dist.is_available():
+ return 1
+ if not dist.is_initialized():
+ return 1
+ return dist.get_world_size()
+
+def get_rank() -> int:
+ if not dist.is_available():
+ return 0
+ if not dist.is_initialized():
+ return 0
+ return dist.get_rank()
+
+def _find_free_port():
+ # refer to https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ # Binding to port 0 will cause the OS to find an available port for us
+ sock.bind(('', 0))
+ port = sock.getsockname()[1]
+ sock.close()
+ # NOTE: there is still a chance the port could be taken by other processes.
+ return port
+
+def _is_free_port(port):
+ ips = socket.gethostbyname_ex(socket.gethostname())[-1]
+ ips.append('localhost')
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ return all(s.connect_ex((ip, port)) != 0 for ip in ips)
+
+
+# def collect_env():
+# """Collect the information of the running environments."""
+# env_info = collect_base_env()
+# env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}'
+
+# return env_info
+
+def init_env(launcher, cfg):
+ """Initialize distributed training environment.
+ If argument ``cfg.dist_params.dist_url`` is specified as 'env://', then the master port will be system
+ environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
+ environment variable, then a default port ``29500`` will be used.
+ """
+ if launcher == 'slurm':
+ _init_dist_slurm(cfg)
+ elif launcher == 'ror':
+ _init_dist_ror(cfg)
+ elif launcher == 'None':
+ _init_none_dist(cfg)
+ else:
+        raise RuntimeError(f'Launcher {launcher} is not supported!')
+
+def _init_none_dist(cfg):
+ cfg.dist_params.num_gpus_per_node = 1
+ cfg.dist_params.world_size = 1
+ cfg.dist_params.nnodes = 1
+ cfg.dist_params.node_rank = 0
+ cfg.dist_params.global_rank = 0
+ cfg.dist_params.local_rank = 0
+ os.environ["WORLD_SIZE"] = str(1)
+
+def _init_dist_ror(cfg):
+ from ac2.ror.comm import get_local_rank, get_world_rank, get_local_size, get_node_rank, get_world_size
+ cfg.dist_params.num_gpus_per_node = get_local_size()
+ cfg.dist_params.world_size = get_world_size()
+ cfg.dist_params.nnodes = (get_world_size()) // (get_local_size())
+ cfg.dist_params.node_rank = get_node_rank()
+ cfg.dist_params.global_rank = get_world_rank()
+ cfg.dist_params.local_rank = get_local_rank()
+ os.environ["WORLD_SIZE"] = str(get_world_size())
+
+
+def _init_dist_slurm(cfg):
+ if 'NNODES' not in os.environ:
+ os.environ['NNODES'] = str(cfg.dist_params.nnodes)
+ if 'NODE_RANK' not in os.environ:
+ os.environ['NODE_RANK'] = str(cfg.dist_params.node_rank)
+
+ #cfg.dist_params.
+ num_gpus = torch.cuda.device_count()
+ world_size = int(os.environ['NNODES']) * num_gpus
+ os.environ['WORLD_SIZE'] = str(world_size)
+
+ # config port
+ if 'MASTER_PORT' in os.environ:
+ master_port = str(os.environ['MASTER_PORT']) # use MASTER_PORT in the environment variable
+ else:
+ # if torch.distributed default port(29500) is available
+ # then use it, else find a free port
+ if _is_free_port(16500):
+ master_port = '16500'
+ else:
+ master_port = str(_find_free_port())
+ os.environ['MASTER_PORT'] = master_port
+
+ # config addr
+ if 'MASTER_ADDR' in os.environ:
+        master_addr = str(os.environ['MASTER_ADDR'])  # use MASTER_ADDR from the environment variable
+ # elif cfg.dist_params.dist_url is not None:
+ # master_addr = ':'.join(str(cfg.dist_params.dist_url).split(':')[:2])
+ else:
+ master_addr = '127.0.0.1' #'tcp://127.0.0.1'
+ os.environ['MASTER_ADDR'] = master_addr
+
+ # set dist_url to 'env://'
+ cfg.dist_params.dist_url = 'env://' #f"{master_addr}:{master_port}"
+
+ cfg.dist_params.num_gpus_per_node = num_gpus
+ cfg.dist_params.world_size = world_size
+ cfg.dist_params.nnodes = int(os.environ['NNODES'])
+ cfg.dist_params.node_rank = int(os.environ['NODE_RANK'])
+
+ # if int(os.environ['NNODES']) > 1 and cfg.dist_params.dist_url.startswith("file://"):
+ # raise Warning("file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://")
+
+
+def get_func(func_name):
+ """
+ Helper to return a function object by name. func_name must identify
+ a function in this module or the path to a function relative to the base
+ module.
+ @ func_name: function name.
+ """
+ if func_name == '':
+ return None
+ try:
+ parts = func_name.split('.')
+ # Refers to a function in this module
+ if len(parts) == 1:
+ return globals()[parts[0]]
+ # Otherwise, assume we're referencing a module under modeling
+ module_name = '.'.join(parts[:-1])
+ module = importlib.import_module(module_name)
+ return getattr(module, parts[-1])
+ except:
+ raise RuntimeError(f'Failed to find function: {func_name}')
+
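+# Illustrative example of resolving a dotted path with get_func (comment sketch):
+#   get_func('os.path.join') is os.path.join   # imports 'os.path' and returns its 'join' attribute
+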
+class Timer(object):
+ """A simple timer."""
+
+ def __init__(self):
+ self.reset()
+
+ def tic(self):
+        # using time.time instead of time.clock because time.clock
+        # does not normalize for multithreading
+ self.start_time = time.time()
+
+ def toc(self, average=True):
+ self.diff = time.time() - self.start_time
+ self.total_time += self.diff
+ self.calls += 1
+ self.average_time = self.total_time / self.calls
+ if average:
+ return self.average_time
+ else:
+ return self.diff
+
+ def reset(self):
+ self.total_time = 0.
+ self.calls = 0
+ self.start_time = 0.
+ self.diff = 0.
+ self.average_time = 0.
+
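+# Illustrative usage of Timer (comment sketch):
+#   timer = Timer()
+#   timer.tic()
+#   ...                        # timed section
+#   timer.toc(average=False)   # seconds for this call; toc() alone returns the running average
+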
+class TrainingStats(object):
+ """Track vital training statistics."""
+ def __init__(self, log_period, tensorboard_logger=None):
+ self.log_period = log_period
+ self.tblogger = tensorboard_logger
+ self.tb_ignored_keys = ['iter', 'eta', 'epoch', 'time']
+ self.iter_timer = Timer()
+ # Window size for smoothing tracked values (with median filtering)
+ self.filter_size = log_period
+ def create_smoothed_value():
+ return AverageMeter()
+ self.smoothed_losses = defaultdict(create_smoothed_value)
+ #self.smoothed_metrics = defaultdict(create_smoothed_value)
+ #self.smoothed_total_loss = AverageMeter()
+
+
+ def IterTic(self):
+ self.iter_timer.tic()
+
+ def IterToc(self):
+ return self.iter_timer.toc(average=False)
+
+ def reset_iter_time(self):
+ self.iter_timer.reset()
+
+ def update_iter_stats(self, losses_dict):
+ """Update tracked iteration statistics."""
+ for k, v in losses_dict.items():
+ self.smoothed_losses[k].update(float(v), 1)
+
+ def log_iter_stats(self, cur_iter, optimizer, max_iters, val_err={}):
+ """Log the tracked statistics."""
+ if (cur_iter % self.log_period == 0):
+ stats = self.get_stats(cur_iter, optimizer, max_iters, val_err)
+ log_stats(stats)
+ if self.tblogger:
+ self.tb_log_stats(stats, cur_iter)
+ for k, v in self.smoothed_losses.items():
+ v.reset()
+
+ def tb_log_stats(self, stats, cur_iter):
+ """Log the tracked statistics to tensorboard"""
+ for k in stats:
+ # ignore some logs
+ if k not in self.tb_ignored_keys:
+ v = stats[k]
+ if isinstance(v, dict):
+ self.tb_log_stats(v, cur_iter)
+ else:
+ self.tblogger.add_scalar(k, v, cur_iter)
+
+
+ def get_stats(self, cur_iter, optimizer, max_iters, val_err = {}):
+ eta_seconds = self.iter_timer.average_time * (max_iters - cur_iter)
+
+ eta = str(datetime.timedelta(seconds=int(eta_seconds)))
+ stats = OrderedDict(
+ iter=cur_iter, # 1-indexed
+ time=self.iter_timer.average_time,
+ eta=eta,
+ )
+ optimizer_state_dict = optimizer.state_dict()
+ lr = {}
+ for i in range(len(optimizer_state_dict['param_groups'])):
+ lr_name = 'group%d_lr' % i
+ lr[lr_name] = optimizer_state_dict['param_groups'][i]['lr']
+
+ stats['lr'] = OrderedDict(lr)
+ for k, v in self.smoothed_losses.items():
+ stats[k] = v.avg
+
+ stats['val_err'] = OrderedDict(val_err)
+ stats['max_iters'] = max_iters
+ return stats
+
+
+def reduce_dict(input_dict, average=True):
+ """
+ Reduce the values in the dictionary from all processes so that process with rank
+ 0 has the reduced results.
+ Args:
+ @input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor.
+ @average (bool): whether to do average or sum
+ Returns:
+ a dict with the same keys as input_dict, after reduction.
+ """
+ world_size = get_world_size()
+ if world_size < 2:
+ return input_dict
+ with torch.no_grad():
+ names = []
+ values = []
+ # sort the keys so that they are consistent across processes
+ for k in sorted(input_dict.keys()):
+ names.append(k)
+ values.append(input_dict[k])
+ values = torch.stack(values, dim=0)
+ dist.reduce(values, dst=0)
+ if dist.get_rank() == 0 and average:
+ # only main process gets accumulated, so only divide by
+ # world_size in this case
+ values /= world_size
+ reduced_dict = {k: v for k, v in zip(names, values)}
+ return reduced_dict
+
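+# Illustrative usage of reduce_dict (a sketch that assumes torch.distributed is initialized and
+# the values are scalar CUDA tensors; with a single process the input dict is returned unchanged):
+#   losses = {'loss_depth': torch.tensor(0.7, device='cuda'),
+#             'loss_normal': torch.tensor(0.2, device='cuda')}
+#   reduced = reduce_dict(losses)   # rank 0 receives the values averaged over all ranks
+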
+
+def log_stats(stats):
+    """Log training statistics to terminal"""
+    logger = logging.getLogger()
+ lines = "[Step %d/%d]\n" % (
+ stats['iter'], stats['max_iters'])
+
+ lines += "\t\tloss: %.3f, time: %.6f, eta: %s\n" % (
+ stats['total_loss'], stats['time'], stats['eta'])
+
+ # log loss
+ lines += "\t\t"
+ for k, v in stats.items():
+ if 'loss' in k.lower() and 'total_loss' not in k.lower():
+ lines += "%s: %.3f" % (k, v) + ", "
+ lines = lines[:-3]
+ lines += '\n'
+
+ # validate criteria
+ lines += "\t\tlast val err:" + ", ".join("%s: %.6f" % (k, v) for k, v in stats['val_err'].items()) + ", "
+ lines += '\n'
+
+ # lr in different groups
+ lines += "\t\t" + ", ".join("%s: %.8f" % (k, v) for k, v in stats['lr'].items())
+ lines += '\n'
+    logger.info(lines[:-1])  # remove the last newline
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/custom_data.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/custom_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff860708abeef3d38c873ea3ae0b2c853336ec56
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/custom_data.py
@@ -0,0 +1,34 @@
+import glob
+import os
+import json
+import cv2
+
+def load_from_annos(anno_path):
+ with open(anno_path, 'r') as f:
+ annos = json.load(f)['files']
+
+ datas = []
+ for i, anno in enumerate(annos):
+ rgb = anno['rgb']
+ depth = anno['depth'] if 'depth' in anno else None
+ depth_scale = anno['depth_scale'] if 'depth_scale' in anno else 1.0
+ intrinsic = anno['cam_in'] if 'cam_in' in anno else None
+ normal = anno['normal'] if 'normal' in anno else None
+
+ data_i = {
+ 'rgb': rgb,
+ 'depth': depth,
+ 'depth_scale': depth_scale,
+ 'intrinsic': intrinsic,
+ 'filename': os.path.basename(rgb),
+ 'folder': rgb.split('/')[-3],
+ 'normal': normal
+ }
+ datas.append(data_i)
+ return datas
+
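+# The annotation file consumed by load_from_annos is expected to look roughly like the
+# following (an illustrative sketch inferred from the keys accessed above; 'depth',
+# 'depth_scale', 'cam_in' and 'normal' are optional):
+# {
+#   "files": [
+#     {"rgb": "data/scene/rgb/0000.jpg",
+#      "depth": "data/scene/depth/0000.png",
+#      "depth_scale": 256.0,
+#      "cam_in": [fx, fy, u0, v0]}
+#   ]
+# }
+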
+def load_data(path: str):
+ rgbs = glob.glob(path + '/*.jpg') + glob.glob(path + '/*.png')
+ #intrinsic = [835.8179931640625, 835.8179931640625, 961.5419921875, 566.8090209960938] #[721.53769, 721.53769, 609.5593, 172.854]
+ data = [{'rgb': i, 'depth': None, 'intrinsic': None, 'filename': os.path.basename(i), 'folder': i.split('/')[-3]} for i in rgbs]
+ return data
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/do_test.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/do_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b22c6aca0fbb60fd0c02efb3ecbfc2fcfdfd143
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/do_test.py
@@ -0,0 +1,380 @@
+import torch
+import torch.nn.functional as F
+import logging
+import os
+import os.path as osp
+from custom_controlnet_aux.metric3d.mono.utils.avg_meter import MetricAverageMeter
+from custom_controlnet_aux.metric3d.mono.utils.visualization import save_val_imgs, create_html, save_raw_imgs, save_normal_val_imgs
+import cv2
+from tqdm import tqdm
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+
+def to_cuda(data: dict):
+ for k, v in data.items():
+ if isinstance(v, torch.Tensor):
+ data[k] = v.cuda(non_blocking=True)
+ if isinstance(v, list) and len(v)>=1 and isinstance(v[0], torch.Tensor):
+ for i, l_i in enumerate(v):
+ data[k][i] = l_i.cuda(non_blocking=True)
+ return data
+
+def align_scale(pred: torch.tensor, target: torch.tensor):
+ mask = target > 0
+ if torch.sum(mask) > 10:
+ scale = torch.median(target[mask]) / (torch.median(pred[mask]) + 1e-8)
+ else:
+ scale = 1
+ pred_scaled = pred * scale
+ return pred_scaled, scale
+
+def align_scale_shift(pred: torch.tensor, target: torch.tensor):
+ mask = target > 0
+ target_mask = target[mask].cpu().numpy()
+ pred_mask = pred[mask].cpu().numpy()
+ if torch.sum(mask) > 10:
+ scale, shift = np.polyfit(pred_mask, target_mask, deg=1)
+ if scale < 0:
+ scale = torch.median(target[mask]) / (torch.median(pred[mask]) + 1e-8)
+ shift = 0
+ else:
+ scale = 1
+ shift = 0
+ pred = pred * scale + shift
+ return pred, scale
+
+def align_scale_shift_numpy(pred: np.array, target: np.array):
+ mask = target > 0
+ target_mask = target[mask]
+ pred_mask = pred[mask]
+ if np.sum(mask) > 10:
+ scale, shift = np.polyfit(pred_mask, target_mask, deg=1)
+ if scale < 0:
+ scale = np.median(target[mask]) / (np.median(pred[mask]) + 1e-8)
+ shift = 0
+ else:
+ scale = 1
+ shift = 0
+ pred = pred * scale + shift
+ return pred, scale
+
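+# Illustrative example of the median-based alignment above (comment sketch): if the valid
+# ground-truth depths have a median of 2.0 m and the prediction has a median of 4.0 m,
+# align_scale returns the prediction multiplied by scale = 2.0 / 4.0 = 0.5. With 10 or fewer
+# valid pixels the scale falls back to 1.
+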
+
+def build_camera_model(H : int, W : int, intrinsics : list) -> np.array:
+ """
+    Encode the camera intrinsic parameters (focal length and principal point) into a 4-channel map.
+ """
+ fx, fy, u0, v0 = intrinsics
+ f = (fx + fy) / 2.0
+    # principal point location
+ x_row = np.arange(0, W).astype(np.float32)
+ x_row_center_norm = (x_row - u0) / W
+ x_center = np.tile(x_row_center_norm, (H, 1)) # [H, W]
+
+ y_col = np.arange(0, H).astype(np.float32)
+ y_col_center_norm = (y_col - v0) / H
+ y_center = np.tile(y_col_center_norm, (W, 1)).T # [H, W]
+
+ # FoV
+ fov_x = np.arctan(x_center / (f / W))
+ fov_y = np.arctan(y_center / (f / H))
+
+ cam_model = np.stack([x_center, y_center, fov_x, fov_y], axis=2)
+ return cam_model
+
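+# Illustrative check of the camera encoding above (comment sketch):
+#   cam_model = build_camera_model(480, 640, [600.0, 600.0, 320.0, 240.0])
+#   cam_model.shape   # (480, 640, 4): normalized x/y offsets from the principal point plus per-pixel FoV maps
+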
+def resize_for_input(image, output_shape, intrinsic, canonical_shape, to_canonical_ratio):
+ """
+ Resize the input.
+    Resizing consists of two steps: 1) mapping to the canonical space (adjust the camera model); 2) resizing the image while the camera model holds. Thus the
+    label will be scaled with the resize factor.
+ """
+ padding = [123.675, 116.28, 103.53]
+ h, w, _ = image.shape
+ resize_ratio_h = output_shape[0] / canonical_shape[0]
+ resize_ratio_w = output_shape[1] / canonical_shape[1]
+ to_scale_ratio = min(resize_ratio_h, resize_ratio_w)
+
+ resize_ratio = to_canonical_ratio * to_scale_ratio
+
+ reshape_h = int(resize_ratio * h)
+ reshape_w = int(resize_ratio * w)
+
+ pad_h = max(output_shape[0] - reshape_h, 0)
+ pad_w = max(output_shape[1] - reshape_w, 0)
+ pad_h_half = int(pad_h / 2)
+ pad_w_half = int(pad_w / 2)
+
+ # resize
+ image = cv2.resize(image, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_LINEAR)
+ # padding
+ image = cv2.copyMakeBorder(
+ image,
+ pad_h_half,
+ pad_h - pad_h_half,
+ pad_w_half,
+ pad_w - pad_w_half,
+ cv2.BORDER_CONSTANT,
+ value=padding)
+
+    # Resize, adjust the principal point
+ intrinsic[2] = intrinsic[2] * to_scale_ratio
+ intrinsic[3] = intrinsic[3] * to_scale_ratio
+
+ cam_model = build_camera_model(reshape_h, reshape_w, intrinsic)
+ cam_model = cv2.copyMakeBorder(
+ cam_model,
+ pad_h_half,
+ pad_h - pad_h_half,
+ pad_w_half,
+ pad_w - pad_w_half,
+ cv2.BORDER_CONSTANT,
+ value=-1)
+
+ pad=[pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
+ label_scale_factor=1/to_scale_ratio
+ return image, cam_model, pad, label_scale_factor
+
+
+def get_prediction(
+ model: torch.nn.Module,
+ input: torch.tensor,
+ cam_model: torch.tensor,
+ pad_info: torch.tensor,
+ scale_info: torch.tensor,
+ gt_depth: torch.tensor,
+ normalize_scale: float,
+ ori_shape: list=[],
+):
+
+ data = dict(
+ input=input,
+ cam_model=cam_model,
+ )
+ pred_depth, confidence, output_dict = model.inference(data)
+
+ return pred_depth, confidence, output_dict
+
+def transform_test_data_scalecano(rgb, intrinsic, data_basic, device="cuda"):
+ """
+    Pre-process the input for forwarding. Employs the 'label scale canonical transformation'.
+ Args:
+ rgb: input rgb image. [H, W, 3]
+ intrinsic: camera intrinsic parameter, [fx, fy, u0, v0]
+ data_basic: predefined canonical space in configs.
+ """
+ canonical_space = data_basic['canonical_space']
+ forward_size = data_basic.crop_size
+ mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
+ std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
+
+ # BGR to RGB
+ #rgb = cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB)
+
+ ori_h, ori_w, _ = rgb.shape
+ ori_focal = (intrinsic[0] + intrinsic[1]) / 2
+ canonical_focal = canonical_space['focal_length']
+
+ cano_label_scale_ratio = canonical_focal / ori_focal
+
+ canonical_intrinsic = [
+ intrinsic[0] * cano_label_scale_ratio,
+ intrinsic[1] * cano_label_scale_ratio,
+ intrinsic[2],
+ intrinsic[3],
+ ]
+
+ # resize
+ rgb, cam_model, pad, resize_label_scale_ratio = resize_for_input(rgb, forward_size, canonical_intrinsic, [ori_h, ori_w], 1.0)
+
+ # label scale factor
+ label_scale_factor = cano_label_scale_ratio * resize_label_scale_ratio
+
+ rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
+ rgb = torch.div((rgb - mean), std)
+ rgb = rgb.to(device)
+
+ cam_model = torch.from_numpy(cam_model.transpose((2, 0, 1))).float()
+ cam_model = cam_model[None, :, :, :].to(device)
+ cam_model_stacks = [
+ torch.nn.functional.interpolate(cam_model, size=(cam_model.shape[2]//i, cam_model.shape[3]//i), mode='bilinear', align_corners=False)
+ for i in [2, 4, 8, 16, 32]
+ ]
+ return rgb, cam_model_stacks, pad, label_scale_factor
+
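+# Illustrative usage sketch (assumes cfg.data_basic provides 'canonical_space.focal_length'
+# and 'crop_size', as in the Metric3D configs; the image path is hypothetical):
+#   rgb = cv2.imread('demo.jpg')[:, :, ::-1].copy()                      # H x W x 3, RGB
+#   intrinsic = [1000.0, 1000.0, rgb.shape[1] / 2, rgb.shape[0] / 2]     # fx, fy, u0, v0
+#   rgb_t, cam_models, pad, scale = transform_test_data_scalecano(rgb, intrinsic, cfg.data_basic)
+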
+def do_scalecano_test_with_custom_data(
+ model: torch.nn.Module,
+ cfg: dict,
+ test_data: list,
+ logger: logging.RootLogger,
+ is_distributed: bool = True,
+ local_rank: int = 0,
+ bs: int = 2, # Batch size parameter
+):
+
+ show_dir = cfg.show_dir
+ save_interval = 1
+ save_imgs_dir = show_dir + '/vis'
+ os.makedirs(save_imgs_dir, exist_ok=True)
+ save_pcd_dir = show_dir + '/pcd'
+ os.makedirs(save_pcd_dir, exist_ok=True)
+
+ normalize_scale = cfg.data_basic.depth_range[1]
+ dam = MetricAverageMeter(['abs_rel', 'rmse', 'silog', 'delta1', 'delta2', 'delta3'])
+ dam_median = MetricAverageMeter(['abs_rel', 'rmse', 'silog', 'delta1', 'delta2', 'delta3'])
+ dam_global = MetricAverageMeter(['abs_rel', 'rmse', 'silog', 'delta1', 'delta2', 'delta3'])
+
+ # Process data in batches
+ for i in tqdm(range(0, len(test_data), bs)):
+ batch_data = test_data[i:i + bs] # Extract batch
+ rgb_inputs, pads, label_scale_factors, gt_depths, rgb_origins = [], [], [], [], []
+
+ for an in batch_data:
+ print(an['rgb'])
+ rgb_origin = cv2.imread(an['rgb'])[:, :, ::-1].copy()
+ rgb_origins.append(rgb_origin)
+ gt_depth = None
+ if an['depth'] is not None:
+ gt_depth = cv2.imread(an['depth'], -1)
+ gt_depth_scale = an['depth_scale']
+ gt_depth = gt_depth / gt_depth_scale
+ gt_depths.append(gt_depth)
+
+ intrinsic = an['intrinsic']
+ if intrinsic is None:
+ intrinsic = [1000.0, 1000.0, rgb_origin.shape[1]/2, rgb_origin.shape[0]/2]
+
+ rgb_input, _, pad, label_scale_factor = transform_test_data_scalecano(rgb_origin, intrinsic, cfg.data_basic)
+ rgb_inputs.append(rgb_input)
+ pads.append(pad)
+ label_scale_factors.append(label_scale_factor)
+
+ # Process the batch
+        pred_depths, _, outputs = get_prediction(
+ model=model,
+ input=torch.stack(rgb_inputs), # Stack inputs for batch processing
+ cam_model=None,
+ pad_info=pads,
+ scale_info=None,
+ gt_depth=None,
+ normalize_scale=None,
+ )
+
+ for j, gt_depth in enumerate(gt_depths):
+ normal_out = None
+ if 'normal_out_list' in outputs.keys():
+ normal_out = outputs['normal_out_list'][0][j, :]
+
+ postprocess_per_image(
+ i*bs+j,
+ pred_depths[j, :],
+ gt_depth,
+ intrinsic,
+ rgb_origins[j],
+ normal_out,
+ pads[j],
+ batch_data[j],
+ dam,
+ dam_median,
+ dam_global,
+ is_distributed,
+ save_imgs_dir,
+ save_pcd_dir,
+ normalize_scale,
+ label_scale_factors[j],
+ )
+
+ #if gt_depth_flag:
+ if False:
+ eval_error = dam.get_metrics()
+ print('w/o match :', eval_error)
+
+ eval_error_median = dam_median.get_metrics()
+ print('median match :', eval_error_median)
+
+ eval_error_global = dam_global.get_metrics()
+ print('global match :', eval_error_global)
+ else:
+ print('missing gt_depth, only save visualizations...')
+
+
+def postprocess_per_image(i, pred_depth, gt_depth, intrinsic, rgb_origin, normal_out, pad, an, dam, dam_median, dam_global, is_distributed, save_imgs_dir, save_pcd_dir, normalize_scale, scale_info):
+
+ pred_depth = pred_depth.squeeze()
+ pred_depth = pred_depth[pad[0] : pred_depth.shape[0] - pad[1], pad[2] : pred_depth.shape[1] - pad[3]]
+ pred_depth = torch.nn.functional.interpolate(pred_depth[None, None, :, :], [rgb_origin.shape[0], rgb_origin.shape[1]], mode='bilinear').squeeze() # to original size
+ pred_depth = pred_depth * normalize_scale / scale_info
+
+ pred_depth = (pred_depth > 0) * (pred_depth < 300) * pred_depth
+ if gt_depth is not None:
+
+ pred_depth = torch.nn.functional.interpolate(pred_depth[None, None, :, :], (gt_depth.shape[0], gt_depth.shape[1]), mode='bilinear').squeeze() # to original size
+
+ gt_depth = torch.from_numpy(gt_depth).cuda()
+
+ pred_depth_median = pred_depth * gt_depth[gt_depth != 0].median() / pred_depth[gt_depth != 0].median()
+ pred_global, _ = align_scale_shift(pred_depth, gt_depth)
+
+ mask = (gt_depth > 1e-8)
+ dam.update_metrics_gpu(pred_depth, gt_depth, mask, is_distributed)
+ dam_median.update_metrics_gpu(pred_depth_median, gt_depth, mask, is_distributed)
+ dam_global.update_metrics_gpu(pred_global, gt_depth, mask, is_distributed)
+ print(gt_depth[gt_depth != 0].median() / pred_depth[gt_depth != 0].median(), )
+
+ os.makedirs(osp.join(save_imgs_dir, an['folder']), exist_ok=True)
+ rgb_torch = torch.from_numpy(rgb_origin).to(pred_depth.device).permute(2, 0, 1)
+ mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None].to(rgb_torch.device)
+ std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None].to(rgb_torch.device)
+ rgb_torch = torch.div((rgb_torch - mean), std)
+
+ save_val_imgs(
+ i,
+ pred_depth,
+ gt_depth if gt_depth is not None else torch.ones_like(pred_depth, device=pred_depth.device),
+ rgb_torch,
+ osp.join(an['folder'], an['filename']),
+ save_imgs_dir,
+ )
+ #save_raw_imgs(pred_depth.detach().cpu().numpy(), rgb_torch, osp.join(an['folder'], an['filename']), save_imgs_dir, 1000.0)
+
+ # pcd
+ pred_depth = pred_depth.detach().cpu().numpy()
+ #pcd = reconstruct_pcd(pred_depth, intrinsic[0], intrinsic[1], intrinsic[2], intrinsic[3])
+ #os.makedirs(osp.join(save_pcd_dir, an['folder']), exist_ok=True)
+ #save_point_cloud(pcd.reshape((-1, 3)), rgb_origin.reshape(-1, 3), osp.join(save_pcd_dir, an['folder'], an['filename'][:-4]+'.ply'))
+
+    if an['intrinsic'] is None:
+ #for r in [0.9, 1.0, 1.1]:
+ for r in [1.0]:
+ #for f in [600, 800, 1000, 1250, 1500]:
+ for f in [1000]:
+ pcd = reconstruct_pcd(pred_depth, f * r, f * (2-r), intrinsic[2], intrinsic[3])
+ fstr = '_fx_' + str(int(f * r)) + '_fy_' + str(int(f * (2-r)))
+ os.makedirs(osp.join(save_pcd_dir, an['folder']), exist_ok=True)
+ save_point_cloud(pcd.reshape((-1, 3)), rgb_origin.reshape(-1, 3), osp.join(save_pcd_dir, an['folder'], an['filename'][:-4] + fstr +'.ply'))
+
+ if normal_out is not None:
+ pred_normal = normal_out[:3, :, :] # (3, H, W)
+ H, W = pred_normal.shape[1:]
+ pred_normal = pred_normal[ :, pad[0]:H-pad[1], pad[2]:W-pad[3]]
+
+ gt_normal = None
+ #if gt_normal_flag:
+ if False:
+ pred_normal = torch.nn.functional.interpolate(pred_normal, size=gt_normal.shape[2:], mode='bilinear', align_corners=True)
+ gt_normal = cv2.imread(norm_path)
+ gt_normal = cv2.cvtColor(gt_normal, cv2.COLOR_BGR2RGB)
+ gt_normal = np.array(gt_normal).astype(np.uint8)
+ gt_normal = ((gt_normal.astype(np.float32) / 255.0) * 2.0) - 1.0
+ norm_valid_mask = (np.linalg.norm(gt_normal, axis=2, keepdims=True) > 0.5)
+ gt_normal = gt_normal * norm_valid_mask
+ gt_normal_mask = ~torch.all(gt_normal == 0, dim=1, keepdim=True)
+ dam.update_normal_metrics_gpu(pred_normal, gt_normal, gt_normal_mask, cfg.distributed)# save valiad normal
+
+        save_normal_val_imgs(i,
+ pred_normal,
+ gt_normal if gt_normal is not None else torch.ones_like(pred_normal, device=pred_normal.device),
+ rgb_torch, # data['input'],
+ osp.join(an['folder'], 'normal_'+an['filename']),
+ save_imgs_dir,
+ )
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/logger.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..b00aadd47185aa968675662051137d6564c1e2e5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/logger.py
@@ -0,0 +1,102 @@
+import atexit
+import logging
+import os
+import sys
+import time
+import torch
+from termcolor import colored
+
+__all__ = ["setup_logger", ]
+
+class _ColorfulFormatter(logging.Formatter):
+ def __init__(self, *args, **kwargs):
+ self._root_name = kwargs.pop("root_name") + "."
+ self._abbrev_name = kwargs.pop("abbrev_name", "")
+ if len(self._abbrev_name):
+ self._abbrev_name = self._abbrev_name + "."
+ super(_ColorfulFormatter, self).__init__(*args, **kwargs)
+
+ def formatMessage(self, record):
+ record.name = record.name.replace(self._root_name, self._abbrev_name)
+ log = super(_ColorfulFormatter, self).formatMessage(record)
+ if record.levelno == logging.WARNING:
+ prefix = colored("WARNING", "red", attrs=["blink"])
+ elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
+ prefix = colored("ERROR", "red", attrs=["blink", "underline"])
+ else:
+ return log
+ return prefix + " " + log
+
+def setup_logger(
+ output=None, distributed_rank=0, *, name='metricdepth', color=True, abbrev_name=None
+):
+ """
+    Initialize the logger (adapted from detectron2) and set its verbosity level to "INFO".
+ Args:
+ output (str): a file name or a directory to save log. If None, will not save log file.
+ If ends with ".txt" or ".log", assumed to be a file name.
+ Otherwise, logs will be saved to `output/log.txt`.
+        abbrev_name (str): an abbreviation of the module, to avoid long names in logs.
+            Set to "" to not log the root module in logs.
+ By default, will abbreviate "detectron2" to "d2" and leave other
+ modules unchanged.
+ Returns:
+ logging.Logger: a logger
+ """
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO) # NOTE: if more detailed, change it to logging.DEBUG
+ logger.propagate = False
+
+ if abbrev_name is None:
+ abbrev_name = "d2"
+
+ plain_formatter = logging.Formatter(
+ "[%(asctime)s] %(name)s %(levelname)s %(message)s ", datefmt="%m/%d %H:%M:%S"
+ )
+ # stdout logging: master only
+ if distributed_rank == 0:
+ ch = logging.StreamHandler(stream=sys.stdout)
+ ch.setLevel(logging.INFO) # NOTE: if more detailed, change it to logging.DEBUG
+ if color:
+ formatter = _ColorfulFormatter(
+ colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s",
+ datefmt="%m/%d %H:%M:%S",
+ root_name=name,
+ abbrev_name=str(abbrev_name),
+ )
+ else:
+ formatter = plain_formatter
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+
+ # file logging: all workers
+ if output is not None:
+ if output.endswith(".txt") or output.endswith(".log"):
+ filename = output
+ else:
+ filename = os.path.join(output, "log.txt")
+ if distributed_rank > 0:
+ filename = filename + ".rank{}".format(distributed_rank)
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+ fh = logging.StreamHandler(_cached_log_stream(filename))
+ fh.setLevel(logging.INFO) # NOTE: if more detailed, change it to logging.DEBUG
+ fh.setFormatter(plain_formatter)
+ logger.addHandler(fh)
+
+
+ return logger
+
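+# Illustrative usage (comment sketch): log to stdout on rank 0 and to a file for every rank.
+#   logger = setup_logger('./show_dirs/demo/run.log', distributed_rank=0)
+#   logger.info('hello')
+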
+from iopath.common.file_io import PathManager as PathManagerBase
+
+
+PathManager = PathManagerBase()
+
+# cache the opened file object, so that different calls to 'setup_logger'
+# with the same file name can safely write to the same file.
+def _cached_log_stream(filename):
+    # use a 1K buffer if writing to cloud storage
+ io = PathManager.open(filename, "a", buffering=1024 if "://" in filename else -1)
+ atexit.register(io.close)
+ return io
+
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/mldb.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/mldb.py
new file mode 100644
index 0000000000000000000000000000000000000000..c018a8154a85bc5e83e595e400fcf3903ad293bf
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/mldb.py
@@ -0,0 +1,34 @@
+from types import ModuleType
+import data_info
+
+def load_data_info(module_name, data_info={}, mldb_type='mldb_info', module=None):
+ if module is None:
+ module = globals().get(module_name, None)
+ if module:
+ for key, value in module.__dict__.items():
+ if not (key.startswith('__')) and not (key.startswith('_')):
+ if key == 'mldb_info':
+ data_info.update(value)
+ elif isinstance(value, ModuleType):
+ load_data_info(module_name + '.' + key, data_info, module=value)
+ else:
+ raise RuntimeError(f'Try to access "mldb_info", but cannot find {module_name} module.')
+
+def reset_ckpt_path(cfg, data_info):
+ if isinstance(cfg, dict):
+ for key in cfg.keys():
+ if key == 'backbone':
+ new_ckpt_path = data_info['checkpoint']['mldb_root'] + '/' + data_info['checkpoint'][cfg.backbone.type]
+ cfg.backbone.update(checkpoint=new_ckpt_path)
+ continue
+ elif isinstance(cfg.get(key), dict):
+ reset_ckpt_path(cfg.get(key), data_info)
+ else:
+ continue
+ else:
+ return
+
+if __name__ == '__main__':
+ mldb_info_tmp = {}
+ load_data_info('mldb_data_info', mldb_info_tmp)
+ print('results', mldb_info_tmp.keys())
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/pcd_filter.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/pcd_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f411433701bc7e615b19486687964ea8600fa9e7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/pcd_filter.py
@@ -0,0 +1,24 @@
+import open3d as o3d
+import numpy as np
+
+def downsample_and_filter(pcd_file, max_bound_div=750, neighbor_num=8):
+    pcd = o3d.io.read_point_cloud(pcd_file)
+ point_num = len(pcd.points)
+ if (point_num > 10000000):
+ voxel_down_pcd = o3d.geometry.PointCloud.uniform_down_sample(pcd, int(point_num / 10000000)+1)
+ else:
+ voxel_down_pcd = pcd
+ max_bound = voxel_down_pcd.get_max_bound()
+ ball_radius = np.linalg.norm(max_bound) / max_bound_div
+ pcd_filter, _ = voxel_down_pcd.remove_radius_outlier(neighbor_num, ball_radius)
+ print('filtered size', len(pcd_filter.points), 'pre size:', len(pcd.points))
+ o3d.io.write_point_cloud(pcd_file[:-4] + '_filtered.ply', pcd_filter)
+
+
+if __name__ == "__main__":
+ import os
+ dir_path = './data/demo_pcd'
+ for pcd_file in os.listdir(dir_path):
+        #if 'jonathan' in pcd_file: set max_bound_div to 300 and neighbor_num to 8
+ downsample_and_filter(os.path.join(dir_path, pcd_file))
+
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/running.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/running.py
new file mode 100644
index 0000000000000000000000000000000000000000..00f692aebbfd5d87e1da99e7303f6958e790ed2b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/running.py
@@ -0,0 +1,77 @@
+import os
+import torch
+import torch.nn as nn
+from custom_controlnet_aux.metric3d.mono.utils.comm import main_process
+import copy
+import inspect
+import logging
+import glob
+
+
+def load_ckpt(load_path, model, optimizer=None, scheduler=None, strict_match=True, loss_scaler=None):
+ """
+ Load the check point for resuming training or finetuning.
+ """
+ logger = logging.getLogger()
+ if os.path.isfile(load_path):
+ if main_process():
+ logger.info(f"Loading weight '{load_path}'")
+ checkpoint = torch.load(load_path, map_location="cpu", weights_only=True)
+ ckpt_state_dict = checkpoint['model_state_dict']
+ model.load_state_dict(ckpt_state_dict, strict=strict_match)
+
+ if optimizer is not None:
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ if scheduler is not None:
+ scheduler.load_state_dict(checkpoint['scheduler'])
+ if loss_scaler is not None and 'scaler' in checkpoint:
+            loss_scaler.load_state_dict(checkpoint['scaler'])
+ del ckpt_state_dict
+ del checkpoint
+ if main_process():
+ logger.info(f"Successfully loaded weight: '{load_path}'")
+ if scheduler is not None and optimizer is not None:
+ logger.info(f"Resume training from: '{load_path}'")
+ else:
+ if main_process():
+ raise RuntimeError(f"No weight found at '{load_path}'")
+ return model, optimizer, scheduler, loss_scaler
+
+
+def save_ckpt(cfg, model, optimizer, scheduler, curr_iter=0, curr_epoch=None, loss_scaler=None):
+ """
+ Save the model, optimizer, lr scheduler.
+ """
+ logger = logging.getLogger()
+
+ if 'IterBasedRunner' in cfg.runner.type:
+ max_iters = cfg.runner.max_iters
+ elif 'EpochBasedRunner' in cfg.runner.type:
+ max_iters = cfg.runner.max_epochs
+ else:
+ raise TypeError(f'{cfg.runner.type} is not supported')
+
+ ckpt = dict(
+ model_state_dict=model.module.state_dict(),
+ optimizer=optimizer.state_dict(),
+ max_iter=cfg.runner.max_iters if 'max_iters' in cfg.runner \
+ else cfg.runner.max_epochs,
+ scheduler=scheduler.state_dict(),
+ )
+
+ if loss_scaler is not None:
+ ckpt.update(dict(scaler=loss_scaler.state_dict()))
+
+ ckpt_dir = os.path.join(cfg.work_dir, 'ckpt')
+ os.makedirs(ckpt_dir, exist_ok=True)
+
+ save_name = os.path.join(ckpt_dir, 'step%08d.pth' %curr_iter)
+ saved_ckpts = glob.glob(ckpt_dir + '/step*.pth')
+ torch.save(ckpt, save_name)
+
+    # keep at most the 20 most recent ckpts
+ if len(saved_ckpts) > 20:
+ saved_ckpts.sort()
+ os.remove(saved_ckpts.pop(0))
+
+ logger.info(f'Save model: {save_name}')
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/transform.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..41972b72258cd392b5a4343298f4a88c1fdf7c47
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/transform.py
@@ -0,0 +1,408 @@
+import collections
+import cv2
+import math
+import numpy as np
+import numbers
+import random
+import torch
+
+import matplotlib
+import matplotlib.cm
+
+
+"""
+Provides a set of Pytorch transforms that use OpenCV instead of PIL (Pytorch default)
+for image manipulation.
+"""
+
+class Compose(object):
+ # Composes transforms: transforms.Compose([transforms.RandScale([0.5, 2.0]), transforms.ToTensor()])
+ def __init__(self, transforms):
+ self.transforms = transforms
+
+ def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
+ for t in self.transforms:
+ images, labels, intrinsics, cam_models, other_labels, transform_paras = t(images, labels, intrinsics, cam_models, other_labels, transform_paras)
+ return images, labels, intrinsics, cam_models, other_labels, transform_paras
+
+
+class ToTensor(object):
+ # Converts numpy.ndarray (H x W x C) to a torch.FloatTensor of shape (C x H x W).
+ def __init__(self, **kwargs):
+ return
+ def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
+ if not isinstance(images, list) or not isinstance(labels, list) or not isinstance(intrinsics, list):
+ raise (RuntimeError("transform.ToTensor() only handle inputs/labels/intrinsics lists."))
+ if len(images) != len(intrinsics):
+ raise (RuntimeError("Numbers of images and intrinsics are not matched."))
+ if not isinstance(images[0], np.ndarray) or not isinstance(labels[0], np.ndarray):
+ raise (RuntimeError("transform.ToTensor() only handle np.ndarray for the input and label."
+                                "[e.g. data read by cv2.imread()].\n"))
+ if not isinstance(intrinsics[0], list):
+ raise (RuntimeError("transform.ToTensor() only handle list for the camera intrinsics"))
+
+ if len(images[0].shape) > 3 or len(images[0].shape) < 2:
+ raise (RuntimeError("transform.ToTensor() only handle image(np.ndarray) with 3 dims or 2 dims.\n"))
+ if len(labels[0].shape) > 3 or len(labels[0].shape) < 2:
+ raise (RuntimeError("transform.ToTensor() only handle label(np.ndarray) with 3 dims or 2 dims.\n"))
+
+ if len(intrinsics[0]) >4 or len(intrinsics[0]) < 3:
+ raise (RuntimeError("transform.ToTensor() only handle intrinsic(list) with 3 sizes or 4 sizes.\n"))
+
+ for i, img in enumerate(images):
+ if len(img.shape) == 2:
+ img = np.expand_dims(img, axis=2)
+ images[i] = torch.from_numpy(img.transpose((2, 0, 1))).float()
+ for i, lab in enumerate(labels):
+ if len(lab.shape) == 2:
+ lab = np.expand_dims(lab, axis=0)
+ labels[i] = torch.from_numpy(lab).float()
+ for i, intrinsic in enumerate(intrinsics):
+ if len(intrinsic) == 3:
+ intrinsic = [intrinsic[0],] + intrinsic
+ intrinsics[i] = torch.tensor(intrinsic, dtype=torch.float)
+ if cam_models is not None:
+ for i, cam_model in enumerate(cam_models):
+ cam_models[i] = torch.from_numpy(cam_model.transpose((2, 0, 1))).float() if cam_model is not None else None
+ if other_labels is not None:
+ for i, lab in enumerate(other_labels):
+ if len(lab.shape) == 2:
+ lab = np.expand_dims(lab, axis=0)
+ other_labels[i] = torch.from_numpy(lab).float()
+ return images, labels, intrinsics, cam_models, other_labels, transform_paras
+
+
+class Normalize(object):
+ # Normalize tensor with mean and standard deviation along channel: channel = (channel - mean) / std
+ def __init__(self, mean, std=None, **kwargs):
+ if std is None:
+ assert len(mean) > 0
+ else:
+ assert len(mean) == len(std)
+ self.mean = torch.tensor(mean).float()[:, None, None]
+ self.std = torch.tensor(std).float()[:, None, None] if std is not None \
+ else torch.tensor([1.0, 1.0, 1.0]).float()[:, None, None]
+
+ def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
+ # if self.std is None:
+ # # for t, m in zip(image, self.mean):
+ # # t.sub(m)
+ # image = image - self.mean
+ # if ref_images is not None:
+ # for i, ref_i in enumerate(ref_images):
+ # ref_images[i] = ref_i - self.mean
+ # else:
+ # # for t, m, s in zip(image, self.mean, self.std):
+ # # t.sub(m).div(s)
+ # image = (image - self.mean) / self.std
+ # if ref_images is not None:
+ # for i, ref_i in enumerate(ref_images):
+ # ref_images[i] = (ref_i - self.mean) / self.std
+ for i, img in enumerate(images):
+ img = torch.div((img - self.mean), self.std)
+ images[i] = img
+ return images, labels, intrinsics, cam_models, other_labels, transform_paras
+
+
+class LableScaleCanonical(object):
+ """
+    To resolve the ambiguity of monocular observations, i.e. different focal lengths (object sizes) giving the same depth, cameras are
+    mapped to a canonical space. To mimic this, we set the focal length to a canonical one and scale the depth value accordingly. NOTE: resizing the image by the same ratio can also resolve the ambiguity.
+ Args:
+ images: list of RGB images.
+ labels: list of depth/disparity labels.
+ other labels: other labels, such as instance segmentations, semantic segmentations...
+ """
+ def __init__(self, **kwargs):
+ self.canonical_focal = kwargs['focal_length']
+
+ def _get_scale_ratio(self, intrinsic):
+ target_focal_x = intrinsic[0]
+ label_scale_ratio = self.canonical_focal / target_focal_x
+ pose_scale_ratio = 1.0
+ return label_scale_ratio, pose_scale_ratio
+
+ def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
+ assert len(images[0].shape) == 3 and len(labels[0].shape) == 2
+ assert labels[0].dtype == np.float32
+
+ label_scale_ratio = None
+ pose_scale_ratio = None
+
+ for i in range(len(intrinsics)):
+ img_i = images[i]
+ label_i = labels[i] if i < len(labels) else None
+ intrinsic_i = intrinsics[i].copy()
+ cam_model_i = cam_models[i] if cam_models is not None and i < len(cam_models) else None
+
+ label_scale_ratio, pose_scale_ratio = self._get_scale_ratio(intrinsic_i)
+
+ # adjust the focal length, map the current camera to the canonical space
+ intrinsics[i] = [intrinsic_i[0] * label_scale_ratio, intrinsic_i[1] * label_scale_ratio, intrinsic_i[2], intrinsic_i[3]]
+
+ # scale the label to the canonical space
+ if label_i is not None:
+ labels[i] = label_i * label_scale_ratio
+
+ if cam_model_i is not None:
+ # As the focal length is adjusted (canonical focal length), the camera model should be re-built
+ ori_h, ori_w, _ = img_i.shape
+ cam_models[i] = build_camera_model(ori_h, ori_w, intrinsics[i])
+
+
+ if transform_paras is not None:
+ transform_paras.update(label_scale_factor=label_scale_ratio, focal_scale_factor=label_scale_ratio)
+
+ return images, labels, intrinsics, cam_models, other_labels, transform_paras
+
+
+class ResizeKeepRatio(object):
+ """
+ Resize and pad to a given size. Hold the aspect ratio.
+ This resizing assumes that the camera model remains unchanged.
+ Args:
+ resize_size: predefined output size.
+ """
+ def __init__(self, resize_size, padding=None, ignore_label=-1, **kwargs):
+ if isinstance(resize_size, int):
+ self.resize_h = resize_size
+ self.resize_w = resize_size
+ elif isinstance(resize_size, collections.abc.Iterable) and len(resize_size) == 2 \
+ and isinstance(resize_size[0], int) and isinstance(resize_size[1], int) \
+ and resize_size[0] > 0 and resize_size[1] > 0:
+ self.resize_h = resize_size[0]
+ self.resize_w = resize_size[1]
+ else:
+ raise (RuntimeError("crop size error.\n"))
+ if padding is None:
+ self.padding = padding
+ elif isinstance(padding, list):
+ if all(isinstance(i, numbers.Number) for i in padding):
+ self.padding = padding
+ else:
+ raise (RuntimeError("padding in Crop() should be a number list\n"))
+ if len(padding) != 3:
+ raise (RuntimeError("padding channel is not equal with 3\n"))
+ else:
+ raise (RuntimeError("padding in Crop() should be a number list\n"))
+ if isinstance(ignore_label, int):
+ self.ignore_label = ignore_label
+ else:
+ raise (RuntimeError("ignore_label should be an integer number\n"))
+ # self.crop_size = kwargs['crop_size']
+ self.canonical_focal = kwargs['focal_length']
+
+ def main_data_transform(self, image, label, intrinsic, cam_model, resize_ratio, padding, to_scale_ratio):
+ """
+ Resize data first and then do the padding.
+ 'label' will be scaled.
+ """
+ h, w, _ = image.shape
+ reshape_h = int(resize_ratio * h)
+ reshape_w = int(resize_ratio * w)
+
+ pad_h, pad_w, pad_h_half, pad_w_half = padding
+
+ # resize
+ image = cv2.resize(image, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_LINEAR)
+ # padding
+ image = cv2.copyMakeBorder(
+ image,
+ pad_h_half,
+ pad_h - pad_h_half,
+ pad_w_half,
+ pad_w - pad_w_half,
+ cv2.BORDER_CONSTANT,
+ value=self.padding)
+
+ if label is not None:
+ # label = cv2.resize(label, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_NEAREST)
+ label = resize_depth_preserve(label, (reshape_h, reshape_w))
+ label = cv2.copyMakeBorder(
+ label,
+ pad_h_half,
+ pad_h - pad_h_half,
+ pad_w_half,
+ pad_w - pad_w_half,
+ cv2.BORDER_CONSTANT,
+ value=self.ignore_label)
+ # scale the label
+ label = label / to_scale_ratio
+
+ # Resize, adjust principal point
+ if intrinsic is not None:
+ intrinsic[0] = intrinsic[0] * resize_ratio / to_scale_ratio
+ intrinsic[1] = intrinsic[1] * resize_ratio / to_scale_ratio
+ intrinsic[2] = intrinsic[2] * resize_ratio
+ intrinsic[3] = intrinsic[3] * resize_ratio
+
+ if cam_model is not None:
+ #cam_model = cv2.resize(cam_model, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_LINEAR)
+ cam_model = build_camera_model(reshape_h, reshape_w, intrinsic)
+ cam_model = cv2.copyMakeBorder(
+ cam_model,
+ pad_h_half,
+ pad_h - pad_h_half,
+ pad_w_half,
+ pad_w - pad_w_half,
+ cv2.BORDER_CONSTANT,
+ value=self.ignore_label)
+
+ # Pad, adjust the principal point
+ if intrinsic is not None:
+ intrinsic[2] = intrinsic[2] + pad_w_half
+ intrinsic[3] = intrinsic[3] + pad_h_half
+ return image, label, intrinsic, cam_model
+
+ def get_label_scale_factor(self, image, intrinsic, resize_ratio):
+ ori_h, ori_w, _ = image.shape
+ # crop_h, crop_w = self.crop_size
+ ori_focal = intrinsic[0]
+
+ to_canonical_ratio = self.canonical_focal / ori_focal
+ to_scale_ratio = resize_ratio / to_canonical_ratio
+ return to_scale_ratio
+
+ def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
+ target_h, target_w, _ = images[0].shape
+ resize_ratio_h = self.resize_h / target_h
+ resize_ratio_w = self.resize_w / target_w
+ resize_ratio = min(resize_ratio_h, resize_ratio_w)
+ reshape_h = int(resize_ratio * target_h)
+ reshape_w = int(resize_ratio * target_w)
+ pad_h = max(self.resize_h - reshape_h, 0)
+ pad_w = max(self.resize_w - reshape_w, 0)
+ pad_h_half = int(pad_h / 2)
+ pad_w_half = int(pad_w / 2)
+
+ pad_info = [pad_h, pad_w, pad_h_half, pad_w_half]
+ to_scale_ratio = self.get_label_scale_factor(images[0], intrinsics[0], resize_ratio)
+
+ for i in range(len(images)):
+ img = images[i]
+ label = labels[i] if i < len(labels) else None
+ intrinsic = intrinsics[i] if i < len(intrinsics) else None
+ cam_model = cam_models[i] if cam_models is not None and i < len(cam_models) else None
+ img, label, intrinsic, cam_model = self.main_data_transform(
+ img, label, intrinsic, cam_model, resize_ratio, pad_info, to_scale_ratio)
+ images[i] = img
+ if label is not None:
+ labels[i] = label
+ if intrinsic is not None:
+ intrinsics[i] = intrinsic
+ if cam_model is not None:
+ cam_models[i] = cam_model
+
+ if other_labels is not None:
+
+ for i, other_lab in enumerate(other_labels):
+ # resize
+ other_lab = cv2.resize(other_lab, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_NEAREST)
+ # pad
+ other_labels[i] = cv2.copyMakeBorder(
+ other_lab,
+ pad_h_half,
+ pad_h - pad_h_half,
+ pad_w_half,
+ pad_w - pad_w_half,
+ cv2.BORDER_CONSTANT,
+ value=self.ignore_label)
+
+ pad = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
+ if transform_paras is not None:
+ pad_old = transform_paras['pad'] if 'pad' in transform_paras else [0,0,0,0]
+ new_pad = [pad_old[0] + pad[0], pad_old[1] + pad[1], pad_old[2] + pad[2], pad_old[3] + pad[3]]
+ transform_paras.update(dict(pad=new_pad))
+ if 'label_scale_factor' in transform_paras:
+ transform_paras['label_scale_factor'] = transform_paras['label_scale_factor'] * 1.0 / to_scale_ratio
+ else:
+ transform_paras.update(label_scale_factor=1.0/to_scale_ratio)
+ return images, labels, intrinsics, cam_models, other_labels, transform_paras
+
+
+class BGR2RGB(object):
+ # Converts image from BGR order to RGB order, for model initialized from Pytorch
+ def __init__(self, **kwargs):
+ return
+ def __call__(self, images, labels, intrinsics, cam_models=None,other_labels=None, transform_paras=None):
+ for i, img in enumerate(images):
+ images[i] = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ return images, labels, intrinsics, cam_models, other_labels, transform_paras
+
+
+def resize_depth_preserve(depth, shape):
+ """
+ Resizes depth map preserving all valid depth pixels
+ Multiple downsampled points can be assigned to the same pixel.
+
+ Parameters
+ ----------
+ depth : np.array [h,w]
+ Depth map
+ shape : tuple (H,W)
+ Output shape
+
+ Returns
+ -------
+ depth : np.array [H,W,1]
+ Resized depth map
+ """
+ # Store dimensions and reshapes to single column
+ depth = np.squeeze(depth)
+ h, w = depth.shape
+ x = depth.reshape(-1)
+ # Create coordinate grid
+ uv = np.mgrid[:h, :w].transpose(1, 2, 0).reshape(-1, 2)
+ # Filters valid points
+ idx = x > 0
+ crd, val = uv[idx], x[idx]
+ # Downsamples coordinates
+ crd[:, 0] = (crd[:, 0] * (shape[0] / h) + 0.5).astype(np.int32)
+ crd[:, 1] = (crd[:, 1] * (shape[1] / w) + 0.5).astype(np.int32)
+ # Filters points inside image
+ idx = (crd[:, 0] < shape[0]) & (crd[:, 1] < shape[1])
+ crd, val = crd[idx], val[idx]
+ # Creates downsampled depth image and assigns points
+ depth = np.zeros(shape)
+ depth[crd[:, 0], crd[:, 1]] = val
+ # Return resized depth map
+ return depth
+
+
+def build_camera_model(H : int, W : int, intrinsics : list) -> np.array:
+ """
+ Encode the camera intrinsic parameters (focal length and principal point) to a 4-channel map.
+ """
+ fx, fy, u0, v0 = intrinsics
+ f = (fx + fy) / 2.0
+ # principal point location
+ x_row = np.arange(0, W).astype(np.float32)
+ x_row_center_norm = (x_row - u0) / W
+ x_center = np.tile(x_row_center_norm, (H, 1)) # [H, W]
+
+ y_col = np.arange(0, H).astype(np.float32)
+ y_col_center_norm = (y_col - v0) / H
+ y_center = np.tile(y_col_center_norm, (W, 1)).T
+
+ # FoV
+ fov_x = np.arctan(x_center / (f / W))
+ fov_y = np.arctan(y_center/ (f / H))
+
+ cam_model = np.stack([x_center, y_center, fov_x, fov_y], axis=2)
+ return cam_model
+
+def gray_to_colormap(img, cmap='rainbow'):
+ """
+ Transfer gray map to matplotlib colormap
+ """
+ assert img.ndim == 2
+
+ img[img<0] = 0
+ mask_invalid = img < 1e-10
+ img = img / (img.max() + 1e-8)
+ norm = matplotlib.colors.Normalize(vmin=0, vmax=1.1)
+ cmap_m = matplotlib.cm.get_cmap(cmap)
+ mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap_m)
+ colormap = (mapper.to_rgba(img)[:, :, :3] * 255).astype(np.uint8)
+ colormap[mask_invalid] = 0
+ return colormap
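+
+
+if __name__ == "__main__":
+    # Minimal sanity-check sketch (added for illustration, not part of the upstream
+    # Metric3D transforms). The 480x640 resolution and the intrinsics
+    # [fx, fy, u0, v0] = [500, 500, 320, 240] are hypothetical values.
+    demo_depth = np.zeros((480, 640), dtype=np.float32)
+    demo_depth[100, 200] = 2.5  # a single valid depth sample
+    small = resize_depth_preserve(demo_depth, (240, 320))
+    print(small.shape, small.max())  # (240, 320) 2.5 -- the valid sample survives
+    cam_model = build_camera_model(480, 640, [500.0, 500.0, 320.0, 240.0])
+    print(cam_model.shape)  # (480, 640, 4): x_center, y_center, fov_x, fov_y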
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/unproj_pcd.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/unproj_pcd.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb5ba991497f5c6b56e4cdf29a1dfc8dbab2f81e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/unproj_pcd.py
@@ -0,0 +1,88 @@
+import numpy as np
+import torch
+from plyfile import PlyData, PlyElement
+import cv2
+
+
+def get_pcd_base(H, W, u0, v0, fx, fy):
+ x_row = np.arange(0, W)
+ x = np.tile(x_row, (H, 1))
+ x = x.astype(np.float32)
+ u_m_u0 = x - u0
+
+ y_col = np.arange(0, H) # y_col = np.arange(0, height)
+ y = np.tile(y_col, (W, 1)).T
+ y = y.astype(np.float32)
+ v_m_v0 = y - v0
+
+ x = u_m_u0 / fx
+ y = v_m_v0 / fy
+ z = np.ones_like(x)
+ pw = np.stack([x, y, z], axis=2) # [h, w, c]
+ return pw
+
+
+def reconstruct_pcd(depth, fx, fy, u0, v0, pcd_base=None, mask=None):
+ if isinstance(depth, torch.Tensor):
+ depth = depth.cpu().numpy().squeeze()
+ depth = cv2.medianBlur(depth, 5)
+ if pcd_base is None:
+ H, W = depth.shape
+ pcd_base = get_pcd_base(H, W, u0, v0, fx, fy)
+ pcd = depth[:, :, None] * pcd_base
+ if mask is not None:
+ pcd[mask] = 0
+ return pcd
+
+
+def save_point_cloud(pcd, rgb, filename, binary=True):
+ """Save an RGB point cloud as a PLY file.
+ :params
+ @pcd: Nx3 matrix, the XYZ coordinates
+ @rgb: Nx3 matrix, the RGB colors for each 3D point (or None for a uniform gray)
+ """
+ assert rgb is None or pcd.shape[0] == rgb.shape[0]
+
+ if rgb is None:
+ gray_concat = np.tile(np.array([128], dtype=np.uint8),
+ (pcd.shape[0], 3))
+ points_3d = np.hstack((pcd, gray_concat))
+ else:
+ points_3d = np.hstack((pcd, rgb))
+ python_types = (float, float, float, int, int, int)
+ npy_types = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'),
+ ('green', 'u1'), ('blue', 'u1')]
+ if binary is True:
+ # Format into Numpy structured array
+ vertices = []
+ for row_idx in range(points_3d.shape[0]):
+ cur_point = points_3d[row_idx]
+ vertices.append(
+ tuple(
+ dtype(point)
+ for dtype, point in zip(python_types, cur_point)))
+ vertices_array = np.array(vertices, dtype=npy_types)
+ el = PlyElement.describe(vertices_array, 'vertex')
+
+ # write
+ PlyData([el]).write(filename)
+ else:
+ x = np.squeeze(points_3d[:, 0])
+ y = np.squeeze(points_3d[:, 1])
+ z = np.squeeze(points_3d[:, 2])
+ r = np.squeeze(points_3d[:, 3])
+ g = np.squeeze(points_3d[:, 4])
+ b = np.squeeze(points_3d[:, 5])
+
+ ply_head = 'ply\n' \
+ 'format ascii 1.0\n' \
+ 'element vertex %d\n' \
+ 'property float x\n' \
+ 'property float y\n' \
+ 'property float z\n' \
+ 'property uchar red\n' \
+ 'property uchar green\n' \
+ 'property uchar blue\n' \
+ 'end_header' % r.shape[0]
+ # ---- Save ply data to disk
+ np.savetxt(filename, np.column_stack((x, y, z, r, g, b)), fmt='%f %f %f %d %d %d', header=ply_head, comments='')
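+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the upstream code): unproject a flat 2 m depth
+    # plane with hypothetical intrinsics fx = fy = 50, u0 = 32, v0 = 24, then dump it
+    # to a small PLY file with a uniform gray colour.
+    demo_depth = np.full((48, 64), 2.0, dtype=np.float32)
+    pcd = reconstruct_pcd(demo_depth, 50.0, 50.0, 32.0, 24.0)
+    print(pcd.shape)  # (48, 64, 3); the z channel equals the input depth
+    gray = np.full((48 * 64, 3), 128, dtype=np.uint8)
+    save_point_cloud(pcd.reshape(-1, 3), gray, "demo_plane.ply")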
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/visualization.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/visualization.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f8ef609192e9c04cf289e98bf64880030025de7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/metric3d/mono/utils/visualization.py
@@ -0,0 +1,139 @@
+import matplotlib.pyplot as plt
+import os, cv2
+import numpy as np
+from custom_controlnet_aux.metric3d.mono.utils.transform import gray_to_colormap
+import shutil
+import glob
+from custom_controlnet_aux.metric3d.mono.utils.running import main_process
+import torch
+
+def save_raw_imgs(
+ pred: torch.tensor,
+ rgb: torch.tensor,
+ filename: str,
+ save_dir: str,
+ scale: float=200.0,
+ target: torch.tensor=None,
+ ):
+ """
+ Save raw GT, predictions, RGB in the same file.
+ """
+ cv2.imwrite(os.path.join(save_dir, filename[:-4]+'_rgb.jpg'), rgb)
+ cv2.imwrite(os.path.join(save_dir, filename[:-4]+'_d.png'), (pred*scale).astype(np.uint16))
+ if target is not None:
+ cv2.imwrite(os.path.join(save_dir, filename[:-4]+'_gt.png'), (target*scale).astype(np.uint16))
+
+
+def save_val_imgs(
+ iter: int,
+ pred: torch.tensor,
+ target: torch.tensor,
+ rgb: torch.tensor,
+ filename: str,
+ save_dir: str,
+ tb_logger=None
+ ):
+ """
+ Save GT, predictions, RGB in the same file.
+ """
+ rgb, pred_scale, target_scale, pred_color, target_color = get_data_for_log(pred, target, rgb)
+ rgb = rgb.transpose((1, 2, 0))
+ cat_img = np.concatenate([rgb, pred_color, target_color], axis=0)
+ plt.imsave(os.path.join(save_dir, filename[:-4]+'_merge.jpg'), cat_img)
+
+ # save to tensorboard
+ if tb_logger is not None:
+ tb_logger.add_image(f'{filename[:-4]}_merge.jpg', cat_img.transpose((2, 0, 1)), iter)
+
+def save_normal_val_imgs(
+ iter: int,
+ pred: torch.tensor,
+ targ: torch.tensor,
+ rgb: torch.tensor,
+ filename: str,
+ save_dir: str,
+ tb_logger=None,
+ mask=None,
+ ):
+ """
+ Save GT, predictions, RGB in the same file.
+ """
+ mean = np.array([123.675, 116.28, 103.53])[np.newaxis, np.newaxis, :]
+ std= np.array([58.395, 57.12, 57.375])[np.newaxis, np.newaxis, :]
+ pred = pred.squeeze()
+ targ = targ.squeeze()
+ rgb = rgb.squeeze()
+
+ if pred.size(0) == 3:
+ pred = pred.permute(1,2,0)
+ if targ.size(0) == 3:
+ targ = targ.permute(1,2,0)
+ if rgb.size(0) == 3:
+ rgb = rgb.permute(1,2,0)
+
+ pred_color = vis_surface_normal(pred, mask)
+ targ_color = vis_surface_normal(targ, mask)
+ rgb_color = ((rgb.cpu().numpy() * std) + mean).astype(np.uint8)
+
+ try:
+ cat_img = np.concatenate([rgb_color, pred_color, targ_color], axis=0)
+ except:
+ pred_color = cv2.resize(pred_color, (rgb.shape[1], rgb.shape[0]))
+ targ_color = cv2.resize(targ_color, (rgb.shape[1], rgb.shape[0]))
+ cat_img = np.concatenate([rgb_color, pred_color, targ_color], axis=0)
+
+ plt.imsave(os.path.join(save_dir, filename[:-4]+'_merge.jpg'), cat_img)
+ # cv2.imwrite(os.path.join(save_dir, filename[:-4]+'.jpg'), pred_color)
+ # save to tensorboard
+ if tb_logger is not None:
+ tb_logger.add_image(f'{filename[:-4]}_merge.jpg', cat_img.transpose((2, 0, 1)), iter)
+
+def get_data_for_log(pred: torch.tensor, target: torch.tensor, rgb: torch.tensor):
+ mean = np.array([123.675, 116.28, 103.53])[:, np.newaxis, np.newaxis]
+ std= np.array([58.395, 57.12, 57.375])[:, np.newaxis, np.newaxis]
+
+ pred = pred.squeeze().cpu().numpy()
+ target = target.squeeze().cpu().numpy()
+ rgb = rgb.squeeze().cpu().numpy()
+
+ pred[pred<0] = 0
+ target[target<0] = 0
+ max_scale = max(pred.max(), target.max())
+ pred_scale = (pred/max_scale * 10000).astype(np.uint16)
+ target_scale = (target/max_scale * 10000).astype(np.uint16)
+ pred_color = gray_to_colormap(pred)
+ target_color = gray_to_colormap(target)
+ pred_color = cv2.resize(pred_color, (rgb.shape[2], rgb.shape[1]))
+ target_color = cv2.resize(target_color, (rgb.shape[2], rgb.shape[1]))
+
+ rgb = ((rgb * std) + mean).astype(np.uint8)
+ return rgb, pred_scale, target_scale, pred_color, target_color
+
+
+def create_html(name2path, save_path='index.html', size=(256, 384)):
+ # 'Col' and 'imagetable' are provided by the optional html4vision package (assumed
+ # dependency); imported lazily so the rest of this module works without it.
+ from html4vision import Col, imagetable
+ # table description
+ cols = []
+ for k, v in name2path.items():
+ col_i = Col('img', k, v) # specify image content for column
+ cols.append(col_i)
+ # html table generation
+ imagetable(cols, out_file=save_path, imsize=size)
+
+def vis_surface_normal(normal: torch.tensor, mask: torch.tensor=None) -> np.array:
+ """
+ Visualize surface normal. Transfer surface normal value from [-1, 1] to [0, 255]
+ Args:
+ normal (torch.tensor, [h, w, 3]): surface normal
+ mask (torch.tensor, [h, w]): valid masks
+ """
+ normal = normal.cpu().numpy().squeeze()
+ n_img_L2 = np.sqrt(np.sum(normal ** 2, axis=2, keepdims=True))
+ n_img_norm = normal / (n_img_L2 + 1e-8)
+ normal_vis = n_img_norm * 127
+ normal_vis += 128
+ normal_vis = normal_vis.astype(np.uint8)
+ if mask is not None:
+ mask = mask.cpu().numpy().squeeze()
+ normal_vis[~mask] = 0
+ return normal_vis
+
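+
+if __name__ == "__main__":
+    # Minimal sketch (illustration only): colour-code a random field of unit surface
+    # normals with vis_surface_normal; no checkpoints or image files are needed.
+    demo_normal = torch.nn.functional.normalize(torch.randn(64, 64, 3), dim=-1)
+    vis = vis_surface_normal(demo_normal)
+    print(vis.shape, vis.dtype)  # (64, 64, 3) uint8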
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..0365733785a449c285c6ac704ef443f385fe798c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e2093a203ca2cd128f0d8c8033402c978251043
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/__init__.py
@@ -0,0 +1,76 @@
+import os
+
+import cv2
+import numpy as np
+import torch
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME
+from .api import MiDaSInference
+
+
+class MidasDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, model_type="dpt_hybrid", filename="dpt_hybrid-midas-501f0c75.pt"):
+ subfolder = "annotator/ckpts" if pretrained_model_or_path == "lllyasviel/ControlNet" else ''
+ model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder=subfolder)
+ model = MiDaSInference(model_type=model_type, model_path=model_path)
+ return cls(model)
+
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1, depth_and_normal=False, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ image_depth = detected_map
+ with torch.no_grad():
+ image_depth = torch.from_numpy(image_depth).float()
+ image_depth = image_depth.to(self.device)
+ image_depth = image_depth / 127.5 - 1.0
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
+ depth = self.model(image_depth)[0]
+
+ depth_pt = depth.clone()
+ depth_pt -= torch.min(depth_pt)
+ depth_pt /= torch.max(depth_pt)
+ depth_pt = depth_pt.cpu().numpy()
+ depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
+
+ if depth_and_normal:
+ depth_np = depth.cpu().numpy()
+ x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3)
+ y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3)
+ z = np.ones_like(x) * a
+ x[depth_pt < bg_th] = 0
+ y[depth_pt < bg_th] = 0
+ normal = np.stack([x, y, z], axis=2)
+ normal /= np.sum(normal ** 2.0, axis=2, keepdims=True) ** 0.5
+ normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8)[:, :, ::-1]
+
+ depth_image = HWC3(depth_image)
+ if depth_and_normal:
+ normal_image = HWC3(normal_image)
+
+
+ depth_image = remove_pad(depth_image)
+ if depth_and_normal:
+ normal_image = remove_pad(normal_image)
+
+ if output_type == "pil":
+ depth_image = Image.fromarray(depth_image)
+ if depth_and_normal:
+ normal_image = Image.fromarray(normal_image)
+
+ if depth_and_normal:
+ return depth_image, normal_image
+ else:
+ return depth_image
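+
+
+if __name__ == "__main__":
+    # Usage sketch (illustration only). "input.png" is a hypothetical image path and
+    # the first call downloads the dpt_hybrid checkpoint from the Hugging Face Hub.
+    detector = MidasDetector.from_pretrained().to("cpu")
+    image = np.array(Image.open("input.png").convert("RGB"))
+    depth = detector(image, output_type="pil")
+    depth.save("input_depth.png")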
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/api.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c18e148e4b497cfad922d1c49a0a346f53aebde
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/api.py
@@ -0,0 +1,169 @@
+# based on https://github.com/isl-org/MiDaS
+
+import cv2
+import os
+import torch
+import torch.nn as nn
+from torchvision.transforms import Compose
+
+from custom_midas_repo.midas.dpt_depth import DPTDepthModel
+from custom_midas_repo.midas.midas_net import MidasNet
+from custom_midas_repo.midas.midas_net_custom import MidasNet_small
+from custom_midas_repo.midas.transforms import Resize, NormalizeImage, PrepareForNet
+from custom_controlnet_aux.util import annotator_ckpts_path
+
+
+ISL_PATHS = {
+ "dpt_large": os.path.join(annotator_ckpts_path, "dpt_large-midas-2f21e586.pt"),
+ "dpt_hybrid": os.path.join(annotator_ckpts_path, "dpt_hybrid-midas-501f0c75.pt"),
+ "midas_v21": "",
+ "midas_v21_small": "",
+}
+
+remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
+
+
+def disabled_train(self, mode=True):
+ """Overwrite model.train with this function to make sure train/eval mode
+ does not change anymore."""
+ return self
+
+
+def load_midas_transform(model_type):
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
+ # load transform only
+ if model_type == "dpt_large": # DPT-Large
+ net_w, net_h = 384, 384
+ resize_mode = "minimal"
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+
+ elif model_type == "dpt_hybrid": # DPT-Hybrid
+ net_w, net_h = 384, 384
+ resize_mode = "minimal"
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+
+ elif model_type == "midas_v21":
+ net_w, net_h = 384, 384
+ resize_mode = "upper_bound"
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+
+ elif model_type == "midas_v21_small":
+ net_w, net_h = 256, 256
+ resize_mode = "upper_bound"
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+
+ else:
+ assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
+
+ transform = Compose(
+ [
+ Resize(
+ net_w,
+ net_h,
+ resize_target=None,
+ keep_aspect_ratio=True,
+ ensure_multiple_of=32,
+ resize_method=resize_mode,
+ image_interpolation_method=cv2.INTER_CUBIC,
+ ),
+ normalization,
+ PrepareForNet(),
+ ]
+ )
+
+ return transform
+
+
+def load_model(model_type, model_path=None):
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
+ # load network
+ model_path = model_path or ISL_PATHS[model_type]
+ if model_type == "dpt_large": # DPT-Large
+ model = DPTDepthModel(
+ path=model_path,
+ backbone="vitl16_384",
+ non_negative=True,
+ )
+ net_w, net_h = 384, 384
+ resize_mode = "minimal"
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+
+ elif model_type == "dpt_hybrid": # DPT-Hybrid
+ if not os.path.exists(model_path):
+ from basicsr.utils.download_util import load_file_from_url
+ load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
+
+ model = DPTDepthModel(
+ path=model_path,
+ backbone="vitb_rn50_384",
+ non_negative=True,
+ )
+ net_w, net_h = 384, 384
+ resize_mode = "minimal"
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+
+ elif model_type == "midas_v21":
+ model = MidasNet(model_path, non_negative=True)
+ net_w, net_h = 384, 384
+ resize_mode = "upper_bound"
+ normalization = NormalizeImage(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+ )
+
+ elif model_type == "midas_v21_small":
+ model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
+ non_negative=True, blocks={'expand': True})
+ net_w, net_h = 256, 256
+ resize_mode = "upper_bound"
+ normalization = NormalizeImage(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+ )
+
+ else:
+ print(f"model_type '{model_type}' not implemented, use: --model_type large")
+ assert False
+
+ transform = Compose(
+ [
+ Resize(
+ net_w,
+ net_h,
+ resize_target=None,
+ keep_aspect_ratio=True,
+ ensure_multiple_of=32,
+ resize_method=resize_mode,
+ image_interpolation_method=cv2.INTER_CUBIC,
+ ),
+ normalization,
+ PrepareForNet(),
+ ]
+ )
+
+ return model.eval(), transform
+
+
+class MiDaSInference(nn.Module):
+ MODEL_TYPES_TORCH_HUB = [
+ "DPT_Large",
+ "DPT_Hybrid",
+ "MiDaS_small"
+ ]
+ MODEL_TYPES_ISL = [
+ "dpt_large",
+ "dpt_hybrid",
+ "midas_v21",
+ "midas_v21_small",
+ ]
+
+ def __init__(self, model_type, model_path):
+ super().__init__()
+ assert (model_type in self.MODEL_TYPES_ISL)
+ model, _ = load_model(model_type, model_path)
+ self.model = model
+ self.model.train = disabled_train
+
+ def forward(self, x):
+ with torch.no_grad():
+ prediction = self.model(x)
+ return prediction
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2db077a2c3646ece2dbbfd555de6ac4efb4590e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/midas/utils.py
@@ -0,0 +1,189 @@
+"""Utils for monoDepth."""
+import sys
+import re
+import numpy as np
+import cv2
+import torch
+
+
+def read_pfm(path):
+ """Read pfm file.
+
+ Args:
+ path (str): path to file
+
+ Returns:
+ tuple: (data, scale)
+ """
+ with open(path, "rb") as file:
+
+ color = None
+ width = None
+ height = None
+ scale = None
+ endian = None
+
+ header = file.readline().rstrip()
+ if header.decode("ascii") == "PF":
+ color = True
+ elif header.decode("ascii") == "Pf":
+ color = False
+ else:
+ raise Exception("Not a PFM file: " + path)
+
+ dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
+ if dim_match:
+ width, height = list(map(int, dim_match.groups()))
+ else:
+ raise Exception("Malformed PFM header.")
+
+ scale = float(file.readline().decode("ascii").rstrip())
+ if scale < 0:
+ # little-endian
+ endian = "<"
+ scale = -scale
+ else:
+ # big-endian
+ endian = ">"
+
+ data = np.fromfile(file, endian + "f")
+ shape = (height, width, 3) if color else (height, width)
+
+ data = np.reshape(data, shape)
+ data = np.flipud(data)
+
+ return data, scale
+
+
+def write_pfm(path, image, scale=1):
+ """Write pfm file.
+
+ Args:
+ path (str): path to file
+ image (array): data
+ scale (int, optional): Scale. Defaults to 1.
+ """
+
+ with open(path, "wb") as file:
+ color = None
+
+ if image.dtype.name != "float32":
+ raise Exception("Image dtype must be float32.")
+
+ image = np.flipud(image)
+
+ if len(image.shape) == 3 and image.shape[2] == 3: # color image
+ color = True
+ elif (
+ len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
+ ): # greyscale
+ color = False
+ else:
+ raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
+
+ file.write("PF\n" if color else "Pf\n".encode())
+ file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
+
+ endian = image.dtype.byteorder
+
+ if endian == "<" or endian == "=" and sys.byteorder == "little":
+ scale = -scale
+
+ file.write("%f\n".encode() % scale)
+
+ image.tofile(file)
+
+
+def read_image(path):
+ """Read image and output RGB image (0-1).
+
+ Args:
+ path (str): path to file
+
+ Returns:
+ array: RGB image (0-1)
+ """
+ img = cv2.imread(path)
+
+ if img.ndim == 2:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
+
+ return img
+
+
+def resize_image(img):
+ """Resize image and make it fit for network.
+
+ Args:
+ img (array): image
+
+ Returns:
+ tensor: data ready for network
+ """
+ height_orig = img.shape[0]
+ width_orig = img.shape[1]
+
+ if width_orig > height_orig:
+ scale = width_orig / 384
+ else:
+ scale = height_orig / 384
+
+ height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
+ width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
+
+ img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
+
+ img_resized = (
+ torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
+ )
+ img_resized = img_resized.unsqueeze(0)
+
+ return img_resized
+
+
+def resize_depth(depth, width, height):
+ """Resize depth map and bring to CPU (numpy).
+
+ Args:
+ depth (tensor): depth
+ width (int): image width
+ height (int): image height
+
+ Returns:
+ array: processed depth
+ """
+ depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
+
+ depth_resized = cv2.resize(
+ depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
+ )
+
+ return depth_resized
+
+def write_depth(path, depth, bits=1):
+ """Write depth map to pfm and png file.
+
+ Args:
+ path (str): filepath without extension
+ depth (array): depth
+ """
+ write_pfm(path + ".pfm", depth.astype(np.float32))
+
+ depth_min = depth.min()
+ depth_max = depth.max()
+
+ max_val = (2**(8*bits))-1
+
+ if depth_max - depth_min > np.finfo("float").eps:
+ out = max_val * (depth - depth_min) / (depth_max - depth_min)
+ else:
+ out = np.zeros(depth.shape, dtype=depth.dtype)
+
+ if bits == 1:
+ cv2.imwrite(path + ".png", out.astype("uint8"))
+ elif bits == 2:
+ cv2.imwrite(path + ".png", out.astype("uint16"))
+
+ return
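+
+
+if __name__ == "__main__":
+    # Round-trip sketch (illustration only): write a random float32 map to a PFM file
+    # in a temporary directory and read it back unchanged.
+    import os
+    import tempfile
+
+    data = np.random.rand(48, 64).astype(np.float32)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        pfm_path = os.path.join(tmp_dir, "demo.pfm")
+        write_pfm(pfm_path, data)
+        restored, scale = read_pfm(pfm_path)
+        print(np.allclose(data, restored), scale)  # True 1.0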
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..8f196256662cec210c5993c51720dbc12c958d49
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2021-present NAVER Corp.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbdfc495f0caf3151d1640fa61f28d877856dc82
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/__init__.py
@@ -0,0 +1,51 @@
+import os
+import warnings
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME
+from .models.mbv2_mlsd_large import MobileV2_MLSD_Large
+from .utils import pred_lines
+
+
+class MLSDdetector:
+ def __init__(self, model):
+ self.model = model
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="mlsd_large_512_fp32.pth"):
+ subfolder = "annotator/ckpts" if pretrained_model_or_path == "lllyasviel/ControlNet" else ''
+ model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder=subfolder)
+ model = MobileV2_MLSD_Large()
+ model.load_state_dict(torch.load(model_path), strict=True)
+ model.eval()
+
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ return self
+
+ def __call__(self, input_image, thr_v=0.1, thr_d=0.1, detect_resolution=512, output_type="pil", upscale_method="INTER_AREA", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ img = detected_map
+ img_output = np.zeros_like(img)
+ try:
+ with torch.no_grad():
+ lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d)
+ for line in lines:
+ x_start, y_start, x_end, y_end = [int(val) for val in line]
+ cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1)
+ except Exception as e:
+ pass
+
+ detected_map = remove_pad(HWC3(img_output[:, :, 0]))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
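+
+
+if __name__ == "__main__":
+    # Usage sketch (illustration only). "room.jpg" is a hypothetical image path and the
+    # first call downloads mlsd_large_512_fp32.pth from the Hugging Face Hub.
+    detector = MLSDdetector.from_pretrained().to("cpu")
+    lines = detector(np.array(Image.open("room.jpg").convert("RGB")), output_type="pil")
+    lines.save("room_lines.png")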
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/mbv2_mlsd_large.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/mbv2_mlsd_large.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e31f091ab1d56a837634c93920004f542d4393
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/mbv2_mlsd_large.py
@@ -0,0 +1,292 @@
+import os
+import sys
+import torch
+import torch.nn as nn
+import torch.utils.model_zoo as model_zoo
+from torch.nn import functional as F
+
+
+class BlockTypeA(nn.Module):
+ def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
+ super(BlockTypeA, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(in_c2, out_c2, kernel_size=1),
+ nn.BatchNorm2d(out_c2),
+ nn.ReLU(inplace=True)
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_c1, out_c1, kernel_size=1),
+ nn.BatchNorm2d(out_c1),
+ nn.ReLU(inplace=True)
+ )
+ self.upscale = upscale
+
+ def forward(self, a, b):
+ b = self.conv1(b)
+ a = self.conv2(a)
+ if self.upscale:
+ b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
+ return torch.cat((a, b), dim=1)
+
+
+class BlockTypeB(nn.Module):
+ def __init__(self, in_c, out_c):
+ super(BlockTypeB, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
+ nn.BatchNorm2d(in_c),
+ nn.ReLU()
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
+ nn.BatchNorm2d(out_c),
+ nn.ReLU()
+ )
+
+ def forward(self, x):
+ x = self.conv1(x) + x
+ x = self.conv2(x)
+ return x
+
+class BlockTypeC(nn.Module):
+ def __init__(self, in_c, out_c):
+ super(BlockTypeC, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
+ nn.BatchNorm2d(in_c),
+ nn.ReLU()
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
+ nn.BatchNorm2d(in_c),
+ nn.ReLU()
+ )
+ self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.conv2(x)
+ x = self.conv3(x)
+ return x
+
+def _make_divisible(v, divisor, min_value=None):
+ """
+ This function is taken from the original tf repo.
+ It ensures that all layers have a channel number that is divisible by 8
+ It can be seen here:
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+ :param v:
+ :param divisor:
+ :param min_value:
+ :return:
+ """
+ if min_value is None:
+ min_value = divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than 10%.
+ if new_v < 0.9 * v:
+ new_v += divisor
+ return new_v
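+# e.g. _make_divisible(30, 8) -> 32 and _make_divisible(20, 16) -> 32: values are rounded
+# to the nearest multiple of the divisor, but never rounded down by more than 10%.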
+
+
+class ConvBNReLU(nn.Sequential):
+ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+ self.channel_pad = out_planes - in_planes
+ self.stride = stride
+ #padding = (kernel_size - 1) // 2
+
+ # TFLite uses slightly different padding than PyTorch
+ if stride == 2:
+ padding = 0
+ else:
+ padding = (kernel_size - 1) // 2
+
+ super(ConvBNReLU, self).__init__(
+ nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+ nn.BatchNorm2d(out_planes),
+ nn.ReLU6(inplace=True)
+ )
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
+
+
+ def forward(self, x):
+ # TFLite uses different padding
+ if self.stride == 2:
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
+ #print(x.shape)
+
+ for module in self:
+ if not isinstance(module, nn.MaxPool2d):
+ x = module(x)
+ return x
+
+
+class InvertedResidual(nn.Module):
+ def __init__(self, inp, oup, stride, expand_ratio):
+ super(InvertedResidual, self).__init__()
+ self.stride = stride
+ assert stride in [1, 2]
+
+ hidden_dim = int(round(inp * expand_ratio))
+ self.use_res_connect = self.stride == 1 and inp == oup
+
+ layers = []
+ if expand_ratio != 1:
+ # pw
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+ layers.extend([
+ # dw
+ ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+ # pw-linear
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup),
+ ])
+ self.conv = nn.Sequential(*layers)
+
+ def forward(self, x):
+ if self.use_res_connect:
+ return x + self.conv(x)
+ else:
+ return self.conv(x)
+
+
+class MobileNetV2(nn.Module):
+ def __init__(self, pretrained=True):
+ """
+ MobileNet V2 main class
+ Args:
+ num_classes (int): Number of classes
+ width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+ inverted_residual_setting: Network structure
+ round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+ Set to 1 to turn off rounding
+ block: Module specifying inverted residual building block for mobilenet
+ """
+ super(MobileNetV2, self).__init__()
+
+ block = InvertedResidual
+ input_channel = 32
+ last_channel = 1280
+ width_mult = 1.0
+ round_nearest = 8
+
+ inverted_residual_setting = [
+ # t, c, n, s
+ [1, 16, 1, 1],
+ [6, 24, 2, 2],
+ [6, 32, 3, 2],
+ [6, 64, 4, 2],
+ [6, 96, 3, 1],
+ #[6, 160, 3, 2],
+ #[6, 320, 1, 1],
+ ]
+
+ # only check the first element, assuming user knows t,c,n,s are required
+ if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+ raise ValueError("inverted_residual_setting should be non-empty "
+ "or a 4-element list, got {}".format(inverted_residual_setting))
+
+ # building first layer
+ input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+ self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+ features = [ConvBNReLU(4, input_channel, stride=2)]
+ # building inverted residual blocks
+ for t, c, n, s in inverted_residual_setting:
+ output_channel = _make_divisible(c * width_mult, round_nearest)
+ for i in range(n):
+ stride = s if i == 0 else 1
+ features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+ input_channel = output_channel
+
+ self.features = nn.Sequential(*features)
+ self.fpn_selected = [1, 3, 6, 10, 13]
+ # weight initialization
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.ones_(m.weight)
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.Linear):
+ nn.init.normal_(m.weight, 0, 0.01)
+ nn.init.zeros_(m.bias)
+ if pretrained:
+ self._load_pretrained_model()
+
+ def _forward_impl(self, x):
+ # This exists since TorchScript doesn't support inheritance, so the superclass method
+ # (this one) needs to have a name other than `forward` that can be accessed in a subclass
+ fpn_features = []
+ for i, f in enumerate(self.features):
+ if i > self.fpn_selected[-1]:
+ break
+ x = f(x)
+ if i in self.fpn_selected:
+ fpn_features.append(x)
+
+ c1, c2, c3, c4, c5 = fpn_features
+ return c1, c2, c3, c4, c5
+
+
+ def forward(self, x):
+ return self._forward_impl(x)
+
+ def _load_pretrained_model(self):
+ pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
+ model_dict = {}
+ state_dict = self.state_dict()
+ for k, v in pretrain_dict.items():
+ if k in state_dict:
+ model_dict[k] = v
+ state_dict.update(model_dict)
+ self.load_state_dict(state_dict)
+
+
+class MobileV2_MLSD_Large(nn.Module):
+ def __init__(self):
+ super(MobileV2_MLSD_Large, self).__init__()
+
+ self.backbone = MobileNetV2(pretrained=False)
+ ## A, B
+ self.block15 = BlockTypeA(in_c1= 64, in_c2= 96,
+ out_c1= 64, out_c2=64,
+ upscale=False)
+ self.block16 = BlockTypeB(128, 64)
+
+ ## A, B
+ self.block17 = BlockTypeA(in_c1 = 32, in_c2 = 64,
+ out_c1= 64, out_c2= 64)
+ self.block18 = BlockTypeB(128, 64)
+
+ ## A, B
+ self.block19 = BlockTypeA(in_c1=24, in_c2=64,
+ out_c1=64, out_c2=64)
+ self.block20 = BlockTypeB(128, 64)
+
+ ## A, B, C
+ self.block21 = BlockTypeA(in_c1=16, in_c2=64,
+ out_c1=64, out_c2=64)
+ self.block22 = BlockTypeB(128, 64)
+
+ self.block23 = BlockTypeC(64, 16)
+
+ def forward(self, x):
+ c1, c2, c3, c4, c5 = self.backbone(x)
+
+ x = self.block15(c4, c5)
+ x = self.block16(x)
+
+ x = self.block17(c3, x)
+ x = self.block18(x)
+
+ x = self.block19(c2, x)
+ x = self.block20(x)
+
+ x = self.block21(c1, x)
+ x = self.block22(x)
+ x = self.block23(x)
+ x = x[:, 7:, :, :]
+
+ return x
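+
+
+if __name__ == "__main__":
+    # Shape sanity check (illustration only; no pretrained weights are downloaded):
+    # the network expects a 4-channel input, and for a 512x512 input the stock
+    # configuration is expected to return a [1, 9, 256, 256] map.
+    net = MobileV2_MLSD_Large().eval()
+    with torch.no_grad():
+        out = net(torch.randn(1, 4, 512, 512))
+    print(out.shape)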
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/mbv2_mlsd_tiny.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/mbv2_mlsd_tiny.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e01a437a7d277c39a3ccea0c644a954ad6d9848
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/models/mbv2_mlsd_tiny.py
@@ -0,0 +1,275 @@
+import os
+import sys
+import torch
+import torch.nn as nn
+import torch.utils.model_zoo as model_zoo
+from torch.nn import functional as F
+
+
+class BlockTypeA(nn.Module):
+ def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
+ super(BlockTypeA, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(in_c2, out_c2, kernel_size=1),
+ nn.BatchNorm2d(out_c2),
+ nn.ReLU(inplace=True)
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_c1, out_c1, kernel_size=1),
+ nn.BatchNorm2d(out_c1),
+ nn.ReLU(inplace=True)
+ )
+ self.upscale = upscale
+
+ def forward(self, a, b):
+ b = self.conv1(b)
+ a = self.conv2(a)
+ b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
+ return torch.cat((a, b), dim=1)
+
+
+class BlockTypeB(nn.Module):
+ def __init__(self, in_c, out_c):
+ super(BlockTypeB, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
+ nn.BatchNorm2d(in_c),
+ nn.ReLU()
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
+ nn.BatchNorm2d(out_c),
+ nn.ReLU()
+ )
+
+ def forward(self, x):
+ x = self.conv1(x) + x
+ x = self.conv2(x)
+ return x
+
+class BlockTypeC(nn.Module):
+ def __init__(self, in_c, out_c):
+ super(BlockTypeC, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
+ nn.BatchNorm2d(in_c),
+ nn.ReLU()
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
+ nn.BatchNorm2d(in_c),
+ nn.ReLU()
+ )
+ self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.conv2(x)
+ x = self.conv3(x)
+ return x
+
+def _make_divisible(v, divisor, min_value=None):
+ """
+ This function is taken from the original tf repo.
+ It ensures that all layers have a channel number that is divisible by 8
+ It can be seen here:
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+ :param v:
+ :param divisor:
+ :param min_value:
+ :return:
+ """
+ if min_value is None:
+ min_value = divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than 10%.
+ if new_v < 0.9 * v:
+ new_v += divisor
+ return new_v
+
+
+class ConvBNReLU(nn.Sequential):
+ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+ self.channel_pad = out_planes - in_planes
+ self.stride = stride
+ #padding = (kernel_size - 1) // 2
+
+ # TFLite uses slightly different padding than PyTorch
+ if stride == 2:
+ padding = 0
+ else:
+ padding = (kernel_size - 1) // 2
+
+ super(ConvBNReLU, self).__init__(
+ nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+ nn.BatchNorm2d(out_planes),
+ nn.ReLU6(inplace=True)
+ )
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
+
+
+ def forward(self, x):
+ # TFLite uses different padding
+ if self.stride == 2:
+ x = F.pad(x, (0, 1, 0, 1), "constant", 0)
+ #print(x.shape)
+
+ for module in self:
+ if not isinstance(module, nn.MaxPool2d):
+ x = module(x)
+ return x
+
+
+class InvertedResidual(nn.Module):
+ def __init__(self, inp, oup, stride, expand_ratio):
+ super(InvertedResidual, self).__init__()
+ self.stride = stride
+ assert stride in [1, 2]
+
+ hidden_dim = int(round(inp * expand_ratio))
+ self.use_res_connect = self.stride == 1 and inp == oup
+
+ layers = []
+ if expand_ratio != 1:
+ # pw
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+ layers.extend([
+ # dw
+ ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+ # pw-linear
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup),
+ ])
+ self.conv = nn.Sequential(*layers)
+
+ def forward(self, x):
+ if self.use_res_connect:
+ return x + self.conv(x)
+ else:
+ return self.conv(x)
+
+
+class MobileNetV2(nn.Module):
+ def __init__(self, pretrained=True):
+        """
+        Truncated MobileNet V2 backbone used as the M-LSD feature extractor.
+
+        The width multiplier, channel rounding and inverted-residual settings are
+        fixed inside this constructor (only the stem and the first four stages are
+        kept) and the classifier head is omitted. `pretrained` is accepted for API
+        compatibility, but the ImageNet-weight loading below is disabled.
+        """
+ super(MobileNetV2, self).__init__()
+
+ block = InvertedResidual
+ input_channel = 32
+ last_channel = 1280
+ width_mult = 1.0
+ round_nearest = 8
+
+ inverted_residual_setting = [
+            # t: expansion factor, c: output channels, n: number of blocks, s: stride of the first block
+ [1, 16, 1, 1],
+ [6, 24, 2, 2],
+ [6, 32, 3, 2],
+ [6, 64, 4, 2],
+ #[6, 96, 3, 1],
+ #[6, 160, 3, 2],
+ #[6, 320, 1, 1],
+ ]
+
+ # only check the first element, assuming user knows t,c,n,s are required
+ if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+            raise ValueError("inverted_residual_setting should be a non-empty "
+                             "list of 4-element lists, got {}".format(inverted_residual_setting))
+
+ # building first layer
+ input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+ self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        # 4 input channels: the RGB image plus a constant ones channel appended in utils
+        features = [ConvBNReLU(4, input_channel, stride=2)]
+ # building inverted residual blocks
+ for t, c, n, s in inverted_residual_setting:
+ output_channel = _make_divisible(c * width_mult, round_nearest)
+ for i in range(n):
+ stride = s if i == 0 else 1
+ features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+ input_channel = output_channel
+ self.features = nn.Sequential(*features)
+
+ self.fpn_selected = [3, 6, 10]
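+        # indices into self.features: 3, 6 and 10 are the last blocks of the
+        # 24-, 32- and 64-channel stages; their outputs are returned as c2, c3, c4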
+ # weight initialization
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.ones_(m.weight)
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.Linear):
+ nn.init.normal_(m.weight, 0, 0.01)
+ nn.init.zeros_(m.bias)
+
+ #if pretrained:
+ # self._load_pretrained_model()
+
+ def _forward_impl(self, x):
+ # This exists since TorchScript doesn't support inheritance, so the superclass method
+ # (this one) needs to have a name other than `forward` that can be accessed in a subclass
+ fpn_features = []
+ for i, f in enumerate(self.features):
+ if i > self.fpn_selected[-1]:
+ break
+ x = f(x)
+ if i in self.fpn_selected:
+ fpn_features.append(x)
+
+ c2, c3, c4 = fpn_features
+ return c2, c3, c4
+
+
+ def forward(self, x):
+ return self._forward_impl(x)
+
+ def _load_pretrained_model(self):
+ pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
+ model_dict = {}
+ state_dict = self.state_dict()
+ for k, v in pretrain_dict.items():
+ if k in state_dict:
+ model_dict[k] = v
+ state_dict.update(model_dict)
+ self.load_state_dict(state_dict)
+
+
+class MobileV2_MLSD_Tiny(nn.Module):
+ def __init__(self):
+ super(MobileV2_MLSD_Tiny, self).__init__()
+
+ self.backbone = MobileNetV2(pretrained=True)
+
+ self.block12 = BlockTypeA(in_c1= 32, in_c2= 64,
+ out_c1= 64, out_c2=64)
+ self.block13 = BlockTypeB(128, 64)
+
+ self.block14 = BlockTypeA(in_c1 = 24, in_c2 = 64,
+ out_c1= 32, out_c2= 32)
+ self.block15 = BlockTypeB(64, 64)
+
+ self.block16 = BlockTypeC(64, 16)
+
+ def forward(self, x):
+ c2, c3, c4 = self.backbone(x)
+
+ x = self.block12(c3, c4)
+ x = self.block13(x)
+ x = self.block14(c2, x)
+ x = self.block15(x)
+ x = self.block16(x)
+ x = x[:, 7:, :, :]
+ #print(x.shape)
+ x = F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=True)
+
+ return x
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cb7923609983d762637e216c89bdb07c978bdf9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/mlsd/utils.py
@@ -0,0 +1,584 @@
+'''
+modified by lihaoweicv
+pytorch version
+'''
+
+'''
+M-LSD
+Copyright 2021-present NAVER Corp.
+Apache License v2.0
+'''
+
+import os
+import numpy as np
+import cv2
+import torch
+from torch.nn import functional as F
+
+
+def deccode_output_score_and_ptss(tpMap, topk_n = 200, ksize = 5):
+    '''
+    tpMap: [1, C, H, W] prediction map
+        center:       tpMap[:, 0, :, :]
+        displacement: tpMap[:, 1:5, :, :]
+    '''
+ b, c, h, w = tpMap.shape
+ assert b==1, 'only support bsize==1'
+ displacement = tpMap[:, 1:5, :, :][0]
+ center = tpMap[:, 0, :, :]
+ heat = torch.sigmoid(center)
+ hmax = F.max_pool2d( heat, (ksize, ksize), stride=1, padding=(ksize-1)//2)
+ keep = (hmax == heat).float()
+ heat = heat * keep
+ heat = heat.reshape(-1, )
+
+ scores, indices = torch.topk(heat, topk_n, dim=-1, largest=True)
+ yy = torch.floor_divide(indices, w).unsqueeze(-1)
+ xx = torch.fmod(indices, w).unsqueeze(-1)
+ ptss = torch.cat((yy, xx),dim=-1)
+
+ ptss = ptss.detach().cpu().numpy()
+ scores = scores.detach().cpu().numpy()
+ displacement = displacement.detach().cpu().numpy()
+ displacement = displacement.transpose((1,2,0))
+ return ptss, scores, displacement
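+
+# For a 1 x C x H x W prediction map the function above returns:
+#   ptss         [topk_n, 2]  (y, x) coordinates of the strongest center-map peaks
+#   scores       [topk_n]     their sigmoid scores (after ksize x ksize max-pool NMS)
+#   displacement [H, W, 4]    per-pixel (dx_start, dy_start, dx_end, dy_end) offsets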
+
+
+def pred_lines(image, model,
+ input_shape=[512, 512],
+ score_thr=0.10,
+ dist_thr=20.0):
+ h, w, _ = image.shape
+
+ device = next(iter(model.parameters())).device
+ h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]]
+
+ resized_image = np.concatenate([cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA),
+ np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
+
+ resized_image = resized_image.transpose((2,0,1))
+ batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
+ batch_image = (batch_image / 127.5) - 1.0
+
+ batch_image = torch.from_numpy(batch_image).float()
+ batch_image = batch_image.to(device)
+ outputs = model(batch_image)
+ pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
+ start = vmap[:, :, :2]
+ end = vmap[:, :, 2:]
+ dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
+
+ segments_list = []
+ for center, score in zip(pts, pts_score):
+ y, x = center
+ distance = dist_map[y, x]
+ if score > score_thr and distance > dist_thr:
+ disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
+ x_start = x + disp_x_start
+ y_start = y + disp_y_start
+ x_end = x + disp_x_end
+ y_end = y + disp_y_end
+ segments_list.append([x_start, y_start, x_end, y_end])
+
+    lines = 2 * np.array(segments_list)  # model output lives at half the input resolution (256 -> 512)
+ lines[:, 0] = lines[:, 0] * w_ratio
+ lines[:, 1] = lines[:, 1] * h_ratio
+ lines[:, 2] = lines[:, 2] * w_ratio
+ lines[:, 3] = lines[:, 3] * h_ratio
+
+ return lines
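+
+# Minimal usage sketch (illustrative; `model` is a loaded M-LSD network such as
+# MobileV2_MLSD_Tiny with trained weights, `image` an HxWx3 uint8 array):
+#   lines = pred_lines(image, model, input_shape=[512, 512], score_thr=0.10, dist_thr=20.0)
+#   # -> [N, 4] array of (x_start, y_start, x_end, y_end) in original-image coordinates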
+
+
+def pred_squares(image,
+ model,
+ input_shape=[512, 512],
+ params={'score': 0.06,
+ 'outside_ratio': 0.28,
+ 'inside_ratio': 0.45,
+ 'w_overlap': 0.0,
+ 'w_degree': 1.95,
+ 'w_length': 0.0,
+ 'w_area': 1.86,
+ 'w_center': 0.14}):
+ '''
+ shape = [height, width]
+ '''
+ h, w, _ = image.shape
+ original_shape = [h, w]
+ device = next(iter(model.parameters())).device
+
+ resized_image = np.concatenate([cv2.resize(image, (input_shape[0], input_shape[1]), interpolation=cv2.INTER_AREA),
+ np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
+ resized_image = resized_image.transpose((2, 0, 1))
+ batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
+ batch_image = (batch_image / 127.5) - 1.0
+
+ batch_image = torch.from_numpy(batch_image).float().to(device)
+ outputs = model(batch_image)
+
+ pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
+ start = vmap[:, :, :2] # (x, y)
+ end = vmap[:, :, 2:] # (x, y)
+ dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
+
+ junc_list = []
+ segments_list = []
+ for junc, score in zip(pts, pts_score):
+ y, x = junc
+ distance = dist_map[y, x]
+ if score > params['score'] and distance > 20.0:
+ junc_list.append([x, y])
+ disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
+ d_arrow = 1.0
+ x_start = x + d_arrow * disp_x_start
+ y_start = y + d_arrow * disp_y_start
+ x_end = x + d_arrow * disp_x_end
+ y_end = y + d_arrow * disp_y_end
+ segments_list.append([x_start, y_start, x_end, y_end])
+
+ segments = np.array(segments_list)
+
+ ####### post processing for squares
+ # 1. get unique lines
+ point = np.array([[0, 0]])
+ point = point[0]
+ start = segments[:, :2]
+ end = segments[:, 2:]
+ diff = start - end
+ a = diff[:, 1]
+ b = -diff[:, 0]
+ c = a * start[:, 0] + b * start[:, 1]
+
+ d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a ** 2 + b ** 2 + 1e-10)
+ theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi
+ theta[theta < 0.0] += 180
+ hough = np.concatenate([d[:, None], theta[:, None]], axis=-1)
+
+ d_quant = 1
+ theta_quant = 2
+ hough[:, 0] //= d_quant
+ hough[:, 1] //= theta_quant
+ _, indices, counts = np.unique(hough, axis=0, return_index=True, return_counts=True)
+
+ acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='float32')
+ idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='int32') - 1
+ yx_indices = hough[indices, :].astype('int32')
+ acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts
+ idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices
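+    # Each segment is mapped to quantised Hough parameters (perpendicular distance
+    # from the origin, angle in degrees). acc_map counts how many raw segments fall
+    # into each (d, theta) bin and idx_map keeps one representative segment index
+    # per bin, so near-duplicate detections can be merged in the loop below.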
+
+ acc_map_np = acc_map
+ # acc_map = acc_map[None, :, :, None]
+ #
+ # ### fast suppression using tensorflow op
+ # acc_map = tf.constant(acc_map, dtype=tf.float32)
+ # max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map)
+ # acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32)
+ # flatten_acc_map = tf.reshape(acc_map, [1, -1])
+ # topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts))
+ # _, h, w, _ = acc_map.shape
+ # y = tf.expand_dims(topk_indices // w, axis=-1)
+ # x = tf.expand_dims(topk_indices % w, axis=-1)
+ # yx = tf.concat([y, x], axis=-1)
+
+ ### fast suppression using pytorch op
+ acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0)
+ _,_, h, w = acc_map.shape
+ max_acc_map = F.max_pool2d(acc_map,kernel_size=5, stride=1, padding=2)
+ acc_map = acc_map * ( (acc_map == max_acc_map).float() )
+ flatten_acc_map = acc_map.reshape([-1, ])
+
+ scores, indices = torch.topk(flatten_acc_map, len(pts), dim=-1, largest=True)
+ yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1)
+ xx = torch.fmod(indices, w).unsqueeze(-1)
+ yx = torch.cat((yy, xx), dim=-1)
+
+ yx = yx.detach().cpu().numpy()
+
+ topk_values = scores.detach().cpu().numpy()
+ indices = idx_map[yx[:, 0], yx[:, 1]]
+ basis = 5 // 2
+
+ merged_segments = []
+ for yx_pt, max_indice, value in zip(yx, indices, topk_values):
+ y, x = yx_pt
+ if max_indice == -1 or value == 0:
+ continue
+ segment_list = []
+ for y_offset in range(-basis, basis + 1):
+ for x_offset in range(-basis, basis + 1):
+ indice = idx_map[y + y_offset, x + x_offset]
+ cnt = int(acc_map_np[y + y_offset, x + x_offset])
+ if indice != -1:
+ segment_list.append(segments[indice])
+ if cnt > 1:
+ check_cnt = 1
+ current_hough = hough[indice]
+ for new_indice, new_hough in enumerate(hough):
+ if (current_hough == new_hough).all() and indice != new_indice:
+ segment_list.append(segments[new_indice])
+ check_cnt += 1
+ if check_cnt == cnt:
+ break
+ group_segments = np.array(segment_list).reshape([-1, 2])
+ sorted_group_segments = np.sort(group_segments, axis=0)
+ x_min, y_min = sorted_group_segments[0, :]
+ x_max, y_max = sorted_group_segments[-1, :]
+
+ deg = theta[max_indice]
+ if deg >= 90:
+ merged_segments.append([x_min, y_max, x_max, y_min])
+ else:
+ merged_segments.append([x_min, y_min, x_max, y_max])
+
+ # 2. get intersections
+ new_segments = np.array(merged_segments) # (x1, y1, x2, y2)
+ start = new_segments[:, :2] # (x1, y1)
+ end = new_segments[:, 2:] # (x2, y2)
+ new_centers = (start + end) / 2.0
+ diff = start - end
+ dist_segments = np.sqrt(np.sum(diff ** 2, axis=-1))
+
+ # ax + by = c
+ a = diff[:, 1]
+ b = -diff[:, 0]
+ c = a * start[:, 0] + b * start[:, 1]
+ pre_det = a[:, None] * b[None, :]
+ det = pre_det - np.transpose(pre_det)
+
+ pre_inter_y = a[:, None] * c[None, :]
+ inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10)
+ pre_inter_x = c[:, None] * b[None, :]
+ inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10)
+ inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]], axis=-1).astype('int32')
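+    # Pairwise intersections of the lines written as a*x + b*y = c. For lines i and j,
+    # Cramer's rule gives
+    #   x = (c_i*b_j - c_j*b_i) / (a_i*b_j - a_j*b_i)
+    #   y = (a_i*c_j - a_j*c_i) / (a_i*b_j - a_j*b_i)
+    # which is what inter_x / inter_y compute, with the 1e-10 term guarding the
+    # near-parallel case where the determinant is ~0.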
+
+ # 3. get corner information
+ # 3.1 get distance
+ '''
+ dist_segments:
+ | dist(0), dist(1), dist(2), ...|
+ dist_inter_to_segment1:
+ | dist(inter,0), dist(inter,0), dist(inter,0), ... |
+ | dist(inter,1), dist(inter,1), dist(inter,1), ... |
+ ...
+    dist_inter_to_segment2:
+ | dist(inter,0), dist(inter,1), dist(inter,2), ... |
+ | dist(inter,0), dist(inter,1), dist(inter,2), ... |
+ ...
+ '''
+
+ dist_inter_to_segment1_start = np.sqrt(
+ np.sum(((inter_pts - start[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
+ dist_inter_to_segment1_end = np.sqrt(
+ np.sum(((inter_pts - end[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
+ dist_inter_to_segment2_start = np.sqrt(
+ np.sum(((inter_pts - start[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
+ dist_inter_to_segment2_end = np.sqrt(
+ np.sum(((inter_pts - end[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
+
+ # sort ascending
+ dist_inter_to_segment1 = np.sort(
+ np.concatenate([dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1),
+ axis=-1) # [n_batch, n_batch, 2]
+ dist_inter_to_segment2 = np.sort(
+ np.concatenate([dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1),
+ axis=-1) # [n_batch, n_batch, 2]
+
+ # 3.2 get degree
+ inter_to_start = new_centers[:, None, :] - inter_pts
+ deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1], inter_to_start[:, :, 0]) * 180 / np.pi
+ deg_inter_to_start[deg_inter_to_start < 0.0] += 360
+ inter_to_end = new_centers[None, :, :] - inter_pts
+ deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1], inter_to_end[:, :, 0]) * 180 / np.pi
+ deg_inter_to_end[deg_inter_to_end < 0.0] += 360
+
+ '''
+ B -- G
+ | |
+ C -- R
+ B : blue / G: green / C: cyan / R: red
+
+ 0 -- 1
+ | |
+ 3 -- 2
+ '''
+ # rename variables
+ deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end
+ # sort deg ascending
+ deg_sort = np.sort(np.concatenate([deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1), axis=-1)
+
+ deg_diff_map = np.abs(deg1_map - deg2_map)
+ # we only consider the smallest degree of intersect
+ deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180]
+
+ # define available degree range
+ deg_range = [60, 120]
+
+ corner_dict = {corner_info: [] for corner_info in range(4)}
+ inter_points = []
+ for i in range(inter_pts.shape[0]):
+ for j in range(i + 1, inter_pts.shape[1]):
+ # i, j > line index, always i < j
+ x, y = inter_pts[i, j, :]
+ deg1, deg2 = deg_sort[i, j, :]
+ deg_diff = deg_diff_map[i, j]
+
+ check_degree = deg_diff > deg_range[0] and deg_diff < deg_range[1]
+
+ outside_ratio = params['outside_ratio'] # over ratio >>> drop it!
+ inside_ratio = params['inside_ratio'] # over ratio >>> drop it!
+ check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and \
+ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or \
+ (dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and \
+ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \
+ ((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and \
+ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or \
+ (dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and \
+ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio))
+
+ if check_degree and check_distance:
+ corner_info = None
+
+ if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \
+ (deg2 >= 315 and deg1 >= 45 and deg1 <= 120):
+ corner_info, color_info = 0, 'blue'
+ elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125 and deg2 <= 225):
+ corner_info, color_info = 1, 'green'
+ elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225 and deg2 <= 315):
+ corner_info, color_info = 2, 'black'
+ elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \
+ (deg2 >= 315 and deg1 >= 225 and deg1 <= 315):
+ corner_info, color_info = 3, 'cyan'
+ else:
+ corner_info, color_info = 4, 'red' # we don't use it
+ continue
+
+ corner_dict[corner_info].append([x, y, i, j])
+ inter_points.append([x, y])
+
+ square_list = []
+ connect_list = []
+ segments_list = []
+ for corner0 in corner_dict[0]:
+ for corner1 in corner_dict[1]:
+ connect01 = False
+ for corner0_line in corner0[2:]:
+ if corner0_line in corner1[2:]:
+ connect01 = True
+ break
+ if connect01:
+ for corner2 in corner_dict[2]:
+ connect12 = False
+ for corner1_line in corner1[2:]:
+ if corner1_line in corner2[2:]:
+ connect12 = True
+ break
+ if connect12:
+ for corner3 in corner_dict[3]:
+ connect23 = False
+ for corner2_line in corner2[2:]:
+ if corner2_line in corner3[2:]:
+ connect23 = True
+ break
+ if connect23:
+ for corner3_line in corner3[2:]:
+ if corner3_line in corner0[2:]:
+ # SQUARE!!!
+ '''
+ 0 -- 1
+ | |
+ 3 -- 2
+ square_list:
+ order: 0 > 1 > 2 > 3
+ | x0, y0, x1, y1, x2, y2, x3, y3 |
+ | x0, y0, x1, y1, x2, y2, x3, y3 |
+ ...
+ connect_list:
+ order: 01 > 12 > 23 > 30
+ | line_idx01, line_idx12, line_idx23, line_idx30 |
+ | line_idx01, line_idx12, line_idx23, line_idx30 |
+ ...
+ segments_list:
+ order: 0 > 1 > 2 > 3
+ | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
+ | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
+ ...
+ '''
+ square_list.append(corner0[:2] + corner1[:2] + corner2[:2] + corner3[:2])
+ connect_list.append([corner0_line, corner1_line, corner2_line, corner3_line])
+ segments_list.append(corner0[2:] + corner1[2:] + corner2[2:] + corner3[2:])
+
+ def check_outside_inside(segments_info, connect_idx):
+ # return 'outside or inside', min distance, cover_param, peri_param
+ if connect_idx == segments_info[0]:
+ check_dist_mat = dist_inter_to_segment1
+ else:
+ check_dist_mat = dist_inter_to_segment2
+
+ i, j = segments_info
+ min_dist, max_dist = check_dist_mat[i, j, :]
+ connect_dist = dist_segments[connect_idx]
+ if max_dist > connect_dist:
+ return 'outside', min_dist, 0, 1
+ else:
+ return 'inside', min_dist, -1, -1
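+    # Roughly: when a square corner lies outside its supporting segment, the gap
+    # (min_dist) is left uncovered but still added to the perimeter; when it lies
+    # inside, the overhang is trimmed from both cover and perimeter. The
+    # cover/perimeter ratio below therefore rewards squares whose sides are well
+    # supported by detected segments.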
+
+ top_square = None
+
+ try:
+ map_size = input_shape[0] / 2
+ squares = np.array(square_list).reshape([-1, 4, 2])
+ score_array = []
+ connect_array = np.array(connect_list)
+ segments_array = np.array(segments_list).reshape([-1, 4, 2])
+
+ # get degree of corners:
+ squares_rollup = np.roll(squares, 1, axis=1)
+ squares_rolldown = np.roll(squares, -1, axis=1)
+ vec1 = squares_rollup - squares
+ normalized_vec1 = vec1 / (np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10)
+ vec2 = squares_rolldown - squares
+ normalized_vec2 = vec2 / (np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10)
+ inner_products = np.sum(normalized_vec1 * normalized_vec2, axis=-1) # [n_squares, 4]
+ squares_degree = np.arccos(inner_products) * 180 / np.pi # [n_squares, 4]
+
+ # get square score
+ overlap_scores = []
+ degree_scores = []
+ length_scores = []
+
+ for connects, segments, square, degree in zip(connect_array, segments_array, squares, squares_degree):
+ '''
+ 0 -- 1
+ | |
+ 3 -- 2
+
+ # segments: [4, 2]
+ # connects: [4]
+ '''
+
+ ###################################### OVERLAP SCORES
+ cover = 0
+ perimeter = 0
+ # check 0 > 1 > 2 > 3
+ square_length = []
+
+ for start_idx in range(4):
+ end_idx = (start_idx + 1) % 4
+
+ connect_idx = connects[start_idx] # segment idx of segment01
+ start_segments = segments[start_idx]
+ end_segments = segments[end_idx]
+
+ start_point = square[start_idx]
+ end_point = square[end_idx]
+
+ # check whether outside or inside
+ start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(start_segments,
+ connect_idx)
+ end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(end_segments, connect_idx)
+
+ cover += dist_segments[connect_idx] + start_cover_param * start_min + end_cover_param * end_min
+ perimeter += dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min
+
+ square_length.append(
+ dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min)
+
+ overlap_scores.append(cover / perimeter)
+ ######################################
+ ###################################### DEGREE SCORES
+ '''
+ deg0 vs deg2
+ deg1 vs deg3
+ '''
+ deg0, deg1, deg2, deg3 = degree
+ deg_ratio1 = deg0 / deg2
+ if deg_ratio1 > 1.0:
+ deg_ratio1 = 1 / deg_ratio1
+ deg_ratio2 = deg1 / deg3
+ if deg_ratio2 > 1.0:
+ deg_ratio2 = 1 / deg_ratio2
+ degree_scores.append((deg_ratio1 + deg_ratio2) / 2)
+ ######################################
+ ###################################### LENGTH SCORES
+ '''
+ len0 vs len2
+ len1 vs len3
+ '''
+ len0, len1, len2, len3 = square_length
+ len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0
+ len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1
+ length_scores.append((len_ratio1 + len_ratio2) / 2)
+
+ ######################################
+
+ overlap_scores = np.array(overlap_scores)
+ overlap_scores /= np.max(overlap_scores)
+
+ degree_scores = np.array(degree_scores)
+ # degree_scores /= np.max(degree_scores)
+
+ length_scores = np.array(length_scores)
+
+ ###################################### AREA SCORES
+ area_scores = np.reshape(squares, [-1, 4, 2])
+ area_x = area_scores[:, :, 0]
+ area_y = area_scores[:, :, 1]
+ correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:, 0]
+ area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(area_y[:, :-1] * area_x[:, 1:], axis=-1)
+ area_scores = 0.5 * np.abs(area_scores + correction)
+ area_scores /= (map_size * map_size) # np.max(area_scores)
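+        # The area above is the shoelace formula,
+        #   area = 0.5 * |sum_i (x_i * y_{i+1} - y_i * x_{i+1})|,
+        # with `correction` closing the polygon (last vertex back to the first);
+        # dividing by the map area keeps the score in a comparable 0..1 range.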
+ ######################################
+
+ ###################################### CENTER SCORES
+ centers = np.array([[256 // 2, 256 // 2]], dtype='float32') # [1, 2]
+ # squares: [n, 4, 2]
+ square_centers = np.mean(squares, axis=1) # [n, 2]
+ center2center = np.sqrt(np.sum((centers - square_centers) ** 2))
+ center_scores = center2center / (map_size / np.sqrt(2.0))
+
+ '''
+ score_w = [overlap, degree, area, center, length]
+ '''
+        score_w = [0.0, 1.0, 10.0, 0.5, 1.0]  # unused; the weights actually applied come from `params` below
+ score_array = params['w_overlap'] * overlap_scores \
+ + params['w_degree'] * degree_scores \
+ + params['w_area'] * area_scores \
+ - params['w_center'] * center_scores \
+ + params['w_length'] * length_scores
+
+ best_square = []
+
+ sorted_idx = np.argsort(score_array)[::-1]
+ score_array = score_array[sorted_idx]
+ squares = squares[sorted_idx]
+
+ except Exception as e:
+        pass  # no valid squares found (e.g. empty square_list); fall through and return empty results
+
+    '''
+    Rescale everything to the original image size and return:
+    new_segments, squares, score_array, inter_points
+    '''
+
+ try:
+ new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[1] * original_shape[1]
+ new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[0] * original_shape[0]
+ new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[1] * original_shape[1]
+ new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[0] * original_shape[0]
+ except:
+ new_segments = []
+
+ try:
+ squares[:, :, 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1]
+ squares[:, :, 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0]
+ except:
+ squares = []
+ score_array = []
+
+ try:
+ inter_points = np.array(inter_points)
+ inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[1] * original_shape[1]
+ inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[0] * original_shape[0]
+ except:
+ inter_points = []
+
+ return new_segments, squares, score_array, inter_points
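+
+# pred_squares returns, rescaled to original-image coordinates (empty lists/arrays
+# when nothing is detected):
+#   new_segments [M, 4]    merged line segments (x1, y1, x2, y2)
+#   squares      [K, 4, 2] detected quadrilaterals, sorted best-first by score
+#   score_array  [K]       the corresponding scores
+#   inter_points [P, 2]    all candidate corner intersections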
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..aa50aff0b88acf132dda74e1e8d4049fc3bee6a3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Caroline Chan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..95ae3f9b6b24625867c2b5919b4a065aa37f2ca8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/__init__.py
@@ -0,0 +1,85 @@
+import os
+import types
+import warnings
+
+import cv2
+import numpy as np
+import torch
+import torchvision.transforms as transforms
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME
+from .nets.NNET import NNET
+
+
+# load model weights, stripping the DataParallel "module." prefix from checkpoint keys if present
+def load_checkpoint(fpath, model):
+ ckpt = torch.load(fpath, map_location='cpu')['model']
+
+ load_dict = {}
+ for k, v in ckpt.items():
+ if k.startswith('module.'):
+ k_ = k.replace('module.', '')
+ load_dict[k_] = v
+ else:
+ load_dict[k] = v
+
+ model.load_state_dict(load_dict)
+ return model
+
+class NormalBaeDetector:
+ def __init__(self, model):
+ self.model = model
+ self.norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="scannet.pt"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ args = types.SimpleNamespace()
+ args.mode = 'client'
+ args.architecture = 'BN'
+ args.pretrained = 'scannet'
+ args.sampling_ratio = 0.4
+ args.importance_ratio = 0.7
+ model = NNET(args)
+ model = load_checkpoint(model_path, model)
+ model.eval()
+
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+
+ def __call__(self, input_image, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ image_normal = detected_map
+ with torch.no_grad():
+ image_normal = torch.from_numpy(image_normal).float().to(self.device)
+ image_normal = image_normal / 255.0
+ image_normal = rearrange(image_normal, 'h w c -> 1 c h w')
+ image_normal = self.norm(image_normal)
+
+ normal = self.model(image_normal)
+ normal = normal[0][-1][:, :3]
+ # d = torch.sum(normal ** 2.0, dim=1, keepdim=True) ** 0.5
+ # d = torch.maximum(d, torch.ones_like(d) * 1e-5)
+ # normal /= d
+ normal = ((normal + 1) * 0.5).clip(0, 1)
+
+ normal = rearrange(normal[0], 'c h w -> h w c').cpu().numpy()
+ normal_image = (normal * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = remove_pad(HWC3(normal_image))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
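+# Minimal usage sketch (illustrative; relies only on the API defined above):
+#   detector = NormalBaeDetector.from_pretrained()   # downloads the "scannet.pt" checkpoint
+#   detector = detector.to("cuda")                   # optional device move
+#   normal_map = detector(input_image, detect_resolution=512, output_type="pil")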
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/NNET.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/NNET.py
new file mode 100644
index 0000000000000000000000000000000000000000..3be2a60516eb33bd8235e0ca2f489e9b4c08b1e6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/NNET.py
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .submodules.encoder import Encoder
+from .submodules.decoder import Decoder
+
+
+class NNET(nn.Module):
+ def __init__(self, args):
+ super(NNET, self).__init__()
+ self.encoder = Encoder()
+ self.decoder = Decoder(args)
+
+ def get_1x_lr_params(self): # lr/10 learning rate
+ return self.encoder.parameters()
+
+ def get_10x_lr_params(self): # lr learning rate
+ return self.decoder.parameters()
+
+ def forward(self, img, **kwargs):
+ return self.decoder(self.encoder(img), **kwargs)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/baseline.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/baseline.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb5c98a81b430fa260edaea546faeb3dd58d304f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/baseline.py
@@ -0,0 +1,85 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .submodules.submodules import UpSampleBN, norm_normalize
+
+
+# This is the baseline encoder-decoder we used in the ablation study
+class NNET(nn.Module):
+ def __init__(self, args=None):
+ super(NNET, self).__init__()
+ self.encoder = Encoder()
+ self.decoder = Decoder(num_classes=4)
+
+ def forward(self, x, **kwargs):
+ out = self.decoder(self.encoder(x), **kwargs)
+
+ # Bilinearly upsample the output to match the input resolution
+ up_out = F.interpolate(out, size=[x.size(2), x.size(3)], mode='bilinear', align_corners=False)
+
+ # L2-normalize the first three channels / ensure positive value for concentration parameters (kappa)
+ up_out = norm_normalize(up_out)
+ return up_out
+
+ def get_1x_lr_params(self): # lr/10 learning rate
+ return self.encoder.parameters()
+
+ def get_10x_lr_params(self): # lr learning rate
+ modules = [self.decoder]
+ for m in modules:
+ yield from m.parameters()
+
+
+# Encoder
+class Encoder(nn.Module):
+ def __init__(self):
+ super(Encoder, self).__init__()
+
+ basemodel_name = 'tf_efficientnet_b5_ap'
+ basemodel = torch.hub.load('rwightman/gen-efficientnet-pytorch', basemodel_name, pretrained=True)
+
+ # Remove last layer
+ basemodel.global_pool = nn.Identity()
+ basemodel.classifier = nn.Identity()
+
+ self.original_model = basemodel
+
+ def forward(self, x):
+ features = [x]
+ for k, v in self.original_model._modules.items():
+ if (k == 'blocks'):
+ for ki, vi in v._modules.items():
+ features.append(vi(features[-1]))
+ else:
+ features.append(v(features[-1]))
+ return features
+
+
+# Decoder (no pixel-wise MLP, no uncertainty-guided sampling)
+class Decoder(nn.Module):
+ def __init__(self, num_classes=4):
+ super(Decoder, self).__init__()
+ self.conv2 = nn.Conv2d(2048, 2048, kernel_size=1, stride=1, padding=0)
+ self.up1 = UpSampleBN(skip_input=2048 + 176, output_features=1024)
+ self.up2 = UpSampleBN(skip_input=1024 + 64, output_features=512)
+ self.up3 = UpSampleBN(skip_input=512 + 40, output_features=256)
+ self.up4 = UpSampleBN(skip_input=256 + 24, output_features=128)
+ self.conv3 = nn.Conv2d(128, num_classes, kernel_size=3, stride=1, padding=1)
+
+ def forward(self, features):
+ x_block0, x_block1, x_block2, x_block3, x_block4 = features[4], features[5], features[6], features[8], features[11]
+ x_d0 = self.conv2(x_block4)
+ x_d1 = self.up1(x_d0, x_block3)
+ x_d2 = self.up2(x_d1, x_block2)
+ x_d3 = self.up3(x_d2, x_block1)
+ x_d4 = self.up4(x_d3, x_block0)
+ out = self.conv3(x_d4)
+ return out
+
+
+if __name__ == '__main__':
+    model = NNET()  # note: this file defines NNET (the baseline), not a separate Baseline class
+ x = torch.rand(2, 3, 480, 640)
+ out = model(x)
+ print(out.shape)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/decoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0161873d82d545c6f73c675a05460a44b290e956
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/decoder.py
@@ -0,0 +1,202 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .submodules import UpSampleBN, UpSampleGN, norm_normalize, sample_points
+
+
+class Decoder(nn.Module):
+ def __init__(self, args):
+ super(Decoder, self).__init__()
+
+ # hyper-parameter for sampling
+ self.sampling_ratio = args.sampling_ratio
+ self.importance_ratio = args.importance_ratio
+
+ # feature-map
+ self.conv2 = nn.Conv2d(2048, 2048, kernel_size=1, stride=1, padding=0)
+ if args.architecture == 'BN':
+ self.up1 = UpSampleBN(skip_input=2048 + 176, output_features=1024)
+ self.up2 = UpSampleBN(skip_input=1024 + 64, output_features=512)
+ self.up3 = UpSampleBN(skip_input=512 + 40, output_features=256)
+ self.up4 = UpSampleBN(skip_input=256 + 24, output_features=128)
+
+ elif args.architecture == 'GN':
+ self.up1 = UpSampleGN(skip_input=2048 + 176, output_features=1024)
+ self.up2 = UpSampleGN(skip_input=1024 + 64, output_features=512)
+ self.up3 = UpSampleGN(skip_input=512 + 40, output_features=256)
+ self.up4 = UpSampleGN(skip_input=256 + 24, output_features=128)
+
+ else:
+ raise Exception('invalid architecture')
+
+ # produces 1/8 res output
+ self.out_conv_res8 = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
+
+ # produces 1/4 res output
+ self.out_conv_res4 = nn.Sequential(
+ nn.Conv1d(512 + 4, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 4, kernel_size=1),
+ )
+
+ # produces 1/2 res output
+ self.out_conv_res2 = nn.Sequential(
+ nn.Conv1d(256 + 4, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 4, kernel_size=1),
+ )
+
+ # produces 1/1 res output
+ self.out_conv_res1 = nn.Sequential(
+ nn.Conv1d(128 + 4, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(),
+ nn.Conv1d(128, 4, kernel_size=1),
+ )
+
+ def forward(self, features, gt_norm_mask=None, mode='test'):
+ x_block0, x_block1, x_block2, x_block3, x_block4 = features[4], features[5], features[6], features[8], features[11]
+
+ # generate feature-map
+
+ x_d0 = self.conv2(x_block4) # x_d0 : [2, 2048, 15, 20] 1/32 res
+ x_d1 = self.up1(x_d0, x_block3) # x_d1 : [2, 1024, 30, 40] 1/16 res
+ x_d2 = self.up2(x_d1, x_block2) # x_d2 : [2, 512, 60, 80] 1/8 res
+ x_d3 = self.up3(x_d2, x_block1) # x_d3: [2, 256, 120, 160] 1/4 res
+ x_d4 = self.up4(x_d3, x_block0) # x_d4: [2, 128, 240, 320] 1/2 res
+
+ # 1/8 res output
+ out_res8 = self.out_conv_res8(x_d2) # out_res8: [2, 4, 60, 80] 1/8 res output
+ out_res8 = norm_normalize(out_res8) # out_res8: [2, 4, 60, 80] 1/8 res output
+
+ ################################################################################################################
+ # out_res4
+ ################################################################################################################
+
+ if mode == 'train':
+ # upsampling ... out_res8: [2, 4, 60, 80] -> out_res8_res4: [2, 4, 120, 160]
+ out_res8_res4 = F.interpolate(out_res8, scale_factor=2, mode='bilinear', align_corners=True)
+ B, _, H, W = out_res8_res4.shape
+
+ # samples: [B, 1, N, 2]
+ point_coords_res4, rows_int, cols_int = sample_points(out_res8_res4.detach(), gt_norm_mask,
+ sampling_ratio=self.sampling_ratio,
+ beta=self.importance_ratio)
+
+ # output (needed for evaluation / visualization)
+ out_res4 = out_res8_res4
+
+ # grid_sample feature-map
+ feat_res4 = F.grid_sample(x_d2, point_coords_res4, mode='bilinear', align_corners=True) # (B, 512, 1, N)
+ init_pred = F.grid_sample(out_res8, point_coords_res4, mode='bilinear', align_corners=True) # (B, 4, 1, N)
+ feat_res4 = torch.cat([feat_res4, init_pred], dim=1) # (B, 512+4, 1, N)
+
+ # prediction (needed to compute loss)
+ samples_pred_res4 = self.out_conv_res4(feat_res4[:, :, 0, :]) # (B, 4, N)
+ samples_pred_res4 = norm_normalize(samples_pred_res4) # (B, 4, N) - normalized
+
+ for i in range(B):
+ out_res4[i, :, rows_int[i, :], cols_int[i, :]] = samples_pred_res4[i, :, :]
+
+ else:
+ # grid_sample feature-map
+ feat_map = F.interpolate(x_d2, scale_factor=2, mode='bilinear', align_corners=True)
+ init_pred = F.interpolate(out_res8, scale_factor=2, mode='bilinear', align_corners=True)
+ feat_map = torch.cat([feat_map, init_pred], dim=1) # (B, 512+4, H, W)
+ B, _, H, W = feat_map.shape
+
+ # try all pixels
+ out_res4 = self.out_conv_res4(feat_map.view(B, 512 + 4, -1)) # (B, 4, N)
+ out_res4 = norm_normalize(out_res4) # (B, 4, N) - normalized
+ out_res4 = out_res4.view(B, 4, H, W)
+ samples_pred_res4 = point_coords_res4 = None
+
+ ################################################################################################################
+ # out_res2
+ ################################################################################################################
+
+ if mode == 'train':
+
+ # upsampling ... out_res4: [2, 4, 120, 160] -> out_res4_res2: [2, 4, 240, 320]
+ out_res4_res2 = F.interpolate(out_res4, scale_factor=2, mode='bilinear', align_corners=True)
+ B, _, H, W = out_res4_res2.shape
+
+ # samples: [B, 1, N, 2]
+ point_coords_res2, rows_int, cols_int = sample_points(out_res4_res2.detach(), gt_norm_mask,
+ sampling_ratio=self.sampling_ratio,
+ beta=self.importance_ratio)
+
+ # output (needed for evaluation / visualization)
+ out_res2 = out_res4_res2
+
+ # grid_sample feature-map
+ feat_res2 = F.grid_sample(x_d3, point_coords_res2, mode='bilinear', align_corners=True) # (B, 256, 1, N)
+ init_pred = F.grid_sample(out_res4, point_coords_res2, mode='bilinear', align_corners=True) # (B, 4, 1, N)
+ feat_res2 = torch.cat([feat_res2, init_pred], dim=1) # (B, 256+4, 1, N)
+
+ # prediction (needed to compute loss)
+ samples_pred_res2 = self.out_conv_res2(feat_res2[:, :, 0, :]) # (B, 4, N)
+ samples_pred_res2 = norm_normalize(samples_pred_res2) # (B, 4, N) - normalized
+
+ for i in range(B):
+ out_res2[i, :, rows_int[i, :], cols_int[i, :]] = samples_pred_res2[i, :, :]
+
+ else:
+ # grid_sample feature-map
+ feat_map = F.interpolate(x_d3, scale_factor=2, mode='bilinear', align_corners=True)
+ init_pred = F.interpolate(out_res4, scale_factor=2, mode='bilinear', align_corners=True)
+            feat_map = torch.cat([feat_map, init_pred], dim=1)   # (B, 256+4, H, W)
+ B, _, H, W = feat_map.shape
+
+ out_res2 = self.out_conv_res2(feat_map.view(B, 256 + 4, -1)) # (B, 4, N)
+ out_res2 = norm_normalize(out_res2) # (B, 4, N) - normalized
+ out_res2 = out_res2.view(B, 4, H, W)
+ samples_pred_res2 = point_coords_res2 = None
+
+ ################################################################################################################
+ # out_res1
+ ################################################################################################################
+
+ if mode == 'train':
+            # upsampling ... out_res2: [2, 4, 240, 320] -> out_res2_res1: [2, 4, 480, 640]
+ out_res2_res1 = F.interpolate(out_res2, scale_factor=2, mode='bilinear', align_corners=True)
+ B, _, H, W = out_res2_res1.shape
+
+ # samples: [B, 1, N, 2]
+ point_coords_res1, rows_int, cols_int = sample_points(out_res2_res1.detach(), gt_norm_mask,
+ sampling_ratio=self.sampling_ratio,
+ beta=self.importance_ratio)
+
+ # output (needed for evaluation / visualization)
+ out_res1 = out_res2_res1
+
+ # grid_sample feature-map
+ feat_res1 = F.grid_sample(x_d4, point_coords_res1, mode='bilinear', align_corners=True) # (B, 128, 1, N)
+ init_pred = F.grid_sample(out_res2, point_coords_res1, mode='bilinear', align_corners=True) # (B, 4, 1, N)
+ feat_res1 = torch.cat([feat_res1, init_pred], dim=1) # (B, 128+4, 1, N)
+
+ # prediction (needed to compute loss)
+ samples_pred_res1 = self.out_conv_res1(feat_res1[:, :, 0, :]) # (B, 4, N)
+ samples_pred_res1 = norm_normalize(samples_pred_res1) # (B, 4, N) - normalized
+
+ for i in range(B):
+ out_res1[i, :, rows_int[i, :], cols_int[i, :]] = samples_pred_res1[i, :, :]
+
+ else:
+ # grid_sample feature-map
+ feat_map = F.interpolate(x_d4, scale_factor=2, mode='bilinear', align_corners=True)
+ init_pred = F.interpolate(out_res2, scale_factor=2, mode='bilinear', align_corners=True)
+            feat_map = torch.cat([feat_map, init_pred], dim=1)   # (B, 128+4, H, W)
+ B, _, H, W = feat_map.shape
+
+ out_res1 = self.out_conv_res1(feat_map.view(B, 128 + 4, -1)) # (B, 4, N)
+ out_res1 = norm_normalize(out_res1) # (B, 4, N) - normalized
+ out_res1 = out_res1.view(B, 4, H, W)
+ samples_pred_res1 = point_coords_res1 = None
+
+ return [out_res8, out_res4, out_res2, out_res1], \
+ [out_res8, samples_pred_res4, samples_pred_res2, samples_pred_res1], \
+ [None, point_coords_res4, point_coords_res2, point_coords_res1]
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/.gitignore b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b44ec1da9c1b744d54735281b9509ac7aa8cbbcf
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/.gitignore
@@ -0,0 +1,109 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# pytorch stuff
+*.pth
+*.onnx
+*.pb
+
+trained_models/
+.fuse_hidden*
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/BENCHMARK.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/BENCHMARK.md
new file mode 100644
index 0000000000000000000000000000000000000000..d0491e2398cbe65b358dcaf7b020d5b599e18d21
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/BENCHMARK.md
@@ -0,0 +1,555 @@
+# Model Performance Benchmarks
+
+All benchmarks run as per:
+
+```
+python onnx_export.py --model mobilenetv3_100 ./mobilenetv3_100.onnx
+python onnx_optimize.py ./mobilenetv3_100.onnx --output mobilenetv3_100-opt.onnx
+python onnx_to_caffe.py ./mobilenetv3_100.onnx --c2-prefix mobilenetv3
+python onnx_to_caffe.py ./mobilenetv3_100-opt.onnx --c2-prefix mobilenetv3-opt
+python caffe2_benchmark.py --c2-init ./mobilenetv3.init.pb --c2-predict ./mobilenetv3.predict.pb
+python caffe2_benchmark.py --c2-init ./mobilenetv3-opt.init.pb --c2-predict ./mobilenetv3-opt.predict.pb
+```
+
+## EfficientNet-B0
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 49.2862. Iters per second: 20.2897
+Time per operator type:
+ 29.7378 ms. 60.5145%. Conv
+ 12.1785 ms. 24.7824%. Sigmoid
+ 3.62811 ms. 7.38297%. SpatialBN
+ 2.98444 ms. 6.07314%. Mul
+ 0.326902 ms. 0.665225%. AveragePool
+ 0.197317 ms. 0.401528%. FC
+ 0.0852877 ms. 0.173555%. Add
+ 0.0032607 ms. 0.00663532%. Squeeze
+ 49.1416 ms in Total
+FLOP per operator type:
+ 0.76907 GFLOP. 95.2696%. Conv
+ 0.0269508 GFLOP. 3.33857%. SpatialBN
+ 0.00846444 GFLOP. 1.04855%. Mul
+ 0.002561 GFLOP. 0.317248%. FC
+ 0.000210112 GFLOP. 0.0260279%. Add
+ 0.807256 GFLOP in Total
+Feature Memory Read per operator type:
+ 58.5253 MB. 43.0891%. Mul
+ 43.2015 MB. 31.807%. Conv
+ 27.2869 MB. 20.0899%. SpatialBN
+ 5.12912 MB. 3.77631%. FC
+ 1.6809 MB. 1.23756%. Add
+ 135.824 MB in Total
+Feature Memory Written per operator type:
+ 33.8578 MB. 38.1965%. Mul
+ 26.9881 MB. 30.4465%. Conv
+ 26.9508 MB. 30.4044%. SpatialBN
+ 0.840448 MB. 0.948147%. Add
+ 0.004 MB. 0.00451258%. FC
+ 88.6412 MB in Total
+Parameter Memory per operator type:
+ 15.8248 MB. 74.9391%. Conv
+ 5.124 MB. 24.265%. FC
+ 0.168064 MB. 0.795877%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 21.1168 MB in Total
+```
+### Optimized
+```
+Main run finished. Milliseconds per iter: 46.0838. Iters per second: 21.6996
+Time per operator type:
+ 29.776 ms. 65.002%. Conv
+ 12.2803 ms. 26.8084%. Sigmoid
+ 3.15073 ms. 6.87815%. Mul
+ 0.328651 ms. 0.717456%. AveragePool
+ 0.186237 ms. 0.406563%. FC
+ 0.0832429 ms. 0.181722%. Add
+ 0.0026184 ms. 0.00571606%. Squeeze
+ 45.8078 ms in Total
+FLOP per operator type:
+ 0.76907 GFLOP. 98.5601%. Conv
+ 0.00846444 GFLOP. 1.08476%. Mul
+ 0.002561 GFLOP. 0.328205%. FC
+ 0.000210112 GFLOP. 0.0269269%. Add
+ 0.780305 GFLOP in Total
+Feature Memory Read per operator type:
+ 58.5253 MB. 53.8803%. Mul
+ 43.2855 MB. 39.8501%. Conv
+ 5.12912 MB. 4.72204%. FC
+ 1.6809 MB. 1.54749%. Add
+ 108.621 MB in Total
+Feature Memory Written per operator type:
+ 33.8578 MB. 54.8834%. Mul
+ 26.9881 MB. 43.7477%. Conv
+ 0.840448 MB. 1.36237%. Add
+ 0.004 MB. 0.00648399%. FC
+ 61.6904 MB in Total
+Parameter Memory per operator type:
+ 15.8248 MB. 75.5403%. Conv
+ 5.124 MB. 24.4597%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 20.9488 MB in Total
+```
+
+## EfficientNet-B1
+### Optimized
+```
+Main run finished. Milliseconds per iter: 71.8102. Iters per second: 13.9256
+Time per operator type:
+ 45.7915 ms. 66.3206%. Conv
+ 17.8718 ms. 25.8841%. Sigmoid
+ 4.44132 ms. 6.43244%. Mul
+ 0.51001 ms. 0.738658%. AveragePool
+ 0.233283 ms. 0.337868%. Add
+ 0.194986 ms. 0.282402%. FC
+ 0.00268255 ms. 0.00388519%. Squeeze
+ 69.0456 ms in Total
+FLOP per operator type:
+ 1.37105 GFLOP. 98.7673%. Conv
+ 0.0138759 GFLOP. 0.99959%. Mul
+ 0.002561 GFLOP. 0.184489%. FC
+ 0.000674432 GFLOP. 0.0485847%. Add
+ 1.38816 GFLOP in Total
+Feature Memory Read per operator type:
+ 94.624 MB. 54.0789%. Mul
+ 69.8255 MB. 39.9062%. Conv
+ 5.39546 MB. 3.08357%. Add
+ 5.12912 MB. 2.93136%. FC
+ 174.974 MB in Total
+Feature Memory Written per operator type:
+ 55.5035 MB. 54.555%. Mul
+ 43.5333 MB. 42.7894%. Conv
+ 2.69773 MB. 2.65163%. Add
+ 0.004 MB. 0.00393165%. FC
+ 101.739 MB in Total
+Parameter Memory per operator type:
+ 25.7479 MB. 83.4024%. Conv
+ 5.124 MB. 16.5976%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 30.8719 MB in Total
+```
+
+## EfficientNet-B2
+### Optimized
+```
+Main run finished. Milliseconds per iter: 92.28. Iters per second: 10.8366
+Time per operator type:
+ 61.4627 ms. 67.5845%. Conv
+ 22.7458 ms. 25.0113%. Sigmoid
+ 5.59931 ms. 6.15701%. Mul
+ 0.642567 ms. 0.706568%. AveragePool
+ 0.272795 ms. 0.299965%. Add
+ 0.216178 ms. 0.237709%. FC
+ 0.00268895 ms. 0.00295677%. Squeeze
+ 90.942 ms in Total
+FLOP per operator type:
+ 1.98431 GFLOP. 98.9343%. Conv
+ 0.0177039 GFLOP. 0.882686%. Mul
+ 0.002817 GFLOP. 0.140451%. FC
+ 0.000853984 GFLOP. 0.0425782%. Add
+ 2.00568 GFLOP in Total
+Feature Memory Read per operator type:
+ 120.609 MB. 54.9637%. Mul
+ 86.3512 MB. 39.3519%. Conv
+ 6.83187 MB. 3.11341%. Add
+ 5.64163 MB. 2.571%. FC
+ 219.433 MB in Total
+Feature Memory Written per operator type:
+ 70.8155 MB. 54.6573%. Mul
+ 55.3273 MB. 42.7031%. Conv
+ 3.41594 MB. 2.63651%. Add
+ 0.004 MB. 0.00308731%. FC
+ 129.563 MB in Total
+Parameter Memory per operator type:
+ 30.4721 MB. 84.3913%. Conv
+ 5.636 MB. 15.6087%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 36.1081 MB in Total
+```
+
+## MixNet-M
+### Optimized
+```
+Main run finished. Milliseconds per iter: 63.1122. Iters per second: 15.8448
+Time per operator type:
+ 48.1139 ms. 75.2052%. Conv
+ 7.1341 ms. 11.1511%. Sigmoid
+ 2.63706 ms. 4.12189%. SpatialBN
+ 1.73186 ms. 2.70701%. Mul
+ 1.38707 ms. 2.16809%. Split
+ 1.29322 ms. 2.02139%. Concat
+ 1.00093 ms. 1.56452%. Relu
+ 0.235309 ms. 0.367803%. Add
+ 0.221579 ms. 0.346343%. FC
+ 0.219315 ms. 0.342803%. AveragePool
+ 0.00250145 ms. 0.00390993%. Squeeze
+ 63.9768 ms in Total
+FLOP per operator type:
+ 0.675273 GFLOP. 95.5827%. Conv
+ 0.0221072 GFLOP. 3.12921%. SpatialBN
+ 0.00538445 GFLOP. 0.762152%. Mul
+ 0.003073 GFLOP. 0.434973%. FC
+ 0.000642488 GFLOP. 0.0909421%. Add
+ 0 GFLOP. 0%. Concat
+ 0 GFLOP. 0%. Relu
+ 0.70648 GFLOP in Total
+Feature Memory Read per operator type:
+ 46.8424 MB. 30.502%. Conv
+ 36.8626 MB. 24.0036%. Mul
+ 22.3152 MB. 14.5309%. SpatialBN
+ 22.1074 MB. 14.3955%. Concat
+ 14.1496 MB. 9.21372%. Relu
+ 6.15414 MB. 4.00735%. FC
+ 5.1399 MB. 3.34692%. Add
+ 153.571 MB in Total
+Feature Memory Written per operator type:
+ 32.7672 MB. 28.4331%. Conv
+ 22.1072 MB. 19.1831%. Concat
+ 22.1072 MB. 19.1831%. SpatialBN
+ 21.5378 MB. 18.689%. Mul
+ 14.1496 MB. 12.2781%. Relu
+ 2.56995 MB. 2.23003%. Add
+ 0.004 MB. 0.00347092%. FC
+ 115.243 MB in Total
+Parameter Memory per operator type:
+ 13.7059 MB. 68.674%. Conv
+ 6.148 MB. 30.8049%. FC
+ 0.104 MB. 0.521097%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Concat
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 19.9579 MB in Total
+```
+
+## TF MobileNet-V3 Large 1.0
+
+### Optimized
+```
+Main run finished. Milliseconds per iter: 22.0495. Iters per second: 45.3525
+Time per operator type:
+ 17.437 ms. 80.0087%. Conv
+ 1.27662 ms. 5.8577%. Add
+ 1.12759 ms. 5.17387%. Div
+ 0.701155 ms. 3.21721%. Mul
+ 0.562654 ms. 2.58171%. Relu
+ 0.431144 ms. 1.97828%. Clip
+ 0.156902 ms. 0.719936%. FC
+ 0.0996858 ms. 0.457402%. AveragePool
+ 0.00112455 ms. 0.00515993%. Flatten
+ 21.7939 ms in Total
+FLOP per operator type:
+ 0.43062 GFLOP. 98.1484%. Conv
+ 0.002561 GFLOP. 0.583713%. FC
+ 0.00210867 GFLOP. 0.480616%. Mul
+ 0.00193868 GFLOP. 0.441871%. Add
+ 0.00151532 GFLOP. 0.345377%. Div
+ 0 GFLOP. 0%. Relu
+ 0.438743 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.7967 MB. 43.9391%. Conv
+ 14.496 MB. 18.3046%. Mul
+ 9.44828 MB. 11.9307%. Add
+ 9.26157 MB. 11.6949%. Relu
+ 6.0614 MB. 7.65395%. Div
+ 5.12912 MB. 6.47673%. FC
+ 79.193 MB in Total
+Feature Memory Written per operator type:
+ 17.6247 MB. 35.8656%. Conv
+ 9.26157 MB. 18.847%. Relu
+ 8.43469 MB. 17.1643%. Mul
+ 7.75472 MB. 15.7806%. Add
+ 6.06128 MB. 12.3345%. Div
+ 0.004 MB. 0.00813985%. FC
+ 49.1409 MB in Total
+Parameter Memory per operator type:
+ 16.6851 MB. 76.5052%. Conv
+ 5.124 MB. 23.4948%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Div
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 21.8091 MB in Total
+```
+
+## MobileNet-V3 (RW)
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 24.8316. Iters per second: 40.2712
+Time per operator type:
+ 15.9266 ms. 69.2624%. Conv
+ 2.36551 ms. 10.2873%. SpatialBN
+ 1.39102 ms. 6.04936%. Add
+ 1.30327 ms. 5.66773%. Div
+ 0.737014 ms. 3.20517%. Mul
+ 0.639697 ms. 2.78195%. Relu
+ 0.375681 ms. 1.63378%. Clip
+ 0.153126 ms. 0.665921%. FC
+ 0.0993787 ms. 0.432184%. AveragePool
+ 0.0032632 ms. 0.0141912%. Squeeze
+ 22.9946 ms in Total
+FLOP per operator type:
+ 0.430616 GFLOP. 94.4041%. Conv
+ 0.0175992 GFLOP. 3.85829%. SpatialBN
+ 0.002561 GFLOP. 0.561449%. FC
+ 0.00210961 GFLOP. 0.46249%. Mul
+ 0.00173891 GFLOP. 0.381223%. Add
+ 0.00151626 GFLOP. 0.33241%. Div
+ 0 GFLOP. 0%. Relu
+ 0.456141 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.7354 MB. 36.4363%. Conv
+ 17.7944 MB. 18.6658%. SpatialBN
+ 14.5035 MB. 15.2137%. Mul
+ 9.25778 MB. 9.71113%. Relu
+ 7.84641 MB. 8.23064%. Add
+ 6.06516 MB. 6.36216%. Div
+ 5.12912 MB. 5.38029%. FC
+ 95.3317 MB in Total
+Feature Memory Written per operator type:
+ 17.6246 MB. 26.7264%. Conv
+ 17.5992 MB. 26.6878%. SpatialBN
+ 9.25778 MB. 14.0387%. Relu
+ 8.43843 MB. 12.7962%. Mul
+ 6.95565 MB. 10.5477%. Add
+ 6.06502 MB. 9.19713%. Div
+ 0.004 MB. 0.00606568%. FC
+ 65.9447 MB in Total
+Parameter Memory per operator type:
+ 16.6778 MB. 76.1564%. Conv
+ 5.124 MB. 23.3979%. FC
+ 0.0976 MB. 0.445674%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Div
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 21.8994 MB in Total
+
+```
+### Optimized
+
+```
+Main run finished. Milliseconds per iter: 22.0981. Iters per second: 45.2527
+Time per operator type:
+ 17.146 ms. 78.8965%. Conv
+ 1.38453 ms. 6.37084%. Add
+ 1.30991 ms. 6.02749%. Div
+ 0.685417 ms. 3.15391%. Mul
+ 0.532589 ms. 2.45068%. Relu
+ 0.418263 ms. 1.92461%. Clip
+ 0.15128 ms. 0.696106%. FC
+ 0.102065 ms. 0.469648%. AveragePool
+ 0.0022143 ms. 0.010189%. Squeeze
+ 21.7323 ms in Total
+FLOP per operator type:
+ 0.430616 GFLOP. 98.1927%. Conv
+ 0.002561 GFLOP. 0.583981%. FC
+ 0.00210961 GFLOP. 0.481051%. Mul
+ 0.00173891 GFLOP. 0.396522%. Add
+ 0.00151626 GFLOP. 0.34575%. Div
+ 0 GFLOP. 0%. Relu
+ 0.438542 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.7842 MB. 44.833%. Conv
+ 14.5035 MB. 18.6934%. Mul
+ 9.25778 MB. 11.9323%. Relu
+ 7.84641 MB. 10.1132%. Add
+ 6.06516 MB. 7.81733%. Div
+ 5.12912 MB. 6.61087%. FC
+ 77.5861 MB in Total
+Feature Memory Written per operator type:
+ 17.6246 MB. 36.4556%. Conv
+ 9.25778 MB. 19.1492%. Relu
+ 8.43843 MB. 17.4544%. Mul
+ 6.95565 MB. 14.3874%. Add
+ 6.06502 MB. 12.5452%. Div
+ 0.004 MB. 0.00827378%. FC
+ 48.3455 MB in Total
+Parameter Memory per operator type:
+ 16.6778 MB. 76.4973%. Conv
+ 5.124 MB. 23.5027%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Div
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 21.8018 MB in Total
+
+```
+
+## MnasNet-A1
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 30.0892. Iters per second: 33.2345
+Time per operator type:
+ 24.4656 ms. 79.0905%. Conv
+ 4.14958 ms. 13.4144%. SpatialBN
+ 1.60598 ms. 5.19169%. Relu
+ 0.295219 ms. 0.95436%. Mul
+ 0.187609 ms. 0.606486%. FC
+ 0.120556 ms. 0.389724%. AveragePool
+ 0.09036 ms. 0.292109%. Add
+ 0.015727 ms. 0.050841%. Sigmoid
+ 0.00306205 ms. 0.00989875%. Squeeze
+ 30.9337 ms in Total
+FLOP per operator type:
+ 0.620598 GFLOP. 95.6434%. Conv
+ 0.0248873 GFLOP. 3.8355%. SpatialBN
+ 0.002561 GFLOP. 0.394688%. FC
+ 0.000597408 GFLOP. 0.0920695%. Mul
+ 0.000222656 GFLOP. 0.0343146%. Add
+ 0 GFLOP. 0%. Relu
+ 0.648867 GFLOP in Total
+Feature Memory Read per operator type:
+ 35.5457 MB. 38.4109%. Conv
+ 25.1552 MB. 27.1829%. SpatialBN
+ 22.5235 MB. 24.339%. Relu
+ 5.12912 MB. 5.54256%. FC
+ 2.40586 MB. 2.59978%. Mul
+ 1.78125 MB. 1.92483%. Add
+ 92.5406 MB in Total
+Feature Memory Written per operator type:
+ 24.9042 MB. 32.9424%. Conv
+ 24.8873 MB. 32.92%. SpatialBN
+ 22.5235 MB. 29.7932%. Relu
+ 2.38963 MB. 3.16092%. Mul
+ 0.890624 MB. 1.17809%. Add
+ 0.004 MB. 0.00529106%. FC
+ 75.5993 MB in Total
+Parameter Memory per operator type:
+ 10.2732 MB. 66.1459%. Conv
+ 5.124 MB. 32.9917%. FC
+ 0.133952 MB. 0.86247%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 15.5312 MB in Total
+```
+
+### Optimized
+```
+Main run finished. Milliseconds per iter: 24.2367. Iters per second: 41.2597
+Time per operator type:
+ 22.0547 ms. 91.1375%. Conv
+ 1.49096 ms. 6.16116%. Relu
+ 0.253417 ms. 1.0472%. Mul
+ 0.18506 ms. 0.76473%. FC
+ 0.112942 ms. 0.466717%. AveragePool
+ 0.086769 ms. 0.358559%. Add
+ 0.0127889 ms. 0.0528479%. Sigmoid
+ 0.0027346 ms. 0.0113003%. Squeeze
+ 24.1994 ms in Total
+FLOP per operator type:
+ 0.620598 GFLOP. 99.4581%. Conv
+ 0.002561 GFLOP. 0.41043%. FC
+ 0.000597408 GFLOP. 0.0957417%. Mul
+ 0.000222656 GFLOP. 0.0356832%. Add
+ 0 GFLOP. 0%. Relu
+ 0.623979 GFLOP in Total
+Feature Memory Read per operator type:
+ 35.6127 MB. 52.7968%. Conv
+ 22.5235 MB. 33.3917%. Relu
+ 5.12912 MB. 7.60406%. FC
+ 2.40586 MB. 3.56675%. Mul
+ 1.78125 MB. 2.64075%. Add
+ 67.4524 MB in Total
+Feature Memory Written per operator type:
+ 24.9042 MB. 49.1092%. Conv
+ 22.5235 MB. 44.4145%. Relu
+ 2.38963 MB. 4.71216%. Mul
+ 0.890624 MB. 1.75624%. Add
+ 0.004 MB. 0.00788768%. FC
+ 50.712 MB in Total
+Parameter Memory per operator type:
+ 10.2732 MB. 66.7213%. Conv
+ 5.124 MB. 33.2787%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Mul
+ 0 MB. 0%. Relu
+ 15.3972 MB in Total
+```
+## MnasNet-B1
+
+### Unoptimized
+```
+Main run finished. Milliseconds per iter: 28.3109. Iters per second: 35.322
+Time per operator type:
+ 29.1121 ms. 83.3081%. Conv
+ 4.14959 ms. 11.8746%. SpatialBN
+ 1.35823 ms. 3.88675%. Relu
+ 0.186188 ms. 0.532802%. FC
+ 0.116244 ms. 0.332647%. Add
+ 0.018641 ms. 0.0533437%. AveragePool
+ 0.0040904 ms. 0.0117052%. Squeeze
+ 34.9451 ms in Total
+FLOP per operator type:
+ 0.626272 GFLOP. 96.2088%. Conv
+ 0.0218266 GFLOP. 3.35303%. SpatialBN
+ 0.002561 GFLOP. 0.393424%. FC
+ 0.000291648 GFLOP. 0.0448034%. Add
+ 0 GFLOP. 0%. Relu
+ 0.650951 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.4354 MB. 41.3788%. Conv
+ 22.1299 MB. 26.5921%. SpatialBN
+ 19.1923 MB. 23.0622%. Relu
+ 5.12912 MB. 6.16333%. FC
+ 2.33318 MB. 2.80364%. Add
+ 83.2199 MB in Total
+Feature Memory Written per operator type:
+ 21.8266 MB. 34.0955%. Conv
+ 21.8266 MB. 34.0955%. SpatialBN
+ 19.1923 MB. 29.9805%. Relu
+ 1.16659 MB. 1.82234%. Add
+ 0.004 MB. 0.00624844%. FC
+ 64.016 MB in Total
+Parameter Memory per operator type:
+ 12.2576 MB. 69.9104%. Conv
+ 5.124 MB. 29.2245%. FC
+ 0.15168 MB. 0.865099%. SpatialBN
+ 0 MB. 0%. Add
+ 0 MB. 0%. Relu
+ 17.5332 MB in Total
+```
+
+### Optimized
+```
+Main run finished. Milliseconds per iter: 26.6364. Iters per second: 37.5426
+Time per operator type:
+ 24.9888 ms. 94.0962%. Conv
+ 1.26147 ms. 4.75011%. Relu
+ 0.176234 ms. 0.663619%. FC
+ 0.113309 ms. 0.426672%. Add
+ 0.0138708 ms. 0.0522311%. AveragePool
+ 0.00295685 ms. 0.0111341%. Squeeze
+ 26.5566 ms in Total
+FLOP per operator type:
+ 0.626272 GFLOP. 99.5466%. Conv
+ 0.002561 GFLOP. 0.407074%. FC
+ 0.000291648 GFLOP. 0.0463578%. Add
+ 0 GFLOP. 0%. Relu
+ 0.629124 GFLOP in Total
+Feature Memory Read per operator type:
+ 34.5112 MB. 56.4224%. Conv
+ 19.1923 MB. 31.3775%. Relu
+ 5.12912 MB. 8.3856%. FC
+ 2.33318 MB. 3.81452%. Add
+ 61.1658 MB in Total
+Feature Memory Written per operator type:
+ 21.8266 MB. 51.7346%. Conv
+ 19.1923 MB. 45.4908%. Relu
+ 1.16659 MB. 2.76513%. Add
+ 0.004 MB. 0.00948104%. FC
+ 42.1895 MB in Total
+Parameter Memory per operator type:
+ 12.2576 MB. 70.5205%. Conv
+ 5.124 MB. 29.4795%. FC
+ 0 MB. 0%. Add
+ 0 MB. 0%. Relu
+ 17.3816 MB in Total
+```
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..9bd196e16a8e7775f480c8a1c0f5d035f87bbc22
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2020 Ross Wightman
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/README.md b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..df80c08487a078f40387e0af8633b65ee2af2738
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/README.md
@@ -0,0 +1,323 @@
+# (Generic) EfficientNets for PyTorch
+
+A 'generic' implementation of EfficientNet, MixNet, MobileNetV3, etc. that covers most of the compute/parameter efficient architectures derived from the MobileNet V1/V2 block sequence, including those found via automated neural architecture search.
+
+All models are implemented by the GenEfficientNet or MobileNetV3 classes, with string-based architecture definitions to configure the block layouts (idea from [here](https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py)).
+
+## What's New
+
+### Aug 19, 2020
+* Add updated PyTorch trained EfficientNet-B3 weights trained by myself with `timm` (82.1 top-1)
+* Add PyTorch trained EfficientNet-Lite0 contributed by [@hal-314](https://github.com/hal-314) (75.5 top-1)
+* Update ONNX and Caffe2 export / utility scripts to work with latest PyTorch / ONNX
+* ONNX runtime based validation script added
+* Activations (mostly) brought in sync with `timm` equivalents
+
+
+### April 5, 2020
+* Add some newly trained MobileNet-V2 models trained with latest h-params, rand augment. They compare quite favourably to EfficientNet-Lite
+ * 3.5M param MobileNet-V2 100 @ 73%
+ * 4.5M param MobileNet-V2 110d @ 75%
+ * 6.1M param MobileNet-V2 140 @ 76.5%
+ * 5.8M param MobileNet-V2 120d @ 77.3%
+
+### March 23, 2020
+ * Add EfficientNet-Lite models w/ weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite)
+ * Add PyTorch trained MobileNet-V3 Large weights with 75.77% top-1
+ * IMPORTANT CHANGE (if training from scratch) - weight init changed to better match Tensorflow impl, set `fix_group_fanout=False` in `initialize_weight_goog` for old behavior
+
+### Feb 12, 2020
+ * Add EfficientNet-L2 and B0-B7 NoisyStudent weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet)
+ * Port new EfficientNet-B8 (RandAugment) weights from TF TPU; these differ from the B8 AdvProp weights and use a different input normalization.
+ * Add RandAugment PyTorch trained EfficientNet-ES (EdgeTPU-Small) weights with 78.1 top-1. Trained by [Andrew Lavin](https://github.com/andravin)
+
+### Jan 22, 2020
+ * Update weights for EfficientNet B0, B2, B3 and MixNet-XL with latest RandAugment trained weights. Trained with (https://github.com/rwightman/pytorch-image-models)
+ * Fix torchscript compatibility for PyTorch 1.4, add torchscript support for MixedConv2d using ModuleDict
+ * Test models, torchscript, onnx export with PyTorch 1.4 -- no issues
+
+### Nov 22, 2019
+ * New top-1 high! Ported official TF EfficientNet AdvProp (https://arxiv.org/abs/1911.09665) weights and B8 model spec. Created a new set of `ap` models since they use a different
+ preprocessing (Inception mean/std) from the original EfficientNet base/AA/RA weights.
+
+### Nov 15, 2019
+ * Ported official TF MobileNet-V3 float32 large/small/minimalistic weights
+ * Modifications to MobileNet-V3 model and components to support some additional config needed for differences between TF MobileNet-V3 and mine
+
+### Oct 30, 2019
+ * Many of the models will now work with torch.jit.script, MixNet being the biggest exception
+ * Improved interface for enabling torchscript or ONNX export compatible modes (via config)
+ * Add JIT optimized mem-efficient Swish/Mish autograd.fn in addition to memory-efficient autograd.fn
+ * Activation factory to select best version of activation by name or override one globally
+ * Add pretrained checkpoint load helper that handles input conv and classifier changes
+
+### Oct 27, 2019
+ * Add CondConv EfficientNet variants ported from https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv
+ * Add RandAug weights for TF EfficientNet B5 and B7 from https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet
+ * Bring over MixNet-XL model and depth scaling algo from my pytorch-image-models code base
+ * Switch activations and global pooling to modules
+ * Add memory-efficient Swish/Mish impl
+ * Add as_sequential() method to all models and allow as an argument in entrypoint fns
+ * Move MobileNetV3 into own file since it has a different head
+ * Remove ChamNet, MobileNet V2/V1 since they will likely never be used here
+
+## Models
+
+Implemented models include:
+ * EfficientNet NoisyStudent (B0-B7, L2) (https://arxiv.org/abs/1911.04252)
+ * EfficientNet AdvProp (B0-B8) (https://arxiv.org/abs/1911.09665)
+ * EfficientNet (B0-B8) (https://arxiv.org/abs/1905.11946)
+ * EfficientNet-EdgeTPU (S, M, L) (https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html)
+ * EfficientNet-CondConv (https://arxiv.org/abs/1904.04971)
+ * EfficientNet-Lite (https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite)
+ * MixNet (https://arxiv.org/abs/1907.09595)
+ * MNASNet B1, A1 (Squeeze-Excite), and Small (https://arxiv.org/abs/1807.11626)
+ * MobileNet-V3 (https://arxiv.org/abs/1905.02244)
+ * FBNet-C (https://arxiv.org/abs/1812.03443)
+ * Single-Path NAS (https://arxiv.org/abs/1904.02877)
+
+I originally implemented and trained some of these models with code [here](https://github.com/rwightman/pytorch-image-models); this repository contains just the GenEfficientNet models, validation, and associated ONNX/Caffe2 export code.
+
+## Pretrained
+
+I've managed to train several of the models to accuracies close to or above the originating papers and official impl. My training code is here: https://github.com/rwightman/pytorch-image-models
+
+
+|Model | Prec@1 (Err) | Prec@5 (Err) | Param#(M) | MAdds(M) | Image Scaling | Resolution | Crop |
+|---|---|---|---|---|---|---|---|
+| efficientnet_b3 | 82.240 (17.760) | 96.116 (3.884) | 12.23 | TBD | bicubic | 320 | 1.0 |
+| efficientnet_b3 | 82.076 (17.924) | 96.020 (3.980) | 12.23 | TBD | bicubic | 300 | 0.904 |
+| mixnet_xl | 81.074 (18.926) | 95.282 (4.718) | 11.90 | TBD | bicubic | 256 | 1.0 |
+| efficientnet_b2 | 80.612 (19.388) | 95.318 (4.682) | 9.1 | TBD | bicubic | 288 | 1.0 |
+| mixnet_xl | 80.476 (19.524) | 94.936 (5.064) | 11.90 | TBD | bicubic | 224 | 0.875 |
+| efficientnet_b2 | 80.288 (19.712) | 95.166 (4.834) | 9.1 | 1003 | bicubic | 260 | 0.890 |
+| mixnet_l | 78.976 (21.024) | 94.184 (5.816) | 7.33 | TBD | bicubic | 224 | 0.875 |
+| efficientnet_b1 | 78.692 (21.308) | 94.086 (5.914) | 7.8 | 694 | bicubic | 240 | 0.882 |
+| efficientnet_es | 78.066 (21.934) | 93.926 (6.074) | 5.44 | TBD | bicubic | 224 | 0.875 |
+| efficientnet_b0 | 77.698 (22.302) | 93.532 (6.468) | 5.3 | 390 | bicubic | 224 | 0.875 |
+| mobilenetv2_120d | 77.294 (22.706) | 93.502 (6.498) | 5.8 | TBD | bicubic | 224 | 0.875 |
+| mixnet_m | 77.256 (22.744) | 93.418 (6.582) | 5.01 | 353 | bicubic | 224 | 0.875 |
+| mobilenetv2_140 | 76.524 (23.476) | 92.990 (7.010) | 6.1 | TBD | bicubic | 224 | 0.875 |
+| mixnet_s | 75.988 (24.012) | 92.794 (7.206) | 4.13 | TBD | bicubic | 224 | 0.875 |
+| mobilenetv3_large_100 | 75.766 (24.234) | 92.542 (7.458) | 5.5 | TBD | bicubic | 224 | 0.875 |
+| mobilenetv3_rw | 75.634 (24.366) | 92.708 (7.292) | 5.5 | 219 | bicubic | 224 | 0.875 |
+| efficientnet_lite0 | 75.472 (24.528) | 92.520 (7.480) | 4.65 | TBD | bicubic | 224 | 0.875 |
+| mnasnet_a1 | 75.448 (24.552) | 92.604 (7.396) | 3.9 | 312 | bicubic | 224 | 0.875 |
+| fbnetc_100 | 75.124 (24.876) | 92.386 (7.614) | 5.6 | 385 | bilinear | 224 | 0.875 |
+| mobilenetv2_110d | 75.052 (24.948) | 92.180 (7.820) | 4.5 | TBD | bicubic | 224 | 0.875 |
+| mnasnet_b1 | 74.658 (25.342) | 92.114 (7.886) | 4.4 | 315 | bicubic | 224 | 0.875 |
+| spnasnet_100 | 74.084 (25.916) | 91.818 (8.182) | 4.4 | TBD | bilinear | 224 | 0.875 |
+| mobilenetv2_100 | 72.978 (27.022) | 91.016 (8.984) | 3.5 | TBD | bicubic | 224 | 0.875 |
+
+
+More pretrained models to come...
+
+
+## Ported Weights
+
+The weights ported from Tensorflow checkpoints for the EfficientNet models closely match the accuracy achieved in Tensorflow once a SAME convolution padding equivalent is added, and the same crop factors, image scaling, etc. (see table) are used via command-line args.
+
+**IMPORTANT:**
+* Tensorflow ported weights for EfficientNet AdvProp (AP), EfficientNet EdgeTPU, EfficientNet-CondConv, EfficientNet-Lite, and MobileNet-V3 models use Inception style (0.5, 0.5, 0.5) for mean and std.
+* Enabling the Tensorflow preprocessing pipeline with `--tf-preprocessing` at validation time will improve scores by 0.1-0.5%, bringing them very close to the original TF impl.
+
+To run validation for tf_efficientnet_b5:
+`python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b5 -b 64 --img-size 456 --crop-pct 0.934 --interpolation bicubic`
+
+To run validation w/ TF preprocessing for tf_efficientnet_b5:
+`python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b5 -b 64 --img-size 456 --tf-preprocessing`
+
+To run validation for a model with Inception preprocessing, e.g. EfficientNet-B8 AdvProp:
+`python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b8_ap -b 48 --num-gpu 2 --img-size 672 --crop-pct 0.954 --mean 0.5 --std 0.5`
+
+|Model | Prec@1 (Err) | Prec@5 (Err) | Param # | Image Scaling | Image Size | Crop |
+|---|---|---|---|---|---|---|
+| tf_efficientnet_l2_ns *tfp | 88.352 (11.648) | 98.652 (1.348) | 480 | bicubic | 800 | N/A |
+| tf_efficientnet_l2_ns | TBD | TBD | 480 | bicubic | 800 | 0.961 |
+| tf_efficientnet_l2_ns_475 | 88.234 (11.766) | 98.546 (1.454) | 480 | bicubic | 475 | 0.936 |
+| tf_efficientnet_l2_ns_475 *tfp | 88.172 (11.828) | 98.566 (1.434) | 480 | bicubic | 475 | N/A |
+| tf_efficientnet_b7_ns *tfp | 86.844 (13.156) | 98.084 (1.916) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b7_ns | 86.840 (13.160) | 98.094 (1.906) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b6_ns | 86.452 (13.548) | 97.882 (2.118) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b6_ns *tfp | 86.444 (13.556) | 97.880 (2.120) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b5_ns *tfp | 86.064 (13.936) | 97.746 (2.254) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b5_ns | 86.088 (13.912) | 97.752 (2.248) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b8_ap *tfp | 85.436 (14.564) | 97.272 (2.728) | 87.4 | bicubic | 672 | N/A |
+| tf_efficientnet_b8 *tfp | 85.384 (14.616) | 97.394 (2.606) | 87.4 | bicubic | 672 | N/A |
+| tf_efficientnet_b8 | 85.370 (14.630) | 97.390 (2.610) | 87.4 | bicubic | 672 | 0.954 |
+| tf_efficientnet_b8_ap | 85.368 (14.632) | 97.294 (2.706) | 87.4 | bicubic | 672 | 0.954 |
+| tf_efficientnet_b4_ns *tfp | 85.298 (14.702) | 97.504 (2.496) | 19.34 | bicubic | 380 | N/A |
+| tf_efficientnet_b4_ns | 85.162 (14.838) | 97.470 (2.530) | 19.34 | bicubic | 380 | 0.922 |
+| tf_efficientnet_b7_ap *tfp | 85.154 (14.846) | 97.244 (2.756) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b7_ap | 85.118 (14.882) | 97.252 (2.748) | 66.35 | bicubic | 600 | 0.949 |
+| tf_efficientnet_b7 *tfp | 84.940 (15.060) | 97.214 (2.786) | 66.35 | bicubic | 600 | N/A |
+| tf_efficientnet_b7 | 84.932 (15.068) | 97.208 (2.792) | 66.35 | bicubic | 600 | 0.949 |
+| tf_efficientnet_b6_ap | 84.786 (15.214) | 97.138 (2.862) | 43.04 | bicubic | 528 | 0.942 |
+| tf_efficientnet_b6_ap *tfp | 84.760 (15.240) | 97.124 (2.876) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b5_ap *tfp | 84.276 (15.724) | 96.932 (3.068) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b5_ap | 84.254 (15.746) | 96.976 (3.024) | 30.39 | bicubic | 456 | 0.934 |
+| tf_efficientnet_b6 *tfp | 84.140 (15.860) | 96.852 (3.148) | 43.04 | bicubic | 528 | N/A |
+| tf_efficientnet_b6 | 84.110 (15.890) | 96.886 (3.114) | 43.04 | bicubic | 528 | 0.942 |
+| tf_efficientnet_b3_ns *tfp | 84.054 (15.946) | 96.918 (3.082) | 12.23 | bicubic | 300 | N/A |
+| tf_efficientnet_b3_ns | 84.048 (15.952) | 96.910 (3.090) | 12.23 | bicubic | 300 | 0.904 |
+| tf_efficientnet_b5 *tfp | 83.822 (16.178) | 96.756 (3.244) | 30.39 | bicubic | 456 | N/A |
+| tf_efficientnet_b5 | 83.812 (16.188) | 96.748 (3.252) | 30.39 | bicubic | 456 | 0.934 |
+| tf_efficientnet_b4_ap *tfp | 83.278 (16.722) | 96.376 (3.624) | 19.34 | bicubic | 380 | N/A |
+| tf_efficientnet_b4_ap | 83.248 (16.752) | 96.388 (3.612) | 19.34 | bicubic | 380 | 0.922 |
+| tf_efficientnet_b4 | 83.022 (16.978) | 96.300 (3.700) | 19.34 | bicubic | 380 | 0.922 |
+| tf_efficientnet_b4 *tfp | 82.948 (17.052) | 96.308 (3.692) | 19.34 | bicubic | 380 | N/A |
+| tf_efficientnet_b2_ns *tfp | 82.436 (17.564) | 96.268 (3.732) | 9.11 | bicubic | 260 | N/A |
+| tf_efficientnet_b2_ns | 82.380 (17.620) | 96.248 (3.752) | 9.11 | bicubic | 260 | 0.89 |
+| tf_efficientnet_b3_ap *tfp | 81.882 (18.118) | 95.662 (4.338) | 12.23 | bicubic | 300 | N/A |
+| tf_efficientnet_b3_ap | 81.828 (18.172) | 95.624 (4.376) | 12.23 | bicubic | 300 | 0.904 |
+| tf_efficientnet_b3 | 81.636 (18.364) | 95.718 (4.282) | 12.23 | bicubic | 300 | 0.904 |
+| tf_efficientnet_b3 *tfp | 81.576 (18.424) | 95.662 (4.338) | 12.23 | bicubic | 300 | N/A |
+| tf_efficientnet_lite4 | 81.528 (18.472) | 95.668 (4.332) | 13.00 | bilinear | 380 | 0.92 |
+| tf_efficientnet_b1_ns *tfp | 81.514 (18.486) | 95.776 (4.224) | 7.79 | bicubic | 240 | N/A |
+| tf_efficientnet_lite4 *tfp | 81.502 (18.498) | 95.676 (4.324) | 13.00 | bilinear | 380 | N/A |
+| tf_efficientnet_b1_ns | 81.388 (18.612) | 95.738 (4.262) | 7.79 | bicubic | 240 | 0.88 |
+| tf_efficientnet_el | 80.534 (19.466) | 95.190 (4.810) | 10.59 | bicubic | 300 | 0.904 |
+| tf_efficientnet_el *tfp | 80.476 (19.524) | 95.200 (4.800) | 10.59 | bicubic | 300 | N/A |
+| tf_efficientnet_b2_ap *tfp | 80.420 (19.580) | 95.040 (4.960) | 9.11 | bicubic | 260 | N/A |
+| tf_efficientnet_b2_ap | 80.306 (19.694) | 95.028 (4.972) | 9.11 | bicubic | 260 | 0.890 |
+| tf_efficientnet_b2 *tfp | 80.188 (19.812) | 94.974 (5.026) | 9.11 | bicubic | 260 | N/A |
+| tf_efficientnet_b2 | 80.086 (19.914) | 94.908 (5.092) | 9.11 | bicubic | 260 | 0.890 |
+| tf_efficientnet_lite3 | 79.812 (20.188) | 94.914 (5.086) | 8.20 | bilinear | 300 | 0.904 |
+| tf_efficientnet_lite3 *tfp | 79.734 (20.266) | 94.838 (5.162) | 8.20 | bilinear | 300 | N/A |
+| tf_efficientnet_b1_ap *tfp | 79.532 (20.468) | 94.378 (5.622) | 7.79 | bicubic | 240 | N/A |
+| tf_efficientnet_cc_b1_8e *tfp | 79.464 (20.536) | 94.492 (5.508) | 39.7 | bicubic | 240 | 0.88 |
+| tf_efficientnet_cc_b1_8e | 79.298 (20.702) | 94.364 (5.636) | 39.7 | bicubic | 240 | 0.88 |
+| tf_efficientnet_b1_ap | 79.278 (20.722) | 94.308 (5.692) | 7.79 | bicubic | 240 | 0.88 |
+| tf_efficientnet_b1 *tfp | 79.172 (20.828) | 94.450 (5.550) | 7.79 | bicubic | 240 | N/A |
+| tf_efficientnet_em *tfp | 78.958 (21.042) | 94.458 (5.542) | 6.90 | bicubic | 240 | N/A |
+| tf_efficientnet_b0_ns *tfp | 78.806 (21.194) | 94.496 (5.504) | 5.29 | bicubic | 224 | N/A |
+| tf_mixnet_l *tfp | 78.846 (21.154) | 94.212 (5.788) | 7.33 | bilinear | 224 | N/A |
+| tf_efficientnet_b1 | 78.826 (21.174) | 94.198 (5.802) | 7.79 | bicubic | 240 | 0.88 |
+| tf_mixnet_l | 78.770 (21.230) | 94.004 (5.996) | 7.33 | bicubic | 224 | 0.875 |
+| tf_efficientnet_em | 78.742 (21.258) | 94.332 (5.668) | 6.90 | bicubic | 240 | 0.875 |
+| tf_efficientnet_b0_ns | 78.658 (21.342) | 94.376 (5.624) | 5.29 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_8e *tfp | 78.314 (21.686) | 93.790 (6.210) | 24.0 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_8e | 77.908 (22.092) | 93.656 (6.344) | 24.0 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_4e *tfp | 77.746 (22.254) | 93.552 (6.448) | 13.3 | bicubic | 224 | 0.875 |
+| tf_efficientnet_cc_b0_4e | 77.304 (22.696) | 93.332 (6.668) | 13.3 | bicubic | 224 | 0.875 |
+| tf_efficientnet_es *tfp | 77.616 (22.384) | 93.750 (6.250) | 5.44 | bicubic | 224 | N/A |
+| tf_efficientnet_lite2 *tfp | 77.544 (22.456) | 93.800 (6.200) | 6.09 | bilinear | 260 | N/A |
+| tf_efficientnet_lite2 | 77.460 (22.540) | 93.746 (6.254) | 6.09 | bicubic | 260 | 0.89 |
+| tf_efficientnet_b0_ap *tfp | 77.514 (22.486) | 93.576 (6.424) | 5.29 | bicubic | 224 | N/A |
+| tf_efficientnet_es | 77.264 (22.736) | 93.600 (6.400) | 5.44 | bicubic | 224 | N/A |
+| tf_efficientnet_b0 *tfp | 77.258 (22.742) | 93.478 (6.522) | 5.29 | bicubic | 224 | N/A |
+| tf_efficientnet_b0_ap | 77.084 (22.916) | 93.254 (6.746) | 5.29 | bicubic | 224 | 0.875 |
+| tf_mixnet_m *tfp | 77.072 (22.928) | 93.368 (6.632) | 5.01 | bilinear | 224 | N/A |
+| tf_mixnet_m | 76.950 (23.050) | 93.156 (6.844) | 5.01 | bicubic | 224 | 0.875 |
+| tf_efficientnet_b0 | 76.848 (23.152) | 93.228 (6.772) | 5.29 | bicubic | 224 | 0.875 |
+| tf_efficientnet_lite1 *tfp | 76.764 (23.236) | 93.326 (6.674) | 5.42 | bilinear | 240 | N/A |
+| tf_efficientnet_lite1 | 76.638 (23.362) | 93.232 (6.768) | 5.42 | bicubic | 240 | 0.882 |
+| tf_mixnet_s *tfp | 75.800 (24.200) | 92.788 (7.212) | 4.13 | bilinear | 224 | N/A |
+| tf_mobilenetv3_large_100 *tfp | 75.768 (24.232) | 92.710 (7.290) | 5.48 | bilinear | 224 | N/A |
+| tf_mixnet_s | 75.648 (24.352) | 92.636 (7.364) | 4.13 | bicubic | 224 | 0.875 |
+| tf_mobilenetv3_large_100 | 75.516 (24.484) | 92.600 (7.400) | 5.48 | bilinear | 224 | 0.875 |
+| tf_efficientnet_lite0 *tfp | 75.074 (24.926) | 92.314 (7.686) | 4.65 | bilinear | 224 | N/A |
+| tf_efficientnet_lite0 | 74.842 (25.158) | 92.170 (7.830) | 4.65 | bicubic | 224 | 0.875 |
+| tf_mobilenetv3_large_075 *tfp | 73.730 (26.270) | 91.616 (8.384) | 3.99 | bilinear | 224 |N/A |
+| tf_mobilenetv3_large_075 | 73.442 (26.558) | 91.352 (8.648) | 3.99 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_large_minimal_100 *tfp | 72.678 (27.322) | 90.860 (9.140) | 3.92 | bilinear | 224 | N/A |
+| tf_mobilenetv3_large_minimal_100 | 72.244 (27.756) | 90.636 (9.364) | 3.92 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_small_100 *tfp | 67.918 (32.082) | 87.958 (12.042) | 2.54 | bilinear | 224 | N/A |
+| tf_mobilenetv3_small_100 | 67.918 (32.082) | 87.662 (12.338) | 2.54 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_small_075 *tfp | 66.142 (33.858) | 86.498 (13.502) | 2.04 | bilinear | 224 | N/A |
+| tf_mobilenetv3_small_075 | 65.718 (34.282) | 86.136 (13.864) | 2.04 | bilinear | 224 | 0.875 |
+| tf_mobilenetv3_small_minimal_100 *tfp | 63.378 (36.622) | 84.802 (15.198) | 2.04 | bilinear | 224 | N/A |
+| tf_mobilenetv3_small_minimal_100 | 62.898 (37.102) | 84.230 (15.770) | 2.04 | bilinear | 224 | 0.875 |
+
+
+*tfp models validated with `tf-preprocessing` pipeline
+
+Google tf and tflite weights ported from official Tensorflow repositories
+* https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+* https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet
+* https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet
+
+## Usage
+
+### Environment
+
+All development and testing have been done in Conda Python 3 environments on Linux x86-64 systems, specifically Python 3.6.x, 3.7.x, and 3.8.x.
+
+Users have reported that a Python 3 Anaconda install in Windows works. I have not verified this myself.
+
+PyTorch versions 1.4, 1.5, 1.6 have been tested with this code.
+
+I've tried to keep the dependencies minimal; the setup is as per the PyTorch default install instructions for Conda:
+```
+conda create -n torch-env
+conda activate torch-env
+conda install -c pytorch pytorch torchvision cudatoolkit=10.2
+```
+
+### PyTorch Hub
+
+Models can be accessed via the PyTorch Hub API.
+
+```
+>>> torch.hub.list('rwightman/gen-efficientnet-pytorch')
+['efficientnet_b0', ...]
+>>> model = torch.hub.load('rwightman/gen-efficientnet-pytorch', 'efficientnet_b0', pretrained=True)
+>>> model.eval()
+>>> output = model(torch.randn(1,3,224,224))
+```
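+
+The `output` tensor holds raw class logits; a short, purely illustrative follow-up turns them into top-5 probabilities and class indices (only meaningful with a properly preprocessed image batch rather than the random input above):
+```
+>>> probs = torch.nn.functional.softmax(output, dim=1)
+>>> top5_prob, top5_idx = probs.topk(5, dim=1)
+```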
+
+### Pip
+This package can be installed via pip.
+
+Install (after conda env/install):
+```
+pip install geffnet
+```
+
+Eval use:
+```
+>>> import geffnet
+>>> m = geffnet.create_model('mobilenetv3_large_100', pretrained=True)
+>>> m.eval()
+```
+
+Train use:
+```
+>>> import geffnet
+>>> # models can also be created by using the entrypoint directly
+>>> m = geffnet.efficientnet_b2(pretrained=True, drop_rate=0.25, drop_connect_rate=0.2)
+>>> m.train()
+```
+
+Create in a nn.Sequential container, for fast.ai, etc:
+```
+>>> import geffnet
+>>> m = geffnet.mixnet_l(pretrained=True, drop_rate=0.25, drop_connect_rate=0.2, as_sequential=True)
+```
+
+### Exporting
+
+Scripts are included to
+* export models to ONNX (`onnx_export.py`)
+* optimize the ONNX graph (`onnx_optimize.py` or `onnx_validate.py` w/ `--onnx-output-opt` arg)
+* validate with ONNX runtime (`onnx_validate.py`)
+* convert ONNX model to Caffe2 (`onnx_to_caffe.py`)
+* validate in Caffe2 (`caffe2_validate.py`)
+* benchmark in Caffe2 w/ FLOPs, parameters output (`caffe2_benchmark.py`)
+
+As an example, to export the MobileNet-V3 pretrained model and then run an Imagenet validation:
+```
+python onnx_export.py --model mobilenetv3_large_100 ./mobilenetv3_100.onnx
+python onnx_validate.py /imagenet/validation/ --onnx-input ./mobilenetv3_100.onnx
+```
+
+These scripts were verified to work as of PyTorch 1.6 and ONNX 1.7 w/ ONNX runtime 1.4. Caffe2-compatible
+export now requires additional args mentioned in the export script (not needed in earlier versions).
+
+#### Export Notes
+1. The TF ported weights with the 'SAME' conv padding activated cannot be exported to ONNX unless the `_EXPORTABLE` flag in `config.py` is set to True. Use `config.set_exportable(True)` as in the `onnx_export.py` script (see the sketch after these notes).
+2. TF ported models with 'SAME' padding will have the padding fixed at export time to the resolution used for export. Even though dynamic padding is supported in opset >= 11, I can't get it working.
+3. The ONNX optimize facility doesn't work reliably in PyTorch 1.6 / ONNX 1.7. Fortunately, onnxruntime based inference is working very well now and includes on-the-fly optimization.
+4. ONNX / Caffe2 export/import frequently breaks with different PyTorch and ONNX version releases. Please check their respective issue trackers before filing issues here.
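+
+A minimal sketch of the exportable-mode toggle from note 1 (the model name, input size, and opset choice here are illustrative, not the exact `onnx_export.py` flow):
+```
+import torch
+import geffnet
+from geffnet import config
+
+config.set_exportable(True)  # switch activations to export-safe variants before building the model
+model = geffnet.create_model('tf_efficientnet_b0', pretrained=True)
+model.eval()
+dummy_input = torch.randn(1, 3, 224, 224)  # 'SAME' padding gets fixed to this resolution (note 2)
+torch.onnx.export(model, dummy_input, 'tf_efficientnet_b0.onnx', opset_version=10)
+```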
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/caffe2_benchmark.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/caffe2_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cb555acef42578e49430085c553678ca6feb0d1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/caffe2_benchmark.py
@@ -0,0 +1,65 @@
+""" Caffe2 validation script
+
+This script runs Caffe2 benchmark on exported ONNX model.
+It is a useful tool for reporting model FLOPS.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+from caffe2.python import core, workspace, model_helper
+from caffe2.proto import caffe2_pb2
+
+
+parser = argparse.ArgumentParser(description='Caffe2 Model Benchmark')
+parser.add_argument('--c2-prefix', default='', type=str, metavar='NAME',
+ help='caffe2 model pb name prefix')
+parser.add_argument('--c2-init', default='', type=str, metavar='PATH',
+ help='caffe2 model init .pb')
+parser.add_argument('--c2-predict', default='', type=str, metavar='PATH',
+ help='caffe2 model predict .pb')
+parser.add_argument('-b', '--batch-size', default=1, type=int,
+ metavar='N', help='mini-batch size (default: 1)')
+parser.add_argument('--img-size', default=224, type=int,
+                    metavar='N', help='Input image dimension (default: 224)')
+
+
+def main():
+ args = parser.parse_args()
+ args.gpu_id = 0
+ if args.c2_prefix:
+ args.c2_init = args.c2_prefix + '.init.pb'
+ args.c2_predict = args.c2_prefix + '.predict.pb'
+
+ model = model_helper.ModelHelper(name="le_net", init_params=False)
+
+ # Bring in the init net from init_net.pb
+ init_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_init, "rb") as f:
+ init_net_proto.ParseFromString(f.read())
+ model.param_init_net = core.Net(init_net_proto)
+
+ # bring in the predict net from predict_net.pb
+ predict_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_predict, "rb") as f:
+ predict_net_proto.ParseFromString(f.read())
+ model.net = core.Net(predict_net_proto)
+
+ # CUDA performance not impressive
+ #device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id)
+ #model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+ #model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+
+ input_blob = model.net.external_inputs[0]
+ model.param_init_net.GaussianFill(
+ [],
+ input_blob.GetUnscopedName(),
+ shape=(args.batch_size, 3, args.img_size, args.img_size),
+ mean=0.0,
+ std=1.0)
+ workspace.RunNetOnce(model.param_init_net)
+ workspace.CreateNet(model.net, overwrite=True)
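+    # BenchmarkNet args: net name, 5 warmup runs, 20 main runs, run_individual=True for a per-operator breakdown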
+ workspace.BenchmarkNet(model.net.Proto().name, 5, 20, True)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/caffe2_validate.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/caffe2_validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..2459648ec15c5ec0642ef35418c22c575b9391ac
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/caffe2_validate.py
@@ -0,0 +1,138 @@
+""" Caffe2 validation script
+
+This script is created to verify exported ONNX models running in Caffe2
+It utilizes the same PyTorch dataloader/processing pipeline for a
+fair comparison against the originals.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import numpy as np
+from caffe2.python import core, workspace, model_helper
+from caffe2.proto import caffe2_pb2
+from data import create_loader, resolve_data_config, Dataset
+from utils import AverageMeter
+import time
+
+parser = argparse.ArgumentParser(description='Caffe2 ImageNet Validation')
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--c2-prefix', default='', type=str, metavar='NAME',
+ help='caffe2 model pb name prefix')
+parser.add_argument('--c2-init', default='', type=str, metavar='PATH',
+ help='caffe2 model init .pb')
+parser.add_argument('--c2-predict', default='', type=str, metavar='PATH',
+ help='caffe2 model predict .pb')
+parser.add_argument('-j', '--workers', default=2, type=int, metavar='N',
+ help='number of data loading workers (default: 2)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+ metavar='N', help='mini-batch size (default: 256)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT',
+ help='Override default crop pct of 0.875')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true',
+                    help='use tensorflow mnasnet preprocessing')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+ metavar='N', help='print frequency (default: 10)')
+
+
+def main():
+ args = parser.parse_args()
+ args.gpu_id = 0
+ if args.c2_prefix:
+ args.c2_init = args.c2_prefix + '.init.pb'
+ args.c2_predict = args.c2_prefix + '.predict.pb'
+
+ model = model_helper.ModelHelper(name="validation_net", init_params=False)
+
+ # Bring in the init net from init_net.pb
+ init_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_init, "rb") as f:
+ init_net_proto.ParseFromString(f.read())
+ model.param_init_net = core.Net(init_net_proto)
+
+ # bring in the predict net from predict_net.pb
+ predict_net_proto = caffe2_pb2.NetDef()
+ with open(args.c2_predict, "rb") as f:
+ predict_net_proto.ParseFromString(f.read())
+ model.net = core.Net(predict_net_proto)
+
+ data_config = resolve_data_config(None, args)
+ loader = create_loader(
+ Dataset(args.data, load_bytes=args.tf_preprocessing),
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ use_prefetcher=False,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ crop_pct=data_config['crop_pct'],
+ tensorflow_preprocessing=args.tf_preprocessing)
+
+ # this is so obvious, wonderful interface
+ input_blob = model.net.external_inputs[0]
+ output_blob = model.net.external_outputs[0]
+
+ if True:
+ device_opts = None
+ else:
+ # CUDA is crashing, no idea why, awesome error message, give it a try for kicks
+ device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id)
+ model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+ model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True)
+
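+    # fill the input blob once with random data so the net can be created; real batches are fed via FeedBlob in the loop below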
+ model.param_init_net.GaussianFill(
+ [], input_blob.GetUnscopedName(),
+ shape=(1,) + data_config['input_size'], mean=0.0, std=1.0)
+ workspace.RunNetOnce(model.param_init_net)
+ workspace.CreateNet(model.net, overwrite=True)
+
+ batch_time = AverageMeter()
+ top1 = AverageMeter()
+ top5 = AverageMeter()
+ end = time.time()
+ for i, (input, target) in enumerate(loader):
+ # run the net and return prediction
+ caffe2_in = input.data.numpy()
+ workspace.FeedBlob(input_blob, caffe2_in, device_opts)
+ workspace.RunNet(model.net, num_iter=1)
+ output = workspace.FetchBlob(output_blob)
+
+ # measure accuracy and record loss
+ prec1, prec5 = accuracy_np(output.data, target.numpy())
+ top1.update(prec1.item(), input.size(0))
+ top5.update(prec5.item(), input.size(0))
+
+ # measure elapsed time
+ batch_time.update(time.time() - end)
+ end = time.time()
+
+ if i % args.print_freq == 0:
+ print('Test: [{0}/{1}]\t'
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s, {ms_avg:.3f} ms/sample) \t'
+ 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+ 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+ i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg,
+                ms_avg=1000 * batch_time.avg / input.size(0), top1=top1, top5=top5))  # convert s/sample to ms/sample
+
+ print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format(
+ top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg))
+
+
+def accuracy_np(output, target):
+ max_indices = np.argsort(output, axis=1)[:, ::-1]
+ top5 = 100 * np.equal(max_indices[:, :5], target[:, np.newaxis]).sum(axis=1).mean()
+ top1 = 100 * np.equal(max_indices[:, 0], target).mean()
+ return top1, top5
+
+
+if __name__ == '__main__':
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c53c1e046d1a5de6d11400f3b294f834d3a2b3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/__init__.py
@@ -0,0 +1,5 @@
+from .gen_efficientnet import *
+from .mobilenetv3 import *
+from .model_factory import create_model
+from .config import is_exportable, is_scriptable, set_exportable, set_scriptable
+from .activations import *
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2167fd724fe17450444f8e79b12bbdb6b0b37ebd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/__init__.py
@@ -0,0 +1,137 @@
+from geffnet import config
+from geffnet.activations.activations_me import *
+from geffnet.activations.activations_jit import *
+from geffnet.activations.activations import *
+import torch
+
+_has_silu = 'silu' in dir(torch.nn.functional)
+
+_ACT_FN_DEFAULT = dict(
+ silu=F.silu if _has_silu else swish,
+ swish=F.silu if _has_silu else swish,
+ mish=mish,
+ relu=F.relu,
+ relu6=F.relu6,
+ sigmoid=sigmoid,
+ tanh=tanh,
+ hard_sigmoid=hard_sigmoid,
+ hard_swish=hard_swish,
+)
+
+_ACT_FN_JIT = dict(
+ silu=F.silu if _has_silu else swish_jit,
+ swish=F.silu if _has_silu else swish_jit,
+ mish=mish_jit,
+)
+
+_ACT_FN_ME = dict(
+ silu=F.silu if _has_silu else swish_me,
+ swish=F.silu if _has_silu else swish_me,
+ mish=mish_me,
+ hard_swish=hard_swish_me,
+    hard_sigmoid=hard_sigmoid_me,  # keyed by activation name so the memory-efficient variant is actually selected
+)
+
+_ACT_LAYER_DEFAULT = dict(
+ silu=nn.SiLU if _has_silu else Swish,
+ swish=nn.SiLU if _has_silu else Swish,
+ mish=Mish,
+ relu=nn.ReLU,
+ relu6=nn.ReLU6,
+ sigmoid=Sigmoid,
+ tanh=Tanh,
+ hard_sigmoid=HardSigmoid,
+ hard_swish=HardSwish,
+)
+
+_ACT_LAYER_JIT = dict(
+ silu=nn.SiLU if _has_silu else SwishJit,
+ swish=nn.SiLU if _has_silu else SwishJit,
+ mish=MishJit,
+)
+
+_ACT_LAYER_ME = dict(
+ silu=nn.SiLU if _has_silu else SwishMe,
+ swish=nn.SiLU if _has_silu else SwishMe,
+ mish=MishMe,
+ hard_swish=HardSwishMe,
+ hard_sigmoid=HardSigmoidMe
+)
+
+_OVERRIDE_FN = dict()
+_OVERRIDE_LAYER = dict()
+
+
+def add_override_act_fn(name, fn):
+ global _OVERRIDE_FN
+ _OVERRIDE_FN[name] = fn
+
+
+def update_override_act_fn(overrides):
+ assert isinstance(overrides, dict)
+ global _OVERRIDE_FN
+ _OVERRIDE_FN.update(overrides)
+
+
+def clear_override_act_fn():
+ global _OVERRIDE_FN
+ _OVERRIDE_FN = dict()
+
+
+def add_override_act_layer(name, fn):
+ _OVERRIDE_LAYER[name] = fn
+
+
+def update_override_act_layer(overrides):
+ assert isinstance(overrides, dict)
+ global _OVERRIDE_LAYER
+ _OVERRIDE_LAYER.update(overrides)
+
+
+def clear_override_act_layer():
+ global _OVERRIDE_LAYER
+ _OVERRIDE_LAYER = dict()
+
+
+def get_act_fn(name='relu'):
+ """ Activation Function Factory
+ Fetching activation fns by name with this function allows export or torch script friendly
+ functions to be returned dynamically based on current config.
+ """
+ if name in _OVERRIDE_FN:
+ return _OVERRIDE_FN[name]
+ use_me = not (config.is_exportable() or config.is_scriptable() or config.is_no_jit())
+ if use_me and name in _ACT_FN_ME:
+ # If not exporting or scripting the model, first look for a memory optimized version
+ # activation with custom autograd, then fallback to jit scripted, then a Python or Torch builtin
+ return _ACT_FN_ME[name]
+ if config.is_exportable() and name in ('silu', 'swish'):
+ # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack
+ return swish
+ use_jit = not (config.is_exportable() or config.is_no_jit())
+ # NOTE: export tracing should work with jit scripted components, but I keep running into issues
+ if use_jit and name in _ACT_FN_JIT: # jit scripted models should be okay for export/scripting
+ return _ACT_FN_JIT[name]
+ return _ACT_FN_DEFAULT[name]
+
+
+def get_act_layer(name='relu'):
+ """ Activation Layer Factory
+ Fetching activation layers by name with this function allows export or torch script friendly
+ functions to be returned dynamically based on current config.
+ """
+ if name in _OVERRIDE_LAYER:
+ return _OVERRIDE_LAYER[name]
+ use_me = not (config.is_exportable() or config.is_scriptable() or config.is_no_jit())
+ if use_me and name in _ACT_LAYER_ME:
+ return _ACT_LAYER_ME[name]
+ if config.is_exportable() and name in ('silu', 'swish'):
+ # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack
+ return Swish
+ use_jit = not (config.is_exportable() or config.is_no_jit())
+ # NOTE: export tracing should work with jit scripted components, but I keep running into issues
+    if use_jit and name in _ACT_LAYER_JIT:  # jit scripted models should be okay for export/scripting
+ return _ACT_LAYER_JIT[name]
+ return _ACT_LAYER_DEFAULT[name]
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0ae1758b9537a0c200a1ff9cb4824efb5258ea9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations.py
@@ -0,0 +1,102 @@
+""" Activations
+
+A collection of activations fn and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+Copyright 2020 Ross Wightman
+"""
+from torch import nn as nn
+from torch.nn import functional as F
+
+
+def swish(x, inplace: bool = False):
+ """Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3)
+ and also as Swish (https://arxiv.org/abs/1710.05941).
+
+ TODO Rename to SiLU with addition to PyTorch
+ """
+ return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
+
+
+class Swish(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Swish, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return swish(x, self.inplace)
+
+
+def mish(x, inplace: bool = False):
+ """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+ """
+ return x.mul(F.softplus(x).tanh())
+
+
+class Mish(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Mish, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return mish(x, self.inplace)
+
+
+def sigmoid(x, inplace: bool = False):
+ return x.sigmoid_() if inplace else x.sigmoid()
+
+
+# PyTorch has this, but not with a consistent inplace argument interface
+class Sigmoid(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Sigmoid, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return x.sigmoid_() if self.inplace else x.sigmoid()
+
+
+def tanh(x, inplace: bool = False):
+ return x.tanh_() if inplace else x.tanh()
+
+
+# PyTorch has this, but not with a consistent inplace argument interface
+class Tanh(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(Tanh, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return x.tanh_() if self.inplace else x.tanh()
+
+
+def hard_swish(x, inplace: bool = False):
+ inner = F.relu6(x + 3.).div_(6.)
+ return x.mul_(inner) if inplace else x.mul(inner)
+
+
+class HardSwish(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSwish, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return hard_swish(x, self.inplace)
+
+
+def hard_sigmoid(x, inplace: bool = False):
+ if inplace:
+ return x.add_(3.).clamp_(0., 6.).div_(6.)
+ else:
+ return F.relu6(x + 3.) / 6.
+
+
+class HardSigmoid(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSigmoid, self).__init__()
+ self.inplace = inplace
+
+ def forward(self, x):
+ return hard_sigmoid(x, self.inplace)
+
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations_jit.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations_jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25d0fa87db91b75f1346f5579090687287dd025
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations_jit.py
@@ -0,0 +1,79 @@
+""" Activations (jit)
+
+A collection of jit-scripted activations fn and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+All jit scripted activations lack in-place variations on purpose; scripted kernel fusion does not
+currently work across in-place op boundaries, so performance is equal to or less than that of the non-scripted
+versions if they contain in-place ops.
+
+Copyright 2020 Ross Wightman
+"""
+
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+__all__ = ['swish_jit', 'SwishJit', 'mish_jit', 'MishJit',
+ 'hard_sigmoid_jit', 'HardSigmoidJit', 'hard_swish_jit', 'HardSwishJit']
+
+
+@torch.jit.script
+def swish_jit(x, inplace: bool = False):
+ """Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3)
+ and also as Swish (https://arxiv.org/abs/1710.05941).
+
+ TODO Rename to SiLU with addition to PyTorch
+ """
+ return x.mul(x.sigmoid())
+
+
+@torch.jit.script
+def mish_jit(x, _inplace: bool = False):
+ """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+ """
+ return x.mul(F.softplus(x).tanh())
+
+
+class SwishJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(SwishJit, self).__init__()
+
+ def forward(self, x):
+ return swish_jit(x)
+
+
+class MishJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(MishJit, self).__init__()
+
+ def forward(self, x):
+ return mish_jit(x)
+
+
+@torch.jit.script
+def hard_sigmoid_jit(x, inplace: bool = False):
+ # return F.relu6(x + 3.) / 6.
+ return (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster?
+
+
+class HardSigmoidJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSigmoidJit, self).__init__()
+
+ def forward(self, x):
+ return hard_sigmoid_jit(x)
+
+
+@torch.jit.script
+def hard_swish_jit(x, inplace: bool = False):
+ # return x * (F.relu6(x + 3.) / 6)
+ return x * (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster?
+
+
+class HardSwishJit(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSwishJit, self).__init__()
+
+ def forward(self, x):
+ return hard_swish_jit(x)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations_me.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations_me.py
new file mode 100644
index 0000000000000000000000000000000000000000..45dc472a1f8d3c3539fca746124482ade24c8613
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/activations/activations_me.py
@@ -0,0 +1,174 @@
+""" Activations (memory-efficient w/ custom autograd)
+
+A collection of activation fns and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+These activations are not compatible with jit scripting or ONNX export of the model; please use
+either the JIT or basic versions of the activations instead.
+
+Copyright 2020 Ross Wightman
+"""
+
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+
+__all__ = ['swish_me', 'SwishMe', 'mish_me', 'MishMe',
+ 'hard_sigmoid_me', 'HardSigmoidMe', 'hard_swish_me', 'HardSwishMe']
+
+
+@torch.jit.script
+def swish_jit_fwd(x):
+ return x.mul(torch.sigmoid(x))
+
+
+@torch.jit.script
+def swish_jit_bwd(x, grad_output):
+ x_sigmoid = torch.sigmoid(x)
+ return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
+
+
+class SwishJitAutoFn(torch.autograd.Function):
+ """ torch.jit.script optimised Swish w/ memory-efficient checkpoint
+    Inspired by a conversation between Jeremy Howard & Adam Paszke
+ https://twitter.com/jeremyphoward/status/1188251041835315200
+
+ Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3)
+ and also as Swish (https://arxiv.org/abs/1710.05941).
+
+ TODO Rename to SiLU with addition to PyTorch
+ """
+
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return swish_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return swish_jit_bwd(x, grad_output)
+
+
+def swish_me(x, inplace=False):
+ return SwishJitAutoFn.apply(x)
+
+
+class SwishMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(SwishMe, self).__init__()
+
+ def forward(self, x):
+ return SwishJitAutoFn.apply(x)
+
+
+@torch.jit.script
+def mish_jit_fwd(x):
+ return x.mul(torch.tanh(F.softplus(x)))
+
+
+@torch.jit.script
+def mish_jit_bwd(x, grad_output):
+ x_sigmoid = torch.sigmoid(x)
+ x_tanh_sp = F.softplus(x).tanh()
+ return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
+
+
+class MishJitAutoFn(torch.autograd.Function):
+ """ Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+ A memory efficient, jit scripted variant of Mish
+ """
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return mish_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return mish_jit_bwd(x, grad_output)
+
+
+def mish_me(x, inplace=False):
+ return MishJitAutoFn.apply(x)
+
+
+class MishMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(MishMe, self).__init__()
+
+ def forward(self, x):
+ return MishJitAutoFn.apply(x)
+
+
+@torch.jit.script
+def hard_sigmoid_jit_fwd(x, inplace: bool = False):
+ return (x + 3).clamp(min=0, max=6).div(6.)
+
+
+@torch.jit.script
+def hard_sigmoid_jit_bwd(x, grad_output):
+ m = torch.ones_like(x) * ((x >= -3.) & (x <= 3.)) / 6.
+ return grad_output * m
+
+
+class HardSigmoidJitAutoFn(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return hard_sigmoid_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return hard_sigmoid_jit_bwd(x, grad_output)
+
+
+def hard_sigmoid_me(x, inplace: bool = False):
+ return HardSigmoidJitAutoFn.apply(x)
+
+
+class HardSigmoidMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSigmoidMe, self).__init__()
+
+ def forward(self, x):
+ return HardSigmoidJitAutoFn.apply(x)
+
+
+@torch.jit.script
+def hard_swish_jit_fwd(x):
+ return x * (x + 3).clamp(min=0, max=6).div(6.)
+
+
+@torch.jit.script
+def hard_swish_jit_bwd(x, grad_output):
+ m = torch.ones_like(x) * (x >= 3.)
+ m = torch.where((x >= -3.) & (x <= 3.), x / 3. + .5, m)
+ return grad_output * m
+
+
+class HardSwishJitAutoFn(torch.autograd.Function):
+ """A memory efficient, jit-scripted HardSwish activation"""
+ @staticmethod
+ def forward(ctx, x):
+ ctx.save_for_backward(x)
+ return hard_swish_jit_fwd(x)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ x = ctx.saved_tensors[0]
+ return hard_swish_jit_bwd(x, grad_output)
+
+
+def hard_swish_me(x, inplace=False):
+ return HardSwishJitAutoFn.apply(x)
+
+
+class HardSwishMe(nn.Module):
+ def __init__(self, inplace: bool = False):
+ super(HardSwishMe, self).__init__()
+
+ def forward(self, x):
+ return HardSwishJitAutoFn.apply(x)
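To see why the hand-written backwards above can be trusted, here is a short sketch (editor's addition, not part of the diff; assumes the package is importable under its upstream name `geffnet`) that checks the memory-efficient Swish against numerical gradients:

import torch
from geffnet.activations.activations_me import SwishMe  # assumed import path (upstream package layout)

# gradcheck wants double precision and requires_grad on the input
x = torch.randn(4, 6, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(SwishMe(), (x,))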
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/config.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ad21bbcbbc28f1b490b930ec369ccbf87f122d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/config.py
@@ -0,0 +1,123 @@
+""" Global layer config state
+"""
+from typing import Any, Optional
+
+__all__ = [
+ 'is_exportable', 'is_scriptable', 'is_no_jit', 'layer_config_kwargs',
+ 'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config'
+]
+
+# Set to True if you prefer layers with no jit optimization (includes activations)
+_NO_JIT = False
+
+# Set to True if you prefer activation layers with no jit optimization
+# NOTE: not currently used; there is no difference between no_jit and no_activation_jit while
+# activations are the only layers obeying the jit flags. This will change as more layers are updated and/or added.
+_NO_ACTIVATION_JIT = False
+
+# Set to True if exporting a model with SAME padding via ONNX
+_EXPORTABLE = False
+
+# Set to True if you want to use torch.jit.script on a model
+_SCRIPTABLE = False
+
+
+def is_no_jit():
+ return _NO_JIT
+
+
+class set_no_jit:
+ def __init__(self, mode: bool) -> None:
+ global _NO_JIT
+ self.prev = _NO_JIT
+ _NO_JIT = mode
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _NO_JIT
+ _NO_JIT = self.prev
+ return False
+
+
+def is_exportable():
+ return _EXPORTABLE
+
+
+class set_exportable:
+ def __init__(self, mode: bool) -> None:
+ global _EXPORTABLE
+ self.prev = _EXPORTABLE
+ _EXPORTABLE = mode
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _EXPORTABLE
+ _EXPORTABLE = self.prev
+ return False
+
+
+def is_scriptable():
+ return _SCRIPTABLE
+
+
+class set_scriptable:
+ def __init__(self, mode: bool) -> None:
+ global _SCRIPTABLE
+ self.prev = _SCRIPTABLE
+ _SCRIPTABLE = mode
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _SCRIPTABLE
+ _SCRIPTABLE = self.prev
+ return False
+
+
+class set_layer_config:
+ """ Layer config context manager that allows setting all layer config flags at once.
+ If a flag arg is None, it will not change the current value.
+ """
+ def __init__(
+ self,
+ scriptable: Optional[bool] = None,
+ exportable: Optional[bool] = None,
+ no_jit: Optional[bool] = None,
+ no_activation_jit: Optional[bool] = None):
+ global _SCRIPTABLE
+ global _EXPORTABLE
+ global _NO_JIT
+ global _NO_ACTIVATION_JIT
+ self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT
+ if scriptable is not None:
+ _SCRIPTABLE = scriptable
+ if exportable is not None:
+ _EXPORTABLE = exportable
+ if no_jit is not None:
+ _NO_JIT = no_jit
+ if no_activation_jit is not None:
+ _NO_ACTIVATION_JIT = no_activation_jit
+
+ def __enter__(self) -> None:
+ pass
+
+ def __exit__(self, *args: Any) -> bool:
+ global _SCRIPTABLE
+ global _EXPORTABLE
+ global _NO_JIT
+ global _NO_ACTIVATION_JIT
+ _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev
+ return False
+
+
+def layer_config_kwargs(kwargs):
+ """ Consume config kwargs and return contextmgr obj """
+ return set_layer_config(
+ scriptable=kwargs.pop('scriptable', None),
+ exportable=kwargs.pop('exportable', None),
+ no_jit=kwargs.pop('no_jit', None))
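A minimal usage sketch (editor's addition, not part of the diff; assumes the package is importable under its upstream name `geffnet`) of the flag context managers defined above:

from geffnet.config import is_scriptable, layer_config_kwargs, set_layer_config

with set_layer_config(scriptable=True):
    assert is_scriptable()       # flag is active inside the block
assert not is_scriptable()       # and restored afterwards (assuming the default of False)

# Model entrypoints wrap construction in layer_config_kwargs(kwargs) so callers can pass
# scriptable / exportable / no_jit straight through the model's **kwargs.
kwargs = {'no_jit': True, 'num_classes': 1000}
with layer_config_kwargs(kwargs):
    assert 'no_jit' not in kwargs  # consumed by the context manager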
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/conv2d_layers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/conv2d_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..c49afbf0baa39a605cdf78d9e52d5f697b643370
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/conv2d_layers.py
@@ -0,0 +1,304 @@
+""" Conv2D w/ SAME padding, CondConv, MixedConv
+
+A collection of conv layers and padding helpers needed by EfficientNet, MixNet, and
+MobileNetV3 models that maintain weight compatibility with original Tensorflow models.
+
+Copyright 2020 Ross Wightman
+"""
+import collections.abc
+import math
+from functools import partial
+from itertools import repeat
+from typing import Tuple, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .config import *
+
+
+# From PyTorch internals
+def _ntuple(n):
+ def parse(x):
+ if isinstance(x, collections.abc.Iterable):
+ return x
+ return tuple(repeat(x, n))
+ return parse
+
+
+_single = _ntuple(1)
+_pair = _ntuple(2)
+_triple = _ntuple(3)
+_quadruple = _ntuple(4)
+
+
+def _is_static_pad(kernel_size, stride=1, dilation=1, **_):
+ return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0
+
+
+def _get_padding(kernel_size, stride=1, dilation=1, **_):
+ padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
+ return padding
+
+
+def _calc_same_pad(i: int, k: int, s: int, d: int):
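+    # -(i // -s) is ceil(i / s); this is the total 'SAME' padding needed so the output length equals ceil(i / s)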
+ return max((-(i // -s) - 1) * s + (k - 1) * d + 1 - i, 0)
+
+
+def _same_pad_arg(input_size, kernel_size, stride, dilation):
+ ih, iw = input_size
+ kh, kw = kernel_size
+ pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0])
+ pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1])
+ return [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]
+
+
+def _split_channels(num_chan, num_groups):
+ split = [num_chan // num_groups for _ in range(num_groups)]
+ split[0] += num_chan - sum(split)
+ return split
+
+
+def conv2d_same(
+ x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1),
+ padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1):
+ ih, iw = x.size()[-2:]
+ kh, kw = weight.size()[-2:]
+ pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0])
+ pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1])
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
+ return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups)
+
+
+class Conv2dSame(nn.Conv2d):
+ """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
+ """
+
+ # pylint: disable=unused-argument
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+ padding=0, dilation=1, groups=1, bias=True):
+ super(Conv2dSame, self).__init__(
+ in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
+
+ def forward(self, x):
+ return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+class Conv2dSameExport(nn.Conv2d):
+ """ ONNX export friendly Tensorflow like 'SAME' convolution wrapper for 2D convolutions
+
+ NOTE: This does not currently work with torch.jit.script
+ """
+
+ # pylint: disable=unused-argument
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+ padding=0, dilation=1, groups=1, bias=True):
+ super(Conv2dSameExport, self).__init__(
+ in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
+ self.pad = None
+ self.pad_input_size = (0, 0)
+
+ def forward(self, x):
+ input_size = x.size()[-2:]
+ if self.pad is None:
+ pad_arg = _same_pad_arg(input_size, self.weight.size()[-2:], self.stride, self.dilation)
+ self.pad = nn.ZeroPad2d(pad_arg)
+ self.pad_input_size = input_size
+
+ if self.pad is not None:
+ x = self.pad(x)
+ return F.conv2d(
+ x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+def get_padding_value(padding, kernel_size, **kwargs):
+ dynamic = False
+ if isinstance(padding, str):
+ # for any string padding, the padding will be calculated for you, one of three ways
+ padding = padding.lower()
+ if padding == 'same':
+ # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact
+ if _is_static_pad(kernel_size, **kwargs):
+ # static case, no extra overhead
+ padding = _get_padding(kernel_size, **kwargs)
+ else:
+ # dynamic padding
+ padding = 0
+ dynamic = True
+ elif padding == 'valid':
+ # 'VALID' padding, same as padding=0
+ padding = 0
+ else:
+ # Default to PyTorch style 'same'-ish symmetric padding
+ padding = _get_padding(kernel_size, **kwargs)
+ return padding, dynamic
+
+
+def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
+ padding = kwargs.pop('padding', '')
+ kwargs.setdefault('bias', False)
+ padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs)
+ if is_dynamic:
+ if is_exportable():
+ assert not is_scriptable()
+ return Conv2dSameExport(in_chs, out_chs, kernel_size, **kwargs)
+ else:
+ return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
+ else:
+ return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
+
+
+class MixedConv2d(nn.ModuleDict):
+ """ Mixed Grouped Convolution
+ Based on MDConv and GroupedConv in MixNet impl:
+ https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
+ """
+
+ def __init__(self, in_channels, out_channels, kernel_size=3,
+ stride=1, padding='', dilation=1, depthwise=False, **kwargs):
+ super(MixedConv2d, self).__init__()
+
+ kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size]
+ num_groups = len(kernel_size)
+ in_splits = _split_channels(in_channels, num_groups)
+ out_splits = _split_channels(out_channels, num_groups)
+ self.in_channels = sum(in_splits)
+ self.out_channels = sum(out_splits)
+ for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
+ conv_groups = out_ch if depthwise else 1
+ self.add_module(
+ str(idx),
+ create_conv2d_pad(
+ in_ch, out_ch, k, stride=stride,
+ padding=padding, dilation=dilation, groups=conv_groups, **kwargs)
+ )
+ self.splits = in_splits
+
+ def forward(self, x):
+ x_split = torch.split(x, self.splits, 1)
+ x_out = [conv(x_split[i]) for i, conv in enumerate(self.values())]
+ x = torch.cat(x_out, 1)
+ return x
+
+
+def get_condconv_initializer(initializer, num_experts, expert_shape):
+ def condconv_initializer(weight):
+ """CondConv initializer function."""
+ num_params = np.prod(expert_shape)
+ if (len(weight.shape) != 2 or weight.shape[0] != num_experts or
+ weight.shape[1] != num_params):
+            raise ValueError(
+                'CondConv variables must have shape [num_experts, num_params]')
+ for i in range(num_experts):
+ initializer(weight[i].view(expert_shape))
+ return condconv_initializer
+
+
+class CondConv2d(nn.Module):
+ """ Conditional Convolution
+ Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
+
+ Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
+ https://github.com/pytorch/pytorch/issues/17983
+ """
+ __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
+
+ def __init__(self, in_channels, out_channels, kernel_size=3,
+ stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
+ super(CondConv2d, self).__init__()
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = _pair(kernel_size)
+ self.stride = _pair(stride)
+ padding_val, is_padding_dynamic = get_padding_value(
+ padding, kernel_size, stride=stride, dilation=dilation)
+ self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript
+ self.padding = _pair(padding_val)
+ self.dilation = _pair(dilation)
+ self.groups = groups
+ self.num_experts = num_experts
+
+ self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
+ weight_num_param = 1
+ for wd in self.weight_shape:
+ weight_num_param *= wd
+ self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
+
+ if bias:
+ self.bias_shape = (self.out_channels,)
+ self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
+ else:
+ self.register_parameter('bias', None)
+
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ init_weight = get_condconv_initializer(
+ partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
+ init_weight(self.weight)
+ if self.bias is not None:
+ fan_in = np.prod(self.weight_shape[1:])
+ bound = 1 / math.sqrt(fan_in)
+ init_bias = get_condconv_initializer(
+ partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
+ init_bias(self.bias)
+
+ def forward(self, x, routing_weights):
+ B, C, H, W = x.shape
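+        # routing_weights: (B, num_experts) x self.weight: (num_experts, num_params) -> per-sample flattened kernels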
+ weight = torch.matmul(routing_weights, self.weight)
+ new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
+ weight = weight.view(new_weight_shape)
+ bias = None
+ if self.bias is not None:
+ bias = torch.matmul(routing_weights, self.bias)
+ bias = bias.view(B * self.out_channels)
+        # fold the batch dimension into channels so each batch element is convolved with its own kernel via grouped conv
+ x = x.view(1, B * C, H, W)
+ if self.dynamic_padding:
+ out = conv2d_same(
+ x, weight, bias, stride=self.stride, padding=self.padding,
+ dilation=self.dilation, groups=self.groups * B)
+ else:
+ out = F.conv2d(
+ x, weight, bias, stride=self.stride, padding=self.padding,
+ dilation=self.dilation, groups=self.groups * B)
+ out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
+
+ # Literal port (from TF definition)
+ # x = torch.split(x, 1, 0)
+ # weight = torch.split(weight, 1, 0)
+ # if self.bias is not None:
+ # bias = torch.matmul(routing_weights, self.bias)
+ # bias = torch.split(bias, 1, 0)
+ # else:
+ # bias = [None] * B
+ # out = []
+ # for xi, wi, bi in zip(x, weight, bias):
+ # wi = wi.view(*self.weight_shape)
+ # if bi is not None:
+ # bi = bi.view(*self.bias_shape)
+ # out.append(self.conv_fn(
+ # xi, wi, bi, stride=self.stride, padding=self.padding,
+ # dilation=self.dilation, groups=self.groups))
+ # out = torch.cat(out, 0)
+ return out
+
+
+def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
+ assert 'groups' not in kwargs # only use 'depthwise' bool arg
+ if isinstance(kernel_size, list):
+ assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently
+        # Only lists are used to define MixedConv2d kernel groups; ints, tuples, and other
+        # iterables continue to be passed to a normal conv and specify h, w.
+ m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+ else:
+ depthwise = kwargs.pop('depthwise', False)
+ groups = out_chs if depthwise else 1
+ if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
+ m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+ else:
+ m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+ return m
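To make the dispatch above concrete, a short sketch (editor's addition, not part of the diff; assumes the package is importable under its upstream name `geffnet`) of what `select_conv2d` returns for a few argument combinations:

import torch
from geffnet.conv2d_layers import select_conv2d

x = torch.randn(1, 32, 15, 15)   # odd spatial size to exercise dynamic 'same' padding

# stride-2 conv with TF-style 'same' padding -> Conv2dSame (dynamic padding), 15x15 -> 8x8
conv = select_conv2d(32, 64, 3, stride=2, padding='same')
print(type(conv).__name__, tuple(conv(x).shape))    # Conv2dSame (1, 64, 8, 8)

# a list of kernel sizes selects MixedConv2d, splitting channels across the kernel sizes
mixed = select_conv2d(32, 64, [3, 5, 7], stride=1, padding='same')
print(type(mixed).__name__, tuple(mixed(x).shape))  # MixedConv2d (1, 64, 15, 15)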
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/efficientnet_builder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/efficientnet_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..56df5ae5a8038ad8cc4965d2e50c7bfc48b2f292
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/efficientnet_builder.py
@@ -0,0 +1,683 @@
+""" EfficientNet / MobileNetV3 Blocks and Builder
+
+Copyright 2020 Ross Wightman
+"""
+import re
+from copy import deepcopy
+
+from .conv2d_layers import *
+from geffnet.activations import *
+
+__all__ = ['get_bn_args_tf', 'resolve_bn_args', 'resolve_se_args', 'resolve_act_layer', 'make_divisible',
+ 'round_channels', 'drop_connect', 'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv',
+ 'InvertedResidual', 'CondConvResidual', 'EdgeResidual', 'EfficientNetBuilder', 'decode_arch_def',
+ 'initialize_weight_default', 'initialize_weight_goog', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT'
+]
+
+# Defaults used for Google/Tensorflow training of mobile networks w/ RMSprop as per
+# papers and TF reference implementations. The PyTorch momentum equivalent of TF decay is (1 - TF decay).
+# NOTE: momentum varies between .99 and .9997 depending on source
+#   .99 in official TF TPU impl
+#   .9997 (w/ .999 in search space) for paper
+#
+# PyTorch defaults are momentum = .1, eps = 1e-5
+#
+BN_MOMENTUM_TF_DEFAULT = 1 - 0.99
+BN_EPS_TF_DEFAULT = 1e-3
+_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT)
+
+
+def get_bn_args_tf():
+ return _BN_ARGS_TF.copy()
+
+
+def resolve_bn_args(kwargs):
+ bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {}
+ bn_momentum = kwargs.pop('bn_momentum', None)
+ if bn_momentum is not None:
+ bn_args['momentum'] = bn_momentum
+ bn_eps = kwargs.pop('bn_eps', None)
+ if bn_eps is not None:
+ bn_args['eps'] = bn_eps
+ return bn_args
+
+
+_SE_ARGS_DEFAULT = dict(
+ gate_fn=sigmoid,
+ act_layer=None, # None == use containing block's activation layer
+ reduce_mid=False,
+ divisor=1)
+
+
+def resolve_se_args(kwargs, in_chs, act_layer=None):
+ se_kwargs = kwargs.copy() if kwargs is not None else {}
+ # fill in args that aren't specified with the defaults
+ for k, v in _SE_ARGS_DEFAULT.items():
+ se_kwargs.setdefault(k, v)
+    # some models, like MobileNetV3, calculate SE reduction chs from the containing block's mid_ch instead of in_ch
+ if not se_kwargs.pop('reduce_mid'):
+ se_kwargs['reduced_base_chs'] = in_chs
+ # act_layer override, if it remains None, the containing block's act_layer will be used
+ if se_kwargs['act_layer'] is None:
+ assert act_layer is not None
+ se_kwargs['act_layer'] = act_layer
+ return se_kwargs
+
+
+def resolve_act_layer(kwargs, default='relu'):
+ act_layer = kwargs.pop('act_layer', default)
+ if isinstance(act_layer, str):
+ act_layer = get_act_layer(act_layer)
+ return act_layer
+
+
+def make_divisible(v: int, divisor: int = 8, min_value: int = None):
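+    # rounds v to the nearest multiple of divisor, e.g. make_divisible(67) == 64, make_divisible(4) == 8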
+ min_value = min_value or divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ if new_v < 0.9 * v: # ensure round down does not go down by more than 10%.
+ new_v += divisor
+ return new_v
+
+
+def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None):
+ """Round number of filters based on depth multiplier."""
+ if not multiplier:
+ return channels
+ channels *= multiplier
+ return make_divisible(channels, divisor, channel_min)
+
+
+def drop_connect(inputs, training: bool = False, drop_connect_rate: float = 0.):
+ """Apply drop connect."""
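+    # a.k.a. stochastic depth: each sample keeps the block output with probability keep_prob and is
+    # scaled by 1 / keep_prob so the expected value is unchanged; a per-sample 0/1 mask is drawn below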
+ if not training:
+ return inputs
+
+ keep_prob = 1 - drop_connect_rate
+ random_tensor = keep_prob + torch.rand(
+ (inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device)
+ random_tensor.floor_() # binarize
+ output = inputs.div(keep_prob) * random_tensor
+ return output
+
+
+class SqueezeExcite(nn.Module):
+
+ def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1):
+ super(SqueezeExcite, self).__init__()
+ reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor)
+ self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
+ self.act1 = act_layer(inplace=True)
+ self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
+ self.gate_fn = gate_fn
+
+ def forward(self, x):
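+        # squeeze: global average pool to 1x1, then excite: a per-channel gate (sigmoid by default) applied to x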
+ x_se = x.mean((2, 3), keepdim=True)
+ x_se = self.conv_reduce(x_se)
+ x_se = self.act1(x_se)
+ x_se = self.conv_expand(x_se)
+ x = x * self.gate_fn(x_se)
+ return x
+
+
+class ConvBnAct(nn.Module):
+ def __init__(self, in_chs, out_chs, kernel_size,
+ stride=1, pad_type='', act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
+ super(ConvBnAct, self).__init__()
+ assert stride in [1, 2]
+ norm_kwargs = norm_kwargs or {}
+ self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, padding=pad_type)
+ self.bn1 = norm_layer(out_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ def forward(self, x):
+ x = self.conv(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+ return x
+
+
+class DepthwiseSeparableConv(nn.Module):
+ """ DepthwiseSeparable block
+ Used for DS convs in MobileNet-V1 and in the place of IR blocks with an expansion
+    factor of 1.0. This is an alternative to having an IR with an optional first pw conv.
+ """
+ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+ pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None,
+ norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+ super(DepthwiseSeparableConv, self).__init__()
+ assert stride in [1, 2]
+ norm_kwargs = norm_kwargs or {}
+ self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
+ self.drop_connect_rate = drop_connect_rate
+
+ self.conv_dw = select_conv2d(
+ in_chs, in_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True)
+ self.bn1 = norm_layer(in_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ # Squeeze-and-excitation
+ if se_ratio is not None and se_ratio > 0.:
+ se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+ self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs)
+ else:
+ self.se = nn.Identity()
+
+ self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
+ self.bn2 = norm_layer(out_chs, **norm_kwargs)
+ self.act2 = act_layer(inplace=True) if pw_act else nn.Identity()
+
+ def forward(self, x):
+ residual = x
+
+ x = self.conv_dw(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ x = self.se(x)
+
+ x = self.conv_pw(x)
+ x = self.bn2(x)
+ x = self.act2(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+ return x
+
+
+class InvertedResidual(nn.Module):
+ """ Inverted residual block w/ optional SE"""
+
+ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+ exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
+ se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+ conv_kwargs=None, drop_connect_rate=0.):
+ super(InvertedResidual, self).__init__()
+ norm_kwargs = norm_kwargs or {}
+ conv_kwargs = conv_kwargs or {}
+ mid_chs: int = make_divisible(in_chs * exp_ratio)
+ self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
+ self.drop_connect_rate = drop_connect_rate
+
+ # Point-wise expansion
+ self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
+ self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ # Depth-wise convolution
+ self.conv_dw = select_conv2d(
+ mid_chs, mid_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True, **conv_kwargs)
+ self.bn2 = norm_layer(mid_chs, **norm_kwargs)
+ self.act2 = act_layer(inplace=True)
+
+ # Squeeze-and-excitation
+ if se_ratio is not None and se_ratio > 0.:
+ se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+ self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+ else:
+ self.se = nn.Identity() # for jit.script compat
+
+ # Point-wise linear projection
+ self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
+ self.bn3 = norm_layer(out_chs, **norm_kwargs)
+
+ def forward(self, x):
+ residual = x
+
+ # Point-wise expansion
+ x = self.conv_pw(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ # Depth-wise convolution
+ x = self.conv_dw(x)
+ x = self.bn2(x)
+ x = self.act2(x)
+
+ # Squeeze-and-excitation
+ x = self.se(x)
+
+ # Point-wise linear projection
+ x = self.conv_pwl(x)
+ x = self.bn3(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+ return x
+
+
+class CondConvResidual(InvertedResidual):
+ """ Inverted residual block w/ CondConv routing"""
+
+ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+ exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
+ se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+ num_experts=0, drop_connect_rate=0.):
+
+ self.num_experts = num_experts
+ conv_kwargs = dict(num_experts=self.num_experts)
+
+ super(CondConvResidual, self).__init__(
+ in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, pad_type=pad_type,
+ act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
+ pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs,
+ norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs,
+ drop_connect_rate=drop_connect_rate)
+
+ self.routing_fn = nn.Linear(in_chs, self.num_experts)
+
+ def forward(self, x):
+ residual = x
+
+ # CondConv routing
+ pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)
+ routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs))
+
+ # Point-wise expansion
+ x = self.conv_pw(x, routing_weights)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ # Depth-wise convolution
+ x = self.conv_dw(x, routing_weights)
+ x = self.bn2(x)
+ x = self.act2(x)
+
+ # Squeeze-and-excitation
+ x = self.se(x)
+
+ # Point-wise linear projection
+ x = self.conv_pwl(x, routing_weights)
+ x = self.bn3(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+ return x
+
+
+class EdgeResidual(nn.Module):
+ """ EdgeTPU Residual block with expansion convolution followed by pointwise-linear w/ stride"""
+
+ def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
+ stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
+ se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+ super(EdgeResidual, self).__init__()
+ norm_kwargs = norm_kwargs or {}
+ mid_chs = make_divisible(fake_in_chs * exp_ratio) if fake_in_chs > 0 else make_divisible(in_chs * exp_ratio)
+ self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
+ self.drop_connect_rate = drop_connect_rate
+
+ # Expansion convolution
+ self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
+ self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+
+ # Squeeze-and-excitation
+ if se_ratio is not None and se_ratio > 0.:
+ se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+ self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+ else:
+ self.se = nn.Identity()
+
+ # Point-wise linear projection
+ self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, stride=stride, padding=pad_type)
+ self.bn2 = nn.BatchNorm2d(out_chs, **norm_kwargs)
+
+ def forward(self, x):
+ residual = x
+
+ # Expansion convolution
+ x = self.conv_exp(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+
+ # Squeeze-and-excitation
+ x = self.se(x)
+
+ # Point-wise linear projection
+ x = self.conv_pwl(x)
+ x = self.bn2(x)
+
+ if self.has_residual:
+ if self.drop_connect_rate > 0.:
+ x = drop_connect(x, self.training, self.drop_connect_rate)
+ x += residual
+
+ return x
+
+
+class EfficientNetBuilder:
+ """ Build Trunk Blocks for Efficient/Mobile Networks
+
+ This ended up being somewhat of a cross between
+ https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py
+ and
+ https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py
+
+ """
+
+ def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
+ pad_type='', act_layer=None, se_kwargs=None,
+ norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+ self.channel_multiplier = channel_multiplier
+ self.channel_divisor = channel_divisor
+ self.channel_min = channel_min
+ self.pad_type = pad_type
+ self.act_layer = act_layer
+ self.se_kwargs = se_kwargs
+ self.norm_layer = norm_layer
+ self.norm_kwargs = norm_kwargs
+ self.drop_connect_rate = drop_connect_rate
+
+ # updated during build
+ self.in_chs = None
+ self.block_idx = 0
+ self.block_count = 0
+
+ def _round_channels(self, chs):
+ return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min)
+
+ def _make_block(self, ba):
+ bt = ba.pop('block_type')
+ ba['in_chs'] = self.in_chs
+ ba['out_chs'] = self._round_channels(ba['out_chs'])
+ if 'fake_in_chs' in ba and ba['fake_in_chs']:
+            # FIXME this is a hack to work around a mismatch in the original impl's input filters for EdgeTPU
+ ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
+ ba['norm_layer'] = self.norm_layer
+ ba['norm_kwargs'] = self.norm_kwargs
+ ba['pad_type'] = self.pad_type
+ # block act fn overrides the model default
+ ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
+ assert ba['act_layer'] is not None
+ if bt == 'ir':
+ ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+ ba['se_kwargs'] = self.se_kwargs
+ if ba.get('num_experts', 0) > 0:
+ block = CondConvResidual(**ba)
+ else:
+ block = InvertedResidual(**ba)
+ elif bt == 'ds' or bt == 'dsa':
+ ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+ ba['se_kwargs'] = self.se_kwargs
+ block = DepthwiseSeparableConv(**ba)
+ elif bt == 'er':
+ ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+ ba['se_kwargs'] = self.se_kwargs
+ block = EdgeResidual(**ba)
+ elif bt == 'cn':
+ block = ConvBnAct(**ba)
+ else:
+            assert False, 'Unknown block type (%s) while building model.' % bt
+ self.in_chs = ba['out_chs'] # update in_chs for arg of next block
+ return block
+
+ def _make_stack(self, stack_args):
+ blocks = []
+ # each stack (stage) contains a list of block arguments
+ for i, ba in enumerate(stack_args):
+ if i >= 1:
+ # only the first block in any stack can have a stride > 1
+ ba['stride'] = 1
+ block = self._make_block(ba)
+ blocks.append(block)
+ self.block_idx += 1 # incr global idx (across all stacks)
+ return nn.Sequential(*blocks)
+
+ def __call__(self, in_chs, block_args):
+ """ Build the blocks
+ Args:
+ in_chs: Number of input-channels passed to first block
+ block_args: A list of lists, outer list defines stages, inner
+ list contains strings defining block configuration(s)
+ Return:
+ List of block stacks (each stack wrapped in nn.Sequential)
+ """
+ self.in_chs = in_chs
+ self.block_count = sum([len(x) for x in block_args])
+ self.block_idx = 0
+ blocks = []
+ # outer list of block_args defines the stacks ('stages' by some conventions)
+ for stack_idx, stack in enumerate(block_args):
+ assert isinstance(stack, list)
+ stack = self._make_stack(stack)
+ blocks.append(stack)
+ return blocks
+
+
+def _parse_ksize(ss):
+ if ss.isdigit():
+ return int(ss)
+ else:
+ return [int(k) for k in ss.split('.')]
+
+
+def _decode_block_str(block_str):
+ """ Decode block definition string
+
+ Gets a list of block arg (dicts) through a string notation of arguments.
+ E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip
+
+ All args can exist in any order with the exception of the leading string which
+ is assumed to indicate the block type.
+
+ leading string - block type (
+      ir = InvertedResidual, ds = DepthwiseSep, dsa = DepthwiseSep with pw act, cn = ConvBnAct)
+ r - number of repeat blocks,
+ k - kernel size,
+ s - strides (1-9),
+ e - expansion ratio,
+ c - output channels,
+ se - squeeze/excitation ratio
+ n - activation fn ('re', 'r6', 'hs', or 'sw')
+ Args:
+ block_str: a string representation of block arguments.
+ Returns:
+      A tuple of (block args dict, number of repeats)
+ Raises:
+      ValueError: if the string def is not properly specified (TODO)
+ """
+ assert isinstance(block_str, str)
+ ops = block_str.split('_')
+ block_type = ops[0] # take the block type off the front
+ ops = ops[1:]
+ options = {}
+ noskip = False
+ for op in ops:
+        # string options are checked on an individual basis; combine them if the set grows
+ if op == 'noskip':
+ noskip = True
+ elif op.startswith('n'):
+ # activation fn
+ key = op[0]
+ v = op[1:]
+ if v == 're':
+ value = get_act_layer('relu')
+ elif v == 'r6':
+ value = get_act_layer('relu6')
+ elif v == 'hs':
+ value = get_act_layer('hard_swish')
+ elif v == 'sw':
+ value = get_act_layer('swish')
+ else:
+ continue
+ options[key] = value
+ else:
+ # all numeric options
+ splits = re.split(r'(\d.*)', op)
+ if len(splits) >= 2:
+ key, value = splits[:2]
+ options[key] = value
+
+ # if act_layer is None, the model default (passed to model init) will be used
+ act_layer = options['n'] if 'n' in options else None
+ exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
+ pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
+ fake_in_chs = int(options['fc']) if 'fc' in options else 0 # FIXME hack to deal with in_chs issue in TPU def
+
+ num_repeat = int(options['r'])
+ # each type of block has different valid arguments, fill accordingly
+ if block_type == 'ir':
+ block_args = dict(
+ block_type=block_type,
+ dw_kernel_size=_parse_ksize(options['k']),
+ exp_kernel_size=exp_kernel_size,
+ pw_kernel_size=pw_kernel_size,
+ out_chs=int(options['c']),
+ exp_ratio=float(options['e']),
+ se_ratio=float(options['se']) if 'se' in options else None,
+ stride=int(options['s']),
+ act_layer=act_layer,
+ noskip=noskip,
+ )
+ if 'cc' in options:
+ block_args['num_experts'] = int(options['cc'])
+ elif block_type == 'ds' or block_type == 'dsa':
+ block_args = dict(
+ block_type=block_type,
+ dw_kernel_size=_parse_ksize(options['k']),
+ pw_kernel_size=pw_kernel_size,
+ out_chs=int(options['c']),
+ se_ratio=float(options['se']) if 'se' in options else None,
+ stride=int(options['s']),
+ act_layer=act_layer,
+ pw_act=block_type == 'dsa',
+ noskip=block_type == 'dsa' or noskip,
+ )
+ elif block_type == 'er':
+ block_args = dict(
+ block_type=block_type,
+ exp_kernel_size=_parse_ksize(options['k']),
+ pw_kernel_size=pw_kernel_size,
+ out_chs=int(options['c']),
+ exp_ratio=float(options['e']),
+ fake_in_chs=fake_in_chs,
+ se_ratio=float(options['se']) if 'se' in options else None,
+ stride=int(options['s']),
+ act_layer=act_layer,
+ noskip=noskip,
+ )
+ elif block_type == 'cn':
+ block_args = dict(
+ block_type=block_type,
+ kernel_size=int(options['k']),
+ out_chs=int(options['c']),
+ stride=int(options['s']),
+ act_layer=act_layer,
+ )
+ else:
+ assert False, 'Unknown block type (%s)' % block_type
+
+ return block_args, num_repeat
+
+
+def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'):
+ """ Per-stage depth scaling
+ Scales the block repeats in each stage. This depth scaling impl maintains
+ compatibility with the EfficientNet scaling method, while allowing sensible
+ scaling for other models that may have multiple block arg definitions in each stage.
+ """
+
+ # We scale the total repeat count for each stage, there may be multiple
+ # block arg defs per stage so we need to sum.
+ num_repeat = sum(repeats)
+ if depth_trunc == 'round':
+ # Truncating to int by rounding allows stages with few repeats to remain
+ # proportionally smaller for longer. This is a good choice when stage definitions
+ # include single repeat stages that we'd prefer to keep that way as long as possible
+ num_repeat_scaled = max(1, round(num_repeat * depth_multiplier))
+ else:
+ # The default for EfficientNet truncates repeats to int via 'ceil'.
+ # Any multiplier > 1.0 will result in an increased depth for every stage.
+ num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier))
+
+ # Proportionally distribute repeat count scaling to each block definition in the stage.
+ # Allocation is done in reverse as it results in the first block being less likely to be scaled.
+ # The first block makes less sense to repeat in most of the arch definitions.
+ repeats_scaled = []
+ for r in repeats[::-1]:
+ rs = max(1, round((r / num_repeat * num_repeat_scaled)))
+ repeats_scaled.append(rs)
+ num_repeat -= r
+ num_repeat_scaled -= rs
+ repeats_scaled = repeats_scaled[::-1]
+
+ # Apply the calculated scaling to each block arg in the stage
+ sa_scaled = []
+ for ba, rep in zip(stack_args, repeats_scaled):
+ sa_scaled.extend([deepcopy(ba) for _ in range(rep)])
+ return sa_scaled
+
+
+def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1, fix_first_last=False):
+ arch_args = []
+ for stack_idx, block_strings in enumerate(arch_def):
+ assert isinstance(block_strings, list)
+ stack_args = []
+ repeats = []
+ for block_str in block_strings:
+ assert isinstance(block_str, str)
+ ba, rep = _decode_block_str(block_str)
+ if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
+ ba['num_experts'] *= experts_multiplier
+ stack_args.append(ba)
+ repeats.append(rep)
+ if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1):
+ arch_args.append(_scale_stage_depth(stack_args, repeats, 1.0, depth_trunc))
+ else:
+ arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc))
+ return arch_args
+
+
+def initialize_weight_goog(m, n='', fix_group_fanout=True):
+ # weight init as per Tensorflow Official impl
+ # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
+ if isinstance(m, CondConv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ if fix_group_fanout:
+ fan_out //= m.groups
+ init_weight_fn = get_condconv_initializer(
+ lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape)
+ init_weight_fn(m.weight)
+ if m.bias is not None:
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ if fix_group_fanout:
+ fan_out //= m.groups
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if m.bias is not None:
+ m.bias.data.zero_()
+ elif isinstance(m, nn.BatchNorm2d):
+ m.weight.data.fill_(1.0)
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Linear):
+ fan_out = m.weight.size(0) # fan-out
+ fan_in = 0
+ if 'routing_fn' in n:
+ fan_in = m.weight.size(1)
+ init_range = 1.0 / math.sqrt(fan_in + fan_out)
+ m.weight.data.uniform_(-init_range, init_range)
+ m.bias.data.zero_()
+
+
+def initialize_weight_default(m, n=''):
+ if isinstance(m, CondConv2d):
+ init_fn = get_condconv_initializer(partial(
+ nn.init.kaiming_normal_, mode='fan_out', nonlinearity='relu'), m.num_experts, m.weight_shape)
+ init_fn(m.weight)
+ elif isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+ elif isinstance(m, nn.BatchNorm2d):
+ m.weight.data.fill_(1.0)
+ m.bias.data.zero_()
+ elif isinstance(m, nn.Linear):
+ nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear')
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/gen_efficientnet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/gen_efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0c7c810e3064fc06d61fdd710f30058c216467f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/gen_efficientnet.py
@@ -0,0 +1,1450 @@
+""" Generic Efficient Networks
+
+A generic MobileNet class with building blocks to support a variety of models:
+
+* EfficientNet (B0-B8, L2 + Tensorflow pretrained AutoAug/RandAug/AdvProp/NoisyStudent ports)
+ - EfficientNet: Rethinking Model Scaling for CNNs - https://arxiv.org/abs/1905.11946
+ - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971
+ - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665
+ - Self-training with Noisy Student improves ImageNet classification - https://arxiv.org/abs/1911.04252
+
+* EfficientNet-Lite
+
+* MixNet (Small, Medium, and Large)
+ - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595
+
+* MNasNet B1, A1 (SE), Small
+ - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626
+
+* FBNet-C
+ - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443
+
+* Single-Path NAS Pixel1
+ - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877
+
+* And likely more...
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .config import layer_config_kwargs, is_scriptable
+from .conv2d_layers import select_conv2d
+from .helpers import load_pretrained
+from .efficientnet_builder import *
+
+__all__ = ['GenEfficientNet', 'mnasnet_050', 'mnasnet_075', 'mnasnet_100', 'mnasnet_b1', 'mnasnet_140',
+ 'semnasnet_050', 'semnasnet_075', 'semnasnet_100', 'mnasnet_a1', 'semnasnet_140', 'mnasnet_small',
+ 'mobilenetv2_100', 'mobilenetv2_140', 'mobilenetv2_110d', 'mobilenetv2_120d',
+ 'fbnetc_100', 'spnasnet_100', 'efficientnet_b0', 'efficientnet_b1', 'efficientnet_b2', 'efficientnet_b3',
+ 'efficientnet_b4', 'efficientnet_b5', 'efficientnet_b6', 'efficientnet_b7', 'efficientnet_b8',
+ 'efficientnet_l2', 'efficientnet_es', 'efficientnet_em', 'efficientnet_el',
+ 'efficientnet_cc_b0_4e', 'efficientnet_cc_b0_8e', 'efficientnet_cc_b1_8e',
+ 'efficientnet_lite0', 'efficientnet_lite1', 'efficientnet_lite2', 'efficientnet_lite3', 'efficientnet_lite4',
+ 'tf_efficientnet_b0', 'tf_efficientnet_b1', 'tf_efficientnet_b2', 'tf_efficientnet_b3',
+ 'tf_efficientnet_b4', 'tf_efficientnet_b5', 'tf_efficientnet_b6', 'tf_efficientnet_b7', 'tf_efficientnet_b8',
+ 'tf_efficientnet_b0_ap', 'tf_efficientnet_b1_ap', 'tf_efficientnet_b2_ap', 'tf_efficientnet_b3_ap',
+ 'tf_efficientnet_b4_ap', 'tf_efficientnet_b5_ap', 'tf_efficientnet_b6_ap', 'tf_efficientnet_b7_ap',
+ 'tf_efficientnet_b8_ap', 'tf_efficientnet_b0_ns', 'tf_efficientnet_b1_ns', 'tf_efficientnet_b2_ns',
+ 'tf_efficientnet_b3_ns', 'tf_efficientnet_b4_ns', 'tf_efficientnet_b5_ns', 'tf_efficientnet_b6_ns',
+ 'tf_efficientnet_b7_ns', 'tf_efficientnet_l2_ns', 'tf_efficientnet_l2_ns_475',
+ 'tf_efficientnet_es', 'tf_efficientnet_em', 'tf_efficientnet_el',
+ 'tf_efficientnet_cc_b0_4e', 'tf_efficientnet_cc_b0_8e', 'tf_efficientnet_cc_b1_8e',
+ 'tf_efficientnet_lite0', 'tf_efficientnet_lite1', 'tf_efficientnet_lite2', 'tf_efficientnet_lite3',
+ 'tf_efficientnet_lite4',
+ 'mixnet_s', 'mixnet_m', 'mixnet_l', 'mixnet_xl', 'tf_mixnet_s', 'tf_mixnet_m', 'tf_mixnet_l']
+
+
+model_urls = {
+ 'mnasnet_050': None,
+ 'mnasnet_075': None,
+ 'mnasnet_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_b1-74cb7081.pth',
+ 'mnasnet_140': None,
+ 'mnasnet_small': None,
+
+ 'semnasnet_050': None,
+ 'semnasnet_075': None,
+ 'semnasnet_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth',
+ 'semnasnet_140': None,
+
+ 'mobilenetv2_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth',
+ 'mobilenetv2_110d':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_110d_ra-77090ade.pth',
+ 'mobilenetv2_120d':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_120d_ra-5987e2ed.pth',
+ 'mobilenetv2_140':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_140_ra-21a4e913.pth',
+
+ 'fbnetc_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth',
+ 'spnasnet_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth',
+
+ 'efficientnet_b0':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth',
+ 'efficientnet_b1':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth',
+ 'efficientnet_b2':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth',
+ 'efficientnet_b3':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth',
+ 'efficientnet_b4': None,
+ 'efficientnet_b5': None,
+ 'efficientnet_b6': None,
+ 'efficientnet_b7': None,
+ 'efficientnet_b8': None,
+ 'efficientnet_l2': None,
+
+ 'efficientnet_es':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth',
+ 'efficientnet_em': None,
+ 'efficientnet_el': None,
+
+ 'efficientnet_cc_b0_4e': None,
+ 'efficientnet_cc_b0_8e': None,
+ 'efficientnet_cc_b1_8e': None,
+
+ 'efficientnet_lite0': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_lite0_ra-37913777.pth',
+ 'efficientnet_lite1': None,
+ 'efficientnet_lite2': None,
+ 'efficientnet_lite3': None,
+ 'efficientnet_lite4': None,
+
+ 'tf_efficientnet_b0':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth',
+ 'tf_efficientnet_b1':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_aa-ea7a6ee0.pth',
+ 'tf_efficientnet_b2':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_aa-60c94f97.pth',
+ 'tf_efficientnet_b3':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth',
+ 'tf_efficientnet_b4':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth',
+ 'tf_efficientnet_b5':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth',
+ 'tf_efficientnet_b6':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth',
+ 'tf_efficientnet_b7':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth',
+ 'tf_efficientnet_b8':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ra-572d5dd9.pth',
+
+ 'tf_efficientnet_b0_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth',
+ 'tf_efficientnet_b1_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth',
+ 'tf_efficientnet_b2_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth',
+ 'tf_efficientnet_b3_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth',
+ 'tf_efficientnet_b4_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth',
+ 'tf_efficientnet_b5_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth',
+ 'tf_efficientnet_b6_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth',
+ 'tf_efficientnet_b7_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth',
+ 'tf_efficientnet_b8_ap':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth',
+
+ 'tf_efficientnet_b0_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth',
+ 'tf_efficientnet_b1_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth',
+ 'tf_efficientnet_b2_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ns-00306e48.pth',
+ 'tf_efficientnet_b3_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth',
+ 'tf_efficientnet_b4_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth',
+ 'tf_efficientnet_b5_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ns-6f26d0cf.pth',
+ 'tf_efficientnet_b6_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ns-51548356.pth',
+ 'tf_efficientnet_b7_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth',
+ 'tf_efficientnet_l2_ns_475':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns_475-bebbd00a.pth',
+ 'tf_efficientnet_l2_ns':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth',
+
+ 'tf_efficientnet_es':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth',
+ 'tf_efficientnet_em':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_em-e78cfe58.pth',
+ 'tf_efficientnet_el':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth',
+
+ 'tf_efficientnet_cc_b0_4e':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth',
+ 'tf_efficientnet_cc_b0_8e':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth',
+ 'tf_efficientnet_cc_b1_8e':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth',
+
+ 'tf_efficientnet_lite0':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite0-0aa007d2.pth',
+ 'tf_efficientnet_lite1':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite1-bde8b488.pth',
+ 'tf_efficientnet_lite2':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite2-dcccb7df.pth',
+ 'tf_efficientnet_lite3':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth',
+ 'tf_efficientnet_lite4':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite4-741542c3.pth',
+
+ 'mixnet_s': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth',
+ 'mixnet_m': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_m-4647fc68.pth',
+ 'mixnet_l': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth',
+ 'mixnet_xl': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth',
+
+ 'tf_mixnet_s':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth',
+ 'tf_mixnet_m':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_m-0f4d8805.pth',
+ 'tf_mixnet_l':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth',
+}
+
+
+class GenEfficientNet(nn.Module):
+ """ Generic EfficientNets
+
+ An implementation of mobile optimized networks that covers:
+ * EfficientNet (B0-B8, L2, CondConv, EdgeTPU)
+ * MixNet (Small, Medium, and Large, XL)
+ * MNASNet A1, B1, and small
+ * FBNet C
+ * Single-Path NAS Pixel1
+ """
+
+ def __init__(self, block_args, num_classes=1000, in_chans=3, num_features=1280, stem_size=32, fix_stem=False,
+ channel_multiplier=1.0, channel_divisor=8, channel_min=None,
+ pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+ se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+ weight_init='goog'):
+ super(GenEfficientNet, self).__init__()
+        self.drop_rate = drop_rate
+        norm_kwargs = norm_kwargs or {}  # guard the None default, matching the block classes in the builder
+
+ if not fix_stem:
+ stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
+ self.conv_stem = select_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+ self.bn1 = norm_layer(stem_size, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+ in_chs = stem_size
+
+ builder = EfficientNetBuilder(
+ channel_multiplier, channel_divisor, channel_min,
+ pad_type, act_layer, se_kwargs, norm_layer, norm_kwargs, drop_connect_rate)
+ self.blocks = nn.Sequential(*builder(in_chs, block_args))
+ in_chs = builder.in_chs
+
+ self.conv_head = select_conv2d(in_chs, num_features, 1, padding=pad_type)
+ self.bn2 = norm_layer(num_features, **norm_kwargs)
+ self.act2 = act_layer(inplace=True)
+ self.global_pool = nn.AdaptiveAvgPool2d(1)
+ self.classifier = nn.Linear(num_features, num_classes)
+
+ for n, m in self.named_modules():
+ if weight_init == 'goog':
+ initialize_weight_goog(m, n)
+ else:
+ initialize_weight_default(m, n)
+
+ def features(self, x):
+ x = self.conv_stem(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+ x = self.blocks(x)
+ x = self.conv_head(x)
+ x = self.bn2(x)
+ x = self.act2(x)
+ return x
+
+ def as_sequential(self):
+ layers = [self.conv_stem, self.bn1, self.act1]
+ layers.extend(self.blocks)
+ layers.extend([
+ self.conv_head, self.bn2, self.act2,
+ self.global_pool, nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ x = self.features(x)
+ x = self.global_pool(x)
+ x = x.flatten(1)
+ if self.drop_rate > 0.:
+ x = F.dropout(x, p=self.drop_rate, training=self.training)
+ return self.classifier(x)
+
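+# A minimal usage sketch (illustrative only; assumes the public entrypoints defined
+# further below, e.g. `efficientnet_b0`):
+#
+#   import torch
+#   model = efficientnet_b0(pretrained=False, num_classes=10)   # a GenEfficientNet instance
+#   logits = model(torch.randn(2, 3, 224, 224))                 # -> torch.Size([2, 10])
+#   feat_map = model.features(torch.randn(2, 3, 224, 224))      # pre-pooling feature map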
+
+def _create_model(model_kwargs, variant, pretrained=False):
+ as_sequential = model_kwargs.pop('as_sequential', False)
+ model = GenEfficientNet(**model_kwargs)
+ if pretrained:
+ load_pretrained(model, model_urls[variant])
+ if as_sequential:
+ model = model.as_sequential()
+ return model
+
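+# The `arch_def` lists in the generator functions below are compact block-definition
+# strings decoded by `decode_arch_def` (from efficientnet_builder). Roughly, an entry
+# such as 'ir_r2_k3_s2_e6_c24_se0.25' reads as:
+#   ir     -> inverted residual (MBConv) block ('ds' = depthwise-separable,
+#             'er' = edge residual, 'cn' = conv-bn-act)
+#   r2     -> repeat the block twice in this stage (scaled by any depth_multiplier)
+#   k3     -> 3x3 depthwise kernel (dotted lists like k3.5.7 request MixNet-style
+#             mixed kernel sizes)
+#   s2     -> stride 2 for the first block of the stage
+#   e6     -> channel expansion ratio 6
+#   c24    -> 24 output channels
+#   se0.25 -> squeeze-and-excite with reduction ratio 0.25
+# Suffixes such as 'noskip' (disable the residual), 'nre'/'nsw' (force relu/swish for
+# the block) and 'cc4' (4 CondConv experts) override the per-model defaults.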
+
+def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a mnasnet-a1 model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+ Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16_noskip'],
+ # stage 1, 112x112 in
+ ['ir_r2_k3_s2_e6_c24'],
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40_se0.25'],
+ # stage 3, 28x28 in
+ ['ir_r4_k3_s2_e6_c80'],
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112_se0.25'],
+ # stage 5, 14x14in
+ ['ir_r3_k5_s2_e6_c160_se0.25'],
+ # stage 6, 7x7 in
+ ['ir_r1_k3_s1_e6_c320'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a mnasnet-b1 model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+ Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_c16_noskip'],
+ # stage 1, 112x112 in
+ ['ir_r3_k3_s2_e3_c24'],
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40'],
+ # stage 3, 28x28 in
+ ['ir_r3_k5_s2_e6_c80'],
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c96'],
+ # stage 5, 14x14in
+ ['ir_r4_k5_s2_e6_c192'],
+ # stage 6, 7x7 in
+ ['ir_r1_k3_s1_e6_c320_noskip']
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a mnasnet-b1 model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
+ Paper: https://arxiv.org/pdf/1807.11626.pdf.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_c8'],
+ ['ir_r1_k3_s2_e3_c16'],
+ ['ir_r2_k3_s2_e6_c16'],
+ ['ir_r4_k5_s2_e6_c32_se0.25'],
+ ['ir_r3_k3_s1_e6_c32_se0.25'],
+ ['ir_r3_k5_s2_e6_c88_se0.25'],
+ ['ir_r1_k3_s1_e6_c144']
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=8,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mobilenet_v2(
+ variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs):
+ """ Generate MobileNet-V2 network
+ Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py
+ Paper: https://arxiv.org/abs/1801.04381
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_c16'],
+ ['ir_r2_k3_s2_e6_c24'],
+ ['ir_r3_k3_s2_e6_c32'],
+ ['ir_r4_k3_s2_e6_c64'],
+ ['ir_r3_k3_s1_e6_c96'],
+ ['ir_r3_k3_s2_e6_c160'],
+ ['ir_r1_k3_s1_e6_c320'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head),
+ num_features=1280 if fix_stem_head else round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ fix_stem=fix_stem_head,
+ channel_multiplier=channel_multiplier,
+ norm_kwargs=resolve_bn_args(kwargs),
+ act_layer=nn.ReLU6,
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """ FBNet-C
+
+ Paper: https://arxiv.org/abs/1812.03443
+ Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py
+
+ NOTE: the ref impl above does not cover the 'C' variant here; this variant was derived from
+ the paper, and the ref impl was only used to confirm some building-block details
+ """
+ arch_def = [
+ ['ir_r1_k3_s1_e1_c16'],
+ ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'],
+ ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'],
+ ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'],
+ ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'],
+ ['ir_r4_k5_s2_e6_c184'],
+ ['ir_r1_k3_s1_e6_c352'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=16,
+ num_features=1984, # paper suggests this, but is not 100% clear
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates the Single-Path NAS model from search targeted for Pixel1 phone.
+
+ Paper: https://arxiv.org/abs/1904.02877
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_c16_noskip'],
+ # stage 1, 112x112 in
+ ['ir_r3_k3_s2_e3_c24'],
+ # stage 2, 56x56 in
+ ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'],
+ # stage 3, 28x28 in
+ ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'],
+ # stage 4, 14x14in
+ ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'],
+ # stage 5, 14x14in
+ ['ir_r4_k5_s2_e6_c192'],
+ # stage 6, 7x7 in
+ ['ir_r1_k3_s1_e6_c320_noskip']
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates an EfficientNet model.
+
+ Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
+ Paper: https://arxiv.org/abs/1905.11946
+
+ EfficientNet params
+ name: (channel_multiplier, depth_multiplier, resolution, dropout_rate)
+ 'efficientnet-b0': (1.0, 1.0, 224, 0.2),
+ 'efficientnet-b1': (1.0, 1.1, 240, 0.2),
+ 'efficientnet-b2': (1.1, 1.2, 260, 0.3),
+ 'efficientnet-b3': (1.2, 1.4, 300, 0.3),
+ 'efficientnet-b4': (1.4, 1.8, 380, 0.4),
+ 'efficientnet-b5': (1.6, 2.2, 456, 0.4),
+ 'efficientnet-b6': (1.8, 2.6, 528, 0.5),
+ 'efficientnet-b7': (2.0, 3.1, 600, 0.5),
+ 'efficientnet-b8': (2.2, 3.6, 672, 0.5),
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer
+ depth_multiplier: multiplier to number of repeats per stage
+
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_e1_c16_se0.25'],
+ ['ir_r2_k3_s2_e6_c24_se0.25'],
+ ['ir_r2_k5_s2_e6_c40_se0.25'],
+ ['ir_r3_k3_s2_e6_c80_se0.25'],
+ ['ir_r3_k5_s1_e6_c112_se0.25'],
+ ['ir_r4_k5_s2_e6_c192_se0.25'],
+ ['ir_r1_k3_s1_e6_c320_se0.25'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier),
+ num_features=round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'swish'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
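+# Worked example of the scaling table above (sketch): 'efficientnet_b2' uses
+# channel_multiplier=1.1 and depth_multiplier=1.2, so num_features becomes
+# round_channels(1280, 1.1, 8, None) = 1408 and each stage's repeat count r is
+# scaled to ceil(r * 1.2), e.g. the r3 stages run 4 blocks each.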
+
+def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ arch_def = [
+ # NOTE `fc` is present to override a mismatch between stem channels and in chs not
+ # present in other models
+ ['er_r1_k3_s1_e4_c24_fc24_noskip'],
+ ['er_r2_k3_s2_e8_c32'],
+ ['er_r4_k3_s2_e8_c48'],
+ ['ir_r5_k5_s2_e8_c96'],
+ ['ir_r4_k5_s1_e8_c144'],
+ ['ir_r2_k5_s2_e8_c192'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier),
+ num_features=round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_efficientnet_condconv(
+ variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs):
+ """Creates an efficientnet-condconv model."""
+ arch_def = [
+ ['ds_r1_k3_s1_e1_c16_se0.25'],
+ ['ir_r2_k3_s2_e6_c24_se0.25'],
+ ['ir_r2_k5_s2_e6_c40_se0.25'],
+ ['ir_r3_k3_s2_e6_c80_se0.25'],
+ ['ir_r3_k5_s1_e6_c112_se0.25_cc4'],
+ ['ir_r4_k5_s2_e6_c192_se0.25_cc4'],
+ ['ir_r1_k3_s1_e6_c320_se0.25_cc4'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier),
+ num_features=round_channels(1280, channel_multiplier, 8, None),
+ stem_size=32,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'swish'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
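+# Note on the CondConv arch_def above (sketch): the '_cc4' suffix requests 4 experts per
+# block and `experts_multiplier` scales that count, so the *_cc_b0_8e / *_cc_b1_8e
+# entrypoints below reach 4 * 2 = 8 experts from the same definition.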
+
+def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates an EfficientNet-Lite model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite
+ Paper: https://arxiv.org/abs/1905.11946
+
+ EfficientNet params
+ name: (channel_multiplier, depth_multiplier, resolution, dropout_rate)
+ 'efficientnet-lite0': (1.0, 1.0, 224, 0.2),
+ 'efficientnet-lite1': (1.0, 1.1, 240, 0.2),
+ 'efficientnet-lite2': (1.1, 1.2, 260, 0.3),
+ 'efficientnet-lite3': (1.2, 1.4, 280, 0.3),
+ 'efficientnet-lite4': (1.4, 1.8, 300, 0.3),
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer
+ depth_multiplier: multiplier to number of repeats per stage
+ """
+ arch_def = [
+ ['ds_r1_k3_s1_e1_c16'],
+ ['ir_r2_k3_s2_e6_c24'],
+ ['ir_r2_k5_s2_e6_c40'],
+ ['ir_r3_k3_s2_e6_c80'],
+ ['ir_r3_k5_s1_e6_c112'],
+ ['ir_r4_k5_s2_e6_c192'],
+ ['ir_r1_k3_s1_e6_c320'],
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True),
+ num_features=1280,
+ stem_size=32,
+ fix_stem=True,
+ channel_multiplier=channel_multiplier,
+ act_layer=nn.ReLU6,
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MixNet Small model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
+ Paper: https://arxiv.org/abs/1907.09595
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'], # relu
+ # stage 2, 56x56 in
+ ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish
+ # stage 3, 28x28 in
+ ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'], # swish
+ # stage 4, 14x14in
+ ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish
+ # stage 5, 14x14in
+ ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish
+ # 7x7
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ num_features=1536,
+ stem_size=16,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MixNet Medium-Large model.
+
+ Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
+ Paper: https://arxiv.org/abs/1907.09595
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c24'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'], # relu
+ # stage 2, 56x56 in
+ ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish
+ # stage 3, 28x28 in
+ ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'], # swish
+ # stage 4, 14x14in
+ ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish
+ # stage 5, 14x14in
+ ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish
+ # 7x7
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'),
+ num_features=1536,
+ stem_size=24,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'relu'),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def mnasnet_050(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 0.5. """
+ model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_075(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 0.75. """
+ model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_100(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 1.0. """
+ model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_b1(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 1.0. """
+ return mnasnet_100(pretrained, **kwargs)
+
+
+def mnasnet_140(pretrained=False, **kwargs):
+ """ MNASNet B1, depth multiplier of 1.4 """
+ model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def semnasnet_050(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 0.5 """
+ model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs)
+ return model
+
+
+def semnasnet_075(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 0.75. """
+ model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def semnasnet_100(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """
+ model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_a1(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """
+ return semnasnet_100(pretrained, **kwargs)
+
+
+def semnasnet_140(pretrained=False, **kwargs):
+ """ MNASNet A1 (w/ SE), depth multiplier of 1.4. """
+ model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mnasnet_small(pretrained=False, **kwargs):
+ """ MNASNet Small, depth multiplier of 1.0. """
+ model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_100(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.0 channel multiplier """
+ model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_140(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.4 channel multiplier """
+ model = _gen_mobilenet_v2('mobilenetv2_140', 1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_110d(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.1 channel, 1.2 depth multipliers"""
+ model = _gen_mobilenet_v2(
+ 'mobilenetv2_110d', 1.1, depth_multiplier=1.2, fix_stem_head=True, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv2_120d(pretrained=False, **kwargs):
+ """ MobileNet V2 w/ 1.2 channel, 1.4 depth multipliers """
+ model = _gen_mobilenet_v2(
+ 'mobilenetv2_120d', 1.2, depth_multiplier=1.4, fix_stem_head=True, pretrained=pretrained, **kwargs)
+ return model
+
+
+def fbnetc_100(pretrained=False, **kwargs):
+ """ FBNet-C """
+ if pretrained:
+ # pretrained model trained with non-default BN epsilon
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def spnasnet_100(pretrained=False, **kwargs):
+ """ Single-Path NAS Pixel1"""
+ model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b0(pretrained=False, **kwargs):
+ """ EfficientNet-B0 """
+ # NOTE for train set drop_rate=0.2, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b1(pretrained=False, **kwargs):
+ """ EfficientNet-B1 """
+ # NOTE for train set drop_rate=0.2, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b2(pretrained=False, **kwargs):
+ """ EfficientNet-B2 """
+ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b3(pretrained=False, **kwargs):
+ """ EfficientNet-B3 """
+ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b4(pretrained=False, **kwargs):
+ """ EfficientNet-B4 """
+ # NOTE for train set drop_rate=0.4, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b5(pretrained=False, **kwargs):
+ """ EfficientNet-B5 """
+ # NOTE for train set drop_rate=0.4, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b6(pretrained=False, **kwargs):
+ """ EfficientNet-B6 """
+ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b7(pretrained=False, **kwargs):
+ """ EfficientNet-B7 """
+ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_b8(pretrained=False, **kwargs):
+ """ EfficientNet-B8 """
+ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2
+ model = _gen_efficientnet(
+ 'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_l2(pretrained=False, **kwargs):
+ """ EfficientNet-L2. """
+ # NOTE for train, drop_rate should be 0.5
+ model = _gen_efficientnet(
+ 'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_es(pretrained=False, **kwargs):
+ """ EfficientNet-Edge Small. """
+ model = _gen_efficientnet_edge(
+ 'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_em(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Medium. """
+ model = _gen_efficientnet_edge(
+ 'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_el(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Large. """
+ model = _gen_efficientnet_edge(
+ 'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B0 w/ 8 Experts """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_efficientnet_condconv(
+ 'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B0 w/ 8 Experts """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_efficientnet_condconv(
+ 'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B1 w/ 8 Experts """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_efficientnet_condconv(
+ 'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite0(pretrained=False, **kwargs):
+ """ EfficientNet-Lite0 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite1(pretrained=False, **kwargs):
+ """ EfficientNet-Lite1 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite2(pretrained=False, **kwargs):
+ """ EfficientNet-Lite2 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite3(pretrained=False, **kwargs):
+ """ EfficientNet-Lite3 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def efficientnet_lite4(pretrained=False, **kwargs):
+ """ EfficientNet-Lite4 """
+ model = _gen_efficientnet_lite(
+ 'efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b0(pretrained=False, **kwargs):
+ """ EfficientNet-B0 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b1(pretrained=False, **kwargs):
+ """ EfficientNet-B1 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b2(pretrained=False, **kwargs):
+ """ EfficientNet-B2 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b3(pretrained=False, **kwargs):
+ """ EfficientNet-B3 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b4(pretrained=False, **kwargs):
+ """ EfficientNet-B4 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b5(pretrained=False, **kwargs):
+ """ EfficientNet-B5 RandAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b6(pretrained=False, **kwargs):
+ """ EfficientNet-B6 AutoAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b7(pretrained=False, **kwargs):
+ """ EfficientNet-B7 RandAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b8(pretrained=False, **kwargs):
+ """ EfficientNet-B8 RandAug. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b0_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B0 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b1_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B1 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b2_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B2 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b3_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B3 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b4_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B4 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b5_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B5 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b6_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B6 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b7_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B7 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b8_ap(pretrained=False, **kwargs):
+ """ EfficientNet-B8 AdvProp. Tensorflow compatible variant
+ Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b0_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B0 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b0_ns', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b1_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B1 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b1_ns', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b2_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B2 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b2_ns', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b3_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B3 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b3_ns', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b4_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B4 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b4_ns', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b5_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B5 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b5_ns', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b6_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B6 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b6_ns', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_b7_ns(pretrained=False, **kwargs):
+ """ EfficientNet-B7 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_b7_ns', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_l2_ns_475(pretrained=False, **kwargs):
+ """ EfficientNet-L2 NoisyStudent @ 475x475. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_l2_ns_475', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_l2_ns(pretrained=False, **kwargs):
+ """ EfficientNet-L2 NoisyStudent. Tensorflow compatible variant
+ Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252)
+ """
+ # NOTE for train, drop_rate should be 0.5
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet(
+ 'tf_efficientnet_l2_ns', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_es(pretrained=False, **kwargs):
+ """ EfficientNet-Edge Small. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_edge(
+ 'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_em(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Medium. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_edge(
+ 'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_el(pretrained=False, **kwargs):
+ """ EfficientNet-Edge-Large. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_edge(
+ 'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B0 w/ 4 Experts """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_condconv(
+ 'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B0 w/ 8 Experts """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_condconv(
+ 'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+ """ EfficientNet-CondConv-B1 w/ 8 Experts """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_condconv(
+ 'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+ pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite0(pretrained=False, **kwargs):
+ """ EfficientNet-Lite0. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite1(pretrained=False, **kwargs):
+ """ EfficientNet-Lite1. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite2(pretrained=False, **kwargs):
+ """ EfficientNet-Lite2. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite3(pretrained=False, **kwargs):
+ """ EfficientNet-Lite3. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_efficientnet_lite4(pretrained=False, **kwargs):
+ """ EfficientNet-Lite4. Tensorflow compatible variant """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_efficientnet_lite(
+ 'tf_efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_s(pretrained=False, **kwargs):
+ """Creates a MixNet Small model.
+ """
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mixnet_s(
+ 'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_m(pretrained=False, **kwargs):
+ """Creates a MixNet Medium model.
+ """
+ # NOTE for train set drop_rate=0.25
+ model = _gen_mixnet_m(
+ 'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_l(pretrained=False, **kwargs):
+ """Creates a MixNet Large model.
+ """
+ # NOTE for train set drop_rate=0.25
+ model = _gen_mixnet_m(
+ 'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_xl(pretrained=False, **kwargs):
+ """Creates a MixNet Extra-Large model.
+ Not a paper spec, experimental def by RW w/ depth scaling.
+ """
+ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2
+ model = _gen_mixnet_m(
+ 'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mixnet_xxl(pretrained=False, **kwargs):
+ """Creates a MixNet Double Extra Large model.
+ Not a paper spec, experimental def by RW w/ depth scaling.
+ """
+ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2
+ model = _gen_mixnet_m(
+ 'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mixnet_s(pretrained=False, **kwargs):
+ """Creates a MixNet Small model. Tensorflow compatible variant
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mixnet_s(
+ 'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mixnet_m(pretrained=False, **kwargs):
+ """Creates a MixNet Medium model. Tensorflow compatible variant
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mixnet_m(
+ 'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mixnet_l(pretrained=False, **kwargs):
+ """Creates a MixNet Large model. Tensorflow compatible variant
+ """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mixnet_m(
+ 'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
+ return model
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/helpers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9981660517a8728bc1f3f931da74ef1f1edae750
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/helpers.py
@@ -0,0 +1,71 @@
+""" Checkpoint loading / state_dict helpers
+Copyright 2020 Ross Wightman
+"""
+import torch
+import os
+from collections import OrderedDict
+try:
+ from torch.hub import load_state_dict_from_url
+except ImportError:
+ from torch.utils.model_zoo import load_url as load_state_dict_from_url
+
+
+def load_checkpoint(model, checkpoint_path):
+ if checkpoint_path and os.path.isfile(checkpoint_path):
+ print("=> Loading checkpoint '{}'".format(checkpoint_path))
+ checkpoint = torch.load(checkpoint_path)
+ if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
+ new_state_dict = OrderedDict()
+ for k, v in checkpoint['state_dict'].items():
+ if k.startswith('module'):
+ name = k[7:] # remove `module.`
+ else:
+ name = k
+ new_state_dict[name] = v
+ model.load_state_dict(new_state_dict)
+ else:
+ model.load_state_dict(checkpoint)
+ print("=> Loaded checkpoint '{}'".format(checkpoint_path))
+ else:
+ print("=> Error: No checkpoint found at '{}'".format(checkpoint_path))
+ raise FileNotFoundError()
+
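+# e.g. a DataParallel checkpoint key such as 'module.conv_stem.weight' is re-keyed above to
+# 'conv_stem.weight' so it matches the bare (unwrapped) model's state_dict.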
+
+def load_pretrained(model, url, filter_fn=None, strict=True):
+ if not url:
+ print("=> Warning: Pretrained model URL is empty, using random initialization.")
+ return
+
+ state_dict = load_state_dict_from_url(url, progress=False, map_location='cpu')
+
+ input_conv = 'conv_stem'
+ classifier = 'classifier'
+ in_chans = getattr(model, input_conv).weight.shape[1]
+ num_classes = getattr(model, classifier).weight.shape[0]
+
+ input_conv_weight = input_conv + '.weight'
+ pretrained_in_chans = state_dict[input_conv_weight].shape[1]
+ if in_chans != pretrained_in_chans:
+ if in_chans == 1:
+ print('=> Converting pretrained input conv {} from {} to 1 channel'.format(
+ input_conv_weight, pretrained_in_chans))
+ conv1_weight = state_dict[input_conv_weight]
+ state_dict[input_conv_weight] = conv1_weight.sum(dim=1, keepdim=True)
+ else:
+ print('=> Discarding pretrained input conv {} since input channel count != {}'.format(
+ input_conv_weight, pretrained_in_chans))
+ del state_dict[input_conv_weight]
+ strict = False
+
+ classifier_weight = classifier + '.weight'
+ pretrained_num_classes = state_dict[classifier_weight].shape[0]
+ if num_classes != pretrained_num_classes:
+ print('=> Discarding pretrained classifier since num_classes != {}'.format(pretrained_num_classes))
+ del state_dict[classifier_weight]
+ del state_dict[classifier + '.bias']
+ strict = False
+
+ if filter_fn is not None:
+ state_dict = filter_fn(state_dict)
+
+ model.load_state_dict(state_dict, strict=strict)
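+
+# Example of the in_chans handling above (sketch): for a single-channel model, a pretrained
+# RGB stem weight of shape (out_c, 3, k, k) is collapsed to (out_c, 1, k, k) via
+# conv1_weight.sum(dim=1, keepdim=True), so the pretrained filters are still reused; for any
+# other channel count the stem weight is dropped and the remaining weights load non-strictly.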
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/mobilenetv3.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/mobilenetv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..4027822356ee96d9f27d7fc9156f13b5374a3a88
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/mobilenetv3.py
@@ -0,0 +1,364 @@
+""" MobileNet-V3
+
+A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl.
+
+Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .activations import get_act_fn, get_act_layer, HardSwish
+from .config import layer_config_kwargs
+from .conv2d_layers import select_conv2d
+from .helpers import load_pretrained
+from .efficientnet_builder import *
+
+__all__ = ['mobilenetv3_rw', 'mobilenetv3_large_075', 'mobilenetv3_large_100', 'mobilenetv3_large_minimal_100',
+ 'mobilenetv3_small_075', 'mobilenetv3_small_100', 'mobilenetv3_small_minimal_100',
+ 'tf_mobilenetv3_large_075', 'tf_mobilenetv3_large_100', 'tf_mobilenetv3_large_minimal_100',
+ 'tf_mobilenetv3_small_075', 'tf_mobilenetv3_small_100', 'tf_mobilenetv3_small_minimal_100']
+
+model_urls = {
+ 'mobilenetv3_rw':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth',
+ 'mobilenetv3_large_075': None,
+ 'mobilenetv3_large_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_large_100_ra-f55367f5.pth',
+ 'mobilenetv3_large_minimal_100': None,
+ 'mobilenetv3_small_075': None,
+ 'mobilenetv3_small_100': None,
+ 'mobilenetv3_small_minimal_100': None,
+ 'tf_mobilenetv3_large_075':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth',
+ 'tf_mobilenetv3_large_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth',
+ 'tf_mobilenetv3_large_minimal_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth',
+ 'tf_mobilenetv3_small_075':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth',
+ 'tf_mobilenetv3_small_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth',
+ 'tf_mobilenetv3_small_minimal_100':
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth',
+}
+
+
+class MobileNetV3(nn.Module):
+ """ MobileNet-V3
+
+ This model utilizes the MobileNet-V3 specific 'efficient head', where global pooling is done before the
+ head convolution without a final batch-norm layer before the classifier.
+
+ Paper: https://arxiv.org/abs/1905.02244
+ """
+
+ def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True,
+ channel_multiplier=1.0, pad_type='', act_layer=HardSwish, drop_rate=0., drop_connect_rate=0.,
+ se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'):
+ super(MobileNetV3, self).__init__()
+ self.drop_rate = drop_rate
+
+ stem_size = round_channels(stem_size, channel_multiplier)
+ self.conv_stem = select_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
+ self.bn1 = nn.BatchNorm2d(stem_size, **norm_kwargs)
+ self.act1 = act_layer(inplace=True)
+ in_chs = stem_size
+
+ builder = EfficientNetBuilder(
+ channel_multiplier, pad_type=pad_type, act_layer=act_layer, se_kwargs=se_kwargs,
+ norm_layer=norm_layer, norm_kwargs=norm_kwargs, drop_connect_rate=drop_connect_rate)
+ self.blocks = nn.Sequential(*builder(in_chs, block_args))
+ in_chs = builder.in_chs
+
+ self.global_pool = nn.AdaptiveAvgPool2d(1)
+ self.conv_head = select_conv2d(in_chs, num_features, 1, padding=pad_type, bias=head_bias)
+ self.act2 = act_layer(inplace=True)
+ self.classifier = nn.Linear(num_features, num_classes)
+
+ for m in self.modules():
+ if weight_init == 'goog':
+ initialize_weight_goog(m)
+ else:
+ initialize_weight_default(m)
+
+ def as_sequential(self):
+ layers = [self.conv_stem, self.bn1, self.act1]
+ layers.extend(self.blocks)
+ layers.extend([
+ self.global_pool, self.conv_head, self.act2,
+ nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
+ return nn.Sequential(*layers)
+
+ def features(self, x):
+ x = self.conv_stem(x)
+ x = self.bn1(x)
+ x = self.act1(x)
+ x = self.blocks(x)
+ x = self.global_pool(x)
+ x = self.conv_head(x)
+ x = self.act2(x)
+ return x
+
+ def forward(self, x):
+ x = self.features(x)
+ x = x.flatten(1)
+ if self.drop_rate > 0.:
+ x = F.dropout(x, p=self.drop_rate, training=self.training)
+ return self.classifier(x)
+
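+# Shape sketch of the 'efficient head' in features() above (numbers for the large models at
+# 224x224 input, illustrative only):
+#   x = self.blocks(x)        # (N, 960, 7, 7)
+#   x = self.global_pool(x)   # (N, 960, 1, 1)  -- pooled *before* the head conv
+#   x = self.conv_head(x)     # (N, 1280, 1, 1), no batch-norm after this conv
+#   x = self.act2(x)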
+
+def _create_model(model_kwargs, variant, pretrained=False):
+ as_sequential = model_kwargs.pop('as_sequential', False)
+ model = MobileNetV3(**model_kwargs)
+ if pretrained and model_urls[variant]:
+ load_pretrained(model, model_urls[variant])
+ if as_sequential:
+ model = model.as_sequential()
+ return model
+
+
+def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MobileNet-V3 model (RW variant).
+
+ Paper: https://arxiv.org/abs/1905.02244
+
+ This was my first attempt at reproducing the MobileNet-V3 from paper alone. It came close to the
+ eventual Tensorflow reference impl but has a few differences:
+ 1. This model has no bias on the head convolution
+ 2. This model forces no residual (noskip) on the first DWS block, this is different than MnasNet
+ 3. This model always uses ReLU for the SE activation layer, other models in the family inherit their act layer
+ from their parent block
+ 4. This model does not enforce divisible by 8 limitation on the SE reduction channel count
+
+ Overall the changes are fairly minor and result in a very small parameter count difference and no
+ meaningful top-1/top-5 accuracy difference.
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16_nre_noskip'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu
+ # stage 3, 28x28 in
+ ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish
+ # stage 5, 14x14in
+ ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c960'], # hard-swish
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ head_bias=False, # one of my mistakes
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, 'hard_swish'),
+ se_kwargs=dict(gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+ """Creates a MobileNet-V3 large/small/minimal models.
+
+ Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v3.py
+ Paper: https://arxiv.org/abs/1905.02244
+
+ Args:
+ channel_multiplier: multiplier to number of channels per layer.
+ """
+ if 'small' in variant:
+ num_features = 1024
+ if 'minimal' in variant:
+ act_layer = 'relu'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s2_e1_c16'],
+ # stage 1, 56x56 in
+ ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'],
+ # stage 2, 28x28 in
+ ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'],
+ # stage 3, 14x14 in
+ ['ir_r2_k3_s1_e3_c48'],
+ # stage 4, 14x14in
+ ['ir_r3_k3_s2_e6_c96'],
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c576'],
+ ]
+ else:
+ act_layer = 'hard_swish'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s2_e1_c16_se0.25_nre'], # relu
+ # stage 1, 56x56 in
+ ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'], # relu
+ # stage 2, 28x28 in
+ ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'], # hard-swish
+ # stage 3, 14x14 in
+ ['ir_r2_k5_s1_e3_c48_se0.25'], # hard-swish
+ # stage 4, 14x14in
+ ['ir_r3_k5_s2_e6_c96_se0.25'], # hard-swish
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c576'], # hard-swish
+ ]
+ else:
+ num_features = 1280
+ if 'minimal' in variant:
+ act_layer = 'relu'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16'],
+ # stage 1, 112x112 in
+ ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'],
+ # stage 2, 56x56 in
+ ['ir_r3_k3_s2_e3_c40'],
+ # stage 3, 28x28 in
+ ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112'],
+ # stage 5, 14x14in
+ ['ir_r3_k3_s2_e6_c160'],
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c960'],
+ ]
+ else:
+ act_layer = 'hard_swish'
+ arch_def = [
+ # stage 0, 112x112 in
+ ['ds_r1_k3_s1_e1_c16_nre'], # relu
+ # stage 1, 112x112 in
+ ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu
+ # stage 2, 56x56 in
+ ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu
+ # stage 3, 28x28 in
+ ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish
+ # stage 4, 14x14in
+ ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish
+ # stage 5, 14x14in
+ ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish
+ # stage 6, 7x7 in
+ ['cn_r1_k1_s1_c960'], # hard-swish
+ ]
+ with layer_config_kwargs(kwargs):
+ model_kwargs = dict(
+ block_args=decode_arch_def(arch_def),
+ num_features=num_features,
+ stem_size=16,
+ channel_multiplier=channel_multiplier,
+ act_layer=resolve_act_layer(kwargs, act_layer),
+ se_kwargs=dict(
+ act_layer=get_act_layer('relu'), gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True, divisor=8),
+ norm_kwargs=resolve_bn_args(kwargs),
+ **kwargs,
+ )
+ model = _create_model(model_kwargs, variant, pretrained)
+ return model
+
+
+def mobilenetv3_rw(pretrained=False, **kwargs):
+ """ MobileNet-V3 RW
+ Attn: See note in gen function for this variant.
+ """
+ # NOTE for train set drop_rate=0.2
+ if pretrained:
+ # pretrained model trained with non-default BN epsilon
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_large_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 0.75"""
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_large_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 1.0 """
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_large_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large (Minimalistic) 1.0 """
+ # NOTE for train set drop_rate=0.2
+ model = _gen_mobilenet_v3('mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_small_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 0.75 """
+ model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_small_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 1.0 """
+ model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def mobilenetv3_small_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small (Minimalistic) 1.0 """
+ model = _gen_mobilenet_v3('mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_large_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 0.75. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_large_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large 1.0. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Large Minimalistic 1.0. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_small_075(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 0.75. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_small_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small 1.0. Tensorflow compat variant."""
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
+
+
+def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs):
+ """ MobileNet V3 Small Minimalistic 1.0. Tensorflow compat variant. """
+ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+ kwargs['pad_type'] = 'same'
+ model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+ return model
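+
+
+# Usage sketch (illustrative; weights must be available/downloadable when pretrained=True):
+#   model = mobilenetv3_large_100(pretrained=True)
+#   model.eval()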
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/model_factory.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/model_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd2208dc97e9d705fda7cc497b21d630ca798ecb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/model_factory.py
@@ -0,0 +1,27 @@
+from .config import set_layer_config
+from .helpers import load_checkpoint
+
+from .gen_efficientnet import *
+from .mobilenetv3 import *
+
+
+def create_model(
+ model_name='mnasnet_100',
+ pretrained=None,
+ num_classes=1000,
+ in_chans=3,
+ checkpoint_path='',
+ **kwargs):
+
+ model_kwargs = dict(num_classes=num_classes, in_chans=in_chans, pretrained=pretrained, **kwargs)
+
+ if model_name in globals():
+ create_fn = globals()[model_name]
+ model = create_fn(**model_kwargs)
+ else:
+ raise RuntimeError('Unknown model (%s)' % model_name)
+
+ if checkpoint_path and not pretrained:
+ load_checkpoint(model, checkpoint_path)
+
+ return model
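+
+
+# Usage sketch (illustrative; assumes pretrained weights exist for the requested variant):
+#   model = create_model('mobilenetv3_large_100', pretrained=True, num_classes=1000)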
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/version.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..20fc277bae821bed80a29af8538a2a9273b20c41
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/geffnet/version.py
@@ -0,0 +1 @@
+__version__ = '1.0.2'
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/hubconf.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/hubconf.py
new file mode 100644
index 0000000000000000000000000000000000000000..89feac0f9699a8a35cf69b2fcd3628d9d110239b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/hubconf.py
@@ -0,0 +1,84 @@
+dependencies = ['torch', 'math']
+
+from geffnet import efficientnet_b0
+from geffnet import efficientnet_b1
+from geffnet import efficientnet_b2
+from geffnet import efficientnet_b3
+
+from geffnet import efficientnet_es
+
+from geffnet import efficientnet_lite0
+
+from geffnet import mixnet_s
+from geffnet import mixnet_m
+from geffnet import mixnet_l
+from geffnet import mixnet_xl
+
+from geffnet import mobilenetv2_100
+from geffnet import mobilenetv2_110d
+from geffnet import mobilenetv2_120d
+from geffnet import mobilenetv2_140
+
+from geffnet import mobilenetv3_large_100
+from geffnet import mobilenetv3_rw
+from geffnet import mnasnet_a1
+from geffnet import mnasnet_b1
+from geffnet import fbnetc_100
+from geffnet import spnasnet_100
+
+from geffnet import tf_efficientnet_b0
+from geffnet import tf_efficientnet_b1
+from geffnet import tf_efficientnet_b2
+from geffnet import tf_efficientnet_b3
+from geffnet import tf_efficientnet_b4
+from geffnet import tf_efficientnet_b5
+from geffnet import tf_efficientnet_b6
+from geffnet import tf_efficientnet_b7
+from geffnet import tf_efficientnet_b8
+
+from geffnet import tf_efficientnet_b0_ap
+from geffnet import tf_efficientnet_b1_ap
+from geffnet import tf_efficientnet_b2_ap
+from geffnet import tf_efficientnet_b3_ap
+from geffnet import tf_efficientnet_b4_ap
+from geffnet import tf_efficientnet_b5_ap
+from geffnet import tf_efficientnet_b6_ap
+from geffnet import tf_efficientnet_b7_ap
+from geffnet import tf_efficientnet_b8_ap
+
+from geffnet import tf_efficientnet_b0_ns
+from geffnet import tf_efficientnet_b1_ns
+from geffnet import tf_efficientnet_b2_ns
+from geffnet import tf_efficientnet_b3_ns
+from geffnet import tf_efficientnet_b4_ns
+from geffnet import tf_efficientnet_b5_ns
+from geffnet import tf_efficientnet_b6_ns
+from geffnet import tf_efficientnet_b7_ns
+from geffnet import tf_efficientnet_l2_ns_475
+from geffnet import tf_efficientnet_l2_ns
+
+from geffnet import tf_efficientnet_es
+from geffnet import tf_efficientnet_em
+from geffnet import tf_efficientnet_el
+
+from geffnet import tf_efficientnet_cc_b0_4e
+from geffnet import tf_efficientnet_cc_b0_8e
+from geffnet import tf_efficientnet_cc_b1_8e
+
+from geffnet import tf_efficientnet_lite0
+from geffnet import tf_efficientnet_lite1
+from geffnet import tf_efficientnet_lite2
+from geffnet import tf_efficientnet_lite3
+from geffnet import tf_efficientnet_lite4
+
+from geffnet import tf_mixnet_s
+from geffnet import tf_mixnet_m
+from geffnet import tf_mixnet_l
+
+from geffnet import tf_mobilenetv3_large_075
+from geffnet import tf_mobilenetv3_large_100
+from geffnet import tf_mobilenetv3_large_minimal_100
+from geffnet import tf_mobilenetv3_small_075
+from geffnet import tf_mobilenetv3_small_100
+from geffnet import tf_mobilenetv3_small_minimal_100
+
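+# Usage sketch via torch.hub (repo reference taken from setup.py and is illustrative;
+# any entrypoint imported above can be named):
+#   import torch
+#   model = torch.hub.load('rwightman/gen-efficientnet-pytorch', 'mobilenetv3_large_100', pretrained=True)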
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_export.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..af3ed8993bfa41190b5066a044efbe53b45f4c04
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_export.py
@@ -0,0 +1,120 @@
+""" ONNX export script
+
+Export PyTorch models as ONNX graphs.
+
+This export script originally started as an adaptation of code snippets found at
+https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html
+
+The default parameters work with PyTorch 1.6 and ONNX 1.7 and produce an optimal ONNX graph
+for hosting in the ONNX runtime (see onnx_validate.py). To export an ONNX model compatible
+with caffe2 (see caffe2_benchmark.py and caffe2_validate.py), the --keep-init and --aten-fallback
+flags are currently required.
+
+Older versions of PyTorch/ONNX (tested PyTorch 1.4, ONNX 1.5) do not need extra flags for
+caffe2 compatibility, but they produce a model that isn't as fast running on ONNX runtime.
+
+Most new releases of PyTorch and ONNX cause some sort of breakage in the export / usage of ONNX models.
+Please do your research and search the ONNX and PyTorch issue trackers before asking me. Thanks.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import torch
+import numpy as np
+
+import onnx
+import geffnet
+
+parser = argparse.ArgumentParser(description='PyTorch to ONNX export')
+parser.add_argument('output', metavar='ONNX_FILE',
+ help='output model filename')
+parser.add_argument('--model', '-m', metavar='MODEL', default='mobilenetv3_large_100',
+ help='model architecture (default: mobilenetv3_large_100)')
+parser.add_argument('--opset', type=int, default=10,
+ help='ONNX opset to use (default: 10)')
+parser.add_argument('--keep-init', action='store_true', default=False,
+ help='Keep initializers as input. Needed for Caffe2 compatible export in newer PyTorch/ONNX.')
+parser.add_argument('--aten-fallback', action='store_true', default=False,
+ help='Fallback to ATEN ops. Helps fix AdaptiveAvgPool issue with Caffe2 in newer PyTorch/ONNX.')
+parser.add_argument('--dynamic-size', action='store_true', default=False,
+                    help='Export model with dynamic width/height. Not recommended for "tf" models with SAME padding.')
+parser.add_argument('-b', '--batch-size', default=1, type=int,
+ metavar='N', help='mini-batch size (default: 1)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--num-classes', type=int, default=1000,
+ help='Number classes in dataset')
+parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
+ help='path to checkpoint (default: none)')
+
+
+def main():
+ args = parser.parse_args()
+
+ args.pretrained = True
+ if args.checkpoint:
+ args.pretrained = False
+
+ print("==> Creating PyTorch {} model".format(args.model))
+ # NOTE exportable=True flag disables autofn/jit scripted activations and uses Conv2dSameExport layers
+ # for models using SAME padding
+ model = geffnet.create_model(
+ args.model,
+ num_classes=args.num_classes,
+ in_chans=3,
+ pretrained=args.pretrained,
+ checkpoint_path=args.checkpoint,
+ exportable=True)
+
+ model.eval()
+
+ example_input = torch.randn((args.batch_size, 3, args.img_size or 224, args.img_size or 224), requires_grad=True)
+
+ # Run model once before export trace, sets padding for models with Conv2dSameExport. This means
+ # that the padding for models with Conv2dSameExport (most models with tf_ prefix) is fixed for
+ # the input img_size specified in this script.
+ # Opset >= 11 should allow for dynamic padding, however I cannot get it to work due to
+ # issues in the tracing of the dynamic padding or errors attempting to export the model after jit
+    # scripting it (an approach that should work). Perhaps in a future PyTorch or ONNX version...
+ model(example_input)
+
+ print("==> Exporting model to ONNX format at '{}'".format(args.output))
+ input_names = ["input0"]
+ output_names = ["output0"]
+ dynamic_axes = {'input0': {0: 'batch'}, 'output0': {0: 'batch'}}
+ if args.dynamic_size:
+ dynamic_axes['input0'][2] = 'height'
+ dynamic_axes['input0'][3] = 'width'
+ if args.aten_fallback:
+ export_type = torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK
+ else:
+ export_type = torch.onnx.OperatorExportTypes.ONNX
+
+ torch_out = torch.onnx._export(
+ model, example_input, args.output, export_params=True, verbose=True, input_names=input_names,
+ output_names=output_names, keep_initializers_as_inputs=args.keep_init, dynamic_axes=dynamic_axes,
+ opset_version=args.opset, operator_export_type=export_type)
+
+ print("==> Loading and checking exported model from '{}'".format(args.output))
+ onnx_model = onnx.load(args.output)
+ onnx.checker.check_model(onnx_model) # assuming throw on error
+ print("==> Passed")
+
+ if args.keep_init and args.aten_fallback:
+ import caffe2.python.onnx.backend as onnx_caffe2
+ # Caffe2 loading only works properly in newer PyTorch/ONNX combos when
+ # keep_initializers_as_inputs and aten_fallback are set to True.
+        print("==> Loading model into Caffe2 backend and comparing forward pass.")
+        caffe2_backend = onnx_caffe2.prepare(onnx_model)
+        B = {onnx_model.graph.input[0].name: example_input.data.numpy()}
+ c2_out = caffe2_backend.run(B)[0]
+ np.testing.assert_almost_equal(torch_out.data.numpy(), c2_out, decimal=5)
+ print("==> Passed")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_optimize.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_optimize.py
new file mode 100644
index 0000000000000000000000000000000000000000..85abc534efd08d5fb51881954ef43b8480561824
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_optimize.py
@@ -0,0 +1,84 @@
+""" ONNX optimization script
+
+Run ONNX models through the optimizer to prune unneeded nodes, fuse batchnorm layers into conv, etc.
+
+NOTE: This isn't working consistently in recent PyTorch/ONNX combos (i.e. PyTorch 1.6 and ONNX 1.7);
+it seems time to switch to the onnxruntime online optimizer (whose output can also be saved for offline use).
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import warnings
+
+import onnx
+from onnx import optimizer
+
+
+parser = argparse.ArgumentParser(description="Optimize ONNX model")
+
+parser.add_argument("model", help="The ONNX model")
+parser.add_argument("--output", required=True, help="The optimized model output filename")
+
+
+def traverse_graph(graph, prefix=''):
+ content = []
+ indent = prefix + ' '
+ graphs = []
+ num_nodes = 0
+ for node in graph.node:
+ pn, gs = onnx.helper.printable_node(node, indent, subgraphs=True)
+ assert isinstance(gs, list)
+ content.append(pn)
+ graphs.extend(gs)
+ num_nodes += 1
+ for g in graphs:
+ g_count, g_str = traverse_graph(g)
+ content.append('\n' + g_str)
+ num_nodes += g_count
+ return num_nodes, '\n'.join(content)
+
+
+def main():
+ args = parser.parse_args()
+ onnx_model = onnx.load(args.model)
+ num_original_nodes, original_graph_str = traverse_graph(onnx_model.graph)
+
+ # Optimizer passes to perform
+ passes = [
+ #'eliminate_deadend',
+ 'eliminate_identity',
+ 'eliminate_nop_dropout',
+ 'eliminate_nop_pad',
+ 'eliminate_nop_transpose',
+ 'eliminate_unused_initializer',
+ 'extract_constant_to_initializer',
+ 'fuse_add_bias_into_conv',
+ 'fuse_bn_into_conv',
+ 'fuse_consecutive_concats',
+ 'fuse_consecutive_reduce_unsqueeze',
+ 'fuse_consecutive_squeezes',
+ 'fuse_consecutive_transposes',
+ #'fuse_matmul_add_bias_into_gemm',
+ 'fuse_pad_into_conv',
+ #'fuse_transpose_into_gemm',
+ #'lift_lexical_references',
+ ]
+
+ # Apply the optimization on the original serialized model
+ # WARNING I've had issues with optimizer in recent versions of PyTorch / ONNX causing
+ # 'duplicate definition of name' errors, see: https://github.com/onnx/onnx/issues/2401
+ # It may be better to rely on onnxruntime optimizations, see onnx_validate.py script.
+    warnings.warn("I've had issues with optimizer in recent versions of PyTorch / ONNX. "
+                  "Try onnxruntime optimization if this doesn't work.")
+ optimized_model = optimizer.optimize(onnx_model, passes)
+
+    num_optimized_nodes, optimized_graph_str = traverse_graph(optimized_model.graph)
+    print('==> The model after optimization:\n{}\n'.format(optimized_graph_str))
+ print('==> The optimized model has {} nodes, the original had {}.'.format(num_optimized_nodes, num_original_nodes))
+
+ # Save the ONNX model
+ onnx.save(optimized_model, args.output)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_to_caffe.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_to_caffe.py
new file mode 100644
index 0000000000000000000000000000000000000000..72fe0b0d7624ef871be586024429d058107a6f1d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_to_caffe.py
@@ -0,0 +1,27 @@
+import argparse
+
+import onnx
+from caffe2.python.onnx.backend import Caffe2Backend
+
+
+parser = argparse.ArgumentParser(description="Convert ONNX to Caffe2")
+
+parser.add_argument("model", help="The ONNX model")
+parser.add_argument("--c2-prefix", required=True,
+ help="The output file prefix for the caffe2 model init and predict file. ")
+
+
+def main():
+ args = parser.parse_args()
+ onnx_model = onnx.load(args.model)
+ caffe2_init, caffe2_predict = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
+ caffe2_init_str = caffe2_init.SerializeToString()
+ with open(args.c2_prefix + '.init.pb', "wb") as f:
+ f.write(caffe2_init_str)
+ caffe2_predict_str = caffe2_predict.SerializeToString()
+ with open(args.c2_prefix + '.predict.pb', "wb") as f:
+ f.write(caffe2_predict_str)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_validate.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..da3736cea66b29d20e00a114d5d82d899a7dbe6f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/onnx_validate.py
@@ -0,0 +1,112 @@
+""" ONNX-runtime validation script
+
+This script was created to verify accuracy and performance of exported ONNX
+models running with the onnxruntime. It utilizes the PyTorch dataloader/processing
+pipeline for a fair comparison against the originals.
+
+Copyright 2020 Ross Wightman
+"""
+import argparse
+import numpy as np
+import onnxruntime
+from data import create_loader, resolve_data_config, Dataset
+from utils import AverageMeter
+import time
+
+parser = argparse.ArgumentParser(description='ONNX Runtime ImageNet Validation')
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--onnx-input', default='', type=str, metavar='PATH',
+ help='path to onnx model/weights file')
+parser.add_argument('--onnx-output-opt', default='', type=str, metavar='PATH',
+ help='path to output optimized onnx graph')
+parser.add_argument('--profile', action='store_true', default=False,
+ help='Enable profiler output.')
+parser.add_argument('-j', '--workers', default=2, type=int, metavar='N',
+ help='number of data loading workers (default: 2)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+ metavar='N', help='mini-batch size (default: 256)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT',
+ help='Override default crop pct of 0.875')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true',
+                    help='use tensorflow mnasnet preprocessing')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+ metavar='N', help='print frequency (default: 10)')
+
+
+def main():
+ args = parser.parse_args()
+ args.gpu_id = 0
+
+ # Set graph optimization level
+ sess_options = onnxruntime.SessionOptions()
+ sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+ if args.profile:
+ sess_options.enable_profiling = True
+ if args.onnx_output_opt:
+ sess_options.optimized_model_filepath = args.onnx_output_opt
+
+ session = onnxruntime.InferenceSession(args.onnx_input, sess_options)
+
+ data_config = resolve_data_config(None, args)
+ loader = create_loader(
+ Dataset(args.data, load_bytes=args.tf_preprocessing),
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ use_prefetcher=False,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ crop_pct=data_config['crop_pct'],
+ tensorflow_preprocessing=args.tf_preprocessing)
+
+ input_name = session.get_inputs()[0].name
+
+ batch_time = AverageMeter()
+ top1 = AverageMeter()
+ top5 = AverageMeter()
+ end = time.time()
+ for i, (input, target) in enumerate(loader):
+ # run the net and return prediction
+ output = session.run([], {input_name: input.data.numpy()})
+ output = output[0]
+
+ # measure accuracy and record loss
+ prec1, prec5 = accuracy_np(output, target.numpy())
+ top1.update(prec1.item(), input.size(0))
+ top5.update(prec5.item(), input.size(0))
+
+ # measure elapsed time
+ batch_time.update(time.time() - end)
+ end = time.time()
+
+ if i % args.print_freq == 0:
+ print('Test: [{0}/{1}]\t'
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s, {ms_avg:.3f} ms/sample) \t'
+ 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+ 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+ i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg,
+                    ms_avg=1000 * batch_time.avg / input.size(0), top1=top1, top5=top5))
+
+ print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format(
+ top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg))
+
+
+def accuracy_np(output, target):
+ max_indices = np.argsort(output, axis=1)[:, ::-1]
+ top5 = 100 * np.equal(max_indices[:, :5], target[:, np.newaxis]).sum(axis=1).mean()
+ top1 = 100 * np.equal(max_indices[:, 0], target).mean()
+ return top1, top5
+
+
+if __name__ == '__main__':
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/requirements.txt b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a59ac4eded037190ed20c2cb66c6b8aa802b3c65
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/requirements.txt
@@ -0,0 +1,2 @@
+torch>=1.2.0
+torchvision>=0.4.0
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/setup.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d7c6276e4073b6de7f3ec43ffa01e614e14bd97
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/setup.py
@@ -0,0 +1,47 @@
+""" Setup
+"""
+from setuptools import setup, find_packages
+from codecs import open
+from os import path
+
+here = path.abspath(path.dirname(__file__))
+
+# Get the long description from the README file
+with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+ long_description = f.read()
+
+exec(open('geffnet/version.py').read())
+setup(
+ name='geffnet',
+ version=__version__,
+ description='(Generic) EfficientNets for PyTorch',
+ long_description=long_description,
+ long_description_content_type='text/markdown',
+ url='https://github.com/rwightman/gen-efficientnet-pytorch',
+ author='Ross Wightman',
+ author_email='hello@rwightman.com',
+ classifiers=[
+ # How mature is this project? Common values are
+ # 3 - Alpha
+ # 4 - Beta
+ # 5 - Production/Stable
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: Apache Software License',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Topic :: Scientific/Engineering',
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
+ 'Topic :: Software Development',
+ 'Topic :: Software Development :: Libraries',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
+ ],
+
+ # Note that this is a string of words separated by whitespace, not a list.
+ keywords='pytorch pretrained models efficientnet mixnet mobilenetv3 mnasnet',
+ packages=find_packages(exclude=['data']),
+ install_requires=['torch >= 1.4', 'torchvision'],
+ python_requires='>=3.6',
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eba7616037b08488c795563d0aa37e73a67a878
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/utils.py
@@ -0,0 +1,52 @@
+import os
+
+
+class AverageMeter:
+ """Computes and stores the average and current value"""
+ def __init__(self):
+ self.reset()
+
+ def reset(self):
+ self.val = 0
+ self.avg = 0
+ self.sum = 0
+ self.count = 0
+
+ def update(self, val, n=1):
+ self.val = val
+ self.sum += val * n
+ self.count += n
+ self.avg = self.sum / self.count
+
+
+def accuracy(output, target, topk=(1,)):
+ """Computes the precision@k for the specified values of k"""
+ maxk = max(topk)
+ batch_size = target.size(0)
+
+ _, pred = output.topk(maxk, 1, True, True)
+ pred = pred.t()
+ correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+ res = []
+ for k in topk:
+ correct_k = correct[:k].reshape(-1).float().sum(0)
+ res.append(correct_k.mul_(100.0 / batch_size))
+ return res
+
+
+def get_outdir(path, *paths, inc=False):
+ outdir = os.path.join(path, *paths)
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+ elif inc:
+ count = 1
+ outdir_inc = outdir + '-' + str(count)
+ while os.path.exists(outdir_inc):
+ count = count + 1
+ outdir_inc = outdir + '-' + str(count)
+ assert count < 100
+ outdir = outdir_inc
+ os.makedirs(outdir)
+ return outdir
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/validate.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced583562887e458790a57c70d4e57ffa36c4955
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/efficientnet_repo/validate.py
@@ -0,0 +1,166 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+from contextlib import suppress
+
+import geffnet
+from data import Dataset, create_loader, resolve_data_config
+from utils import accuracy, AverageMeter
+
+has_native_amp = False
+try:
+ if getattr(torch.cuda.amp, 'autocast') is not None:
+ has_native_amp = True
+except AttributeError:
+ pass
+
+torch.backends.cudnn.benchmark = True
+
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation')
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--model', '-m', metavar='MODEL', default='spnasnet_100',
+                    help='model architecture (default: spnasnet_100)')
+parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                    help='number of data loading workers (default: 4)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+ metavar='N', help='mini-batch size (default: 256)')
+parser.add_argument('--img-size', default=None, type=int,
+ metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT',
+ help='Override default crop pct of 0.875')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('--num-classes', type=int, default=1000,
+ help='Number classes in dataset')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+ metavar='N', help='print frequency (default: 10)')
+parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
+ help='path to latest checkpoint (default: none)')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+ help='use pre-trained model')
+parser.add_argument('--torchscript', dest='torchscript', action='store_true',
+ help='convert model torchscript for inference')
+parser.add_argument('--num-gpu', type=int, default=1,
+ help='Number of GPUS to use')
+parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true',
+                    help='use tensorflow mnasnet preprocessing')
+parser.add_argument('--no-cuda', dest='no_cuda', action='store_true',
+                    help='Run validation on CPU (do not use CUDA)')
+parser.add_argument('--channels-last', action='store_true', default=False,
+ help='Use channels_last memory layout')
+parser.add_argument('--amp', action='store_true', default=False,
+ help='Use native Torch AMP mixed precision.')
+
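+# Example invocation (a sketch; the dataset path is illustrative):
+#   python validate.py /path/to/imagenet/val --model tf_efficientnet_b0 --pretrained -b 128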
+
+def main():
+ args = parser.parse_args()
+
+ if not args.checkpoint and not args.pretrained:
+ args.pretrained = True
+
+ amp_autocast = suppress # do nothing
+ if args.amp:
+ if not has_native_amp:
+ print("Native Torch AMP is not available (requires torch >= 1.6), using FP32.")
+ else:
+ amp_autocast = torch.cuda.amp.autocast
+
+ # create model
+ model = geffnet.create_model(
+ args.model,
+ num_classes=args.num_classes,
+ in_chans=3,
+ pretrained=args.pretrained,
+ checkpoint_path=args.checkpoint,
+ scriptable=args.torchscript)
+
+ if args.channels_last:
+ model = model.to(memory_format=torch.channels_last)
+
+ if args.torchscript:
+ torch.jit.optimized_execution(True)
+ model = torch.jit.script(model)
+
+ print('Model %s created, param count: %d' %
+ (args.model, sum([m.numel() for m in model.parameters()])))
+
+ data_config = resolve_data_config(model, args)
+
+ criterion = nn.CrossEntropyLoss()
+
+ if not args.no_cuda:
+ if args.num_gpu > 1:
+ model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
+ else:
+ model = model.cuda()
+ criterion = criterion.cuda()
+
+ loader = create_loader(
+ Dataset(args.data, load_bytes=args.tf_preprocessing),
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ use_prefetcher=not args.no_cuda,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ crop_pct=data_config['crop_pct'],
+ tensorflow_preprocessing=args.tf_preprocessing)
+
+ batch_time = AverageMeter()
+ losses = AverageMeter()
+ top1 = AverageMeter()
+ top5 = AverageMeter()
+
+ model.eval()
+ end = time.time()
+ with torch.no_grad():
+ for i, (input, target) in enumerate(loader):
+ if not args.no_cuda:
+ target = target.cuda()
+ input = input.cuda()
+ if args.channels_last:
+ input = input.contiguous(memory_format=torch.channels_last)
+
+ # compute output
+ with amp_autocast():
+ output = model(input)
+ loss = criterion(output, target)
+
+ # measure accuracy and record loss
+ prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
+ losses.update(loss.item(), input.size(0))
+ top1.update(prec1.item(), input.size(0))
+ top5.update(prec5.item(), input.size(0))
+
+ # measure elapsed time
+ batch_time.update(time.time() - end)
+ end = time.time()
+
+ if i % args.print_freq == 0:
+ print('Test: [{0}/{1}]\t'
+ 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t'
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
+ 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+ 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+ i, len(loader), batch_time=batch_time,
+ rate_avg=input.size(0) / batch_time.avg,
+ loss=losses, top1=top1, top5=top5))
+
+ print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format(
+ top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/encoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..581d8a4632da4ab2c1132fb4a1c8d8fbd1248b5b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/encoder.py
@@ -0,0 +1,34 @@
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Encoder(nn.Module):
+ def __init__(self):
+ super(Encoder, self).__init__()
+
+ basemodel_name = 'tf_efficientnet_b5_ap'
+        print('Loading base model ({})...'.format(basemodel_name), end='')
+ repo_path = os.path.join(os.path.dirname(__file__), 'efficientnet_repo')
+ basemodel = torch.hub.load(repo_path, basemodel_name, pretrained=False, source='local')
+ print('Done.')
+
+        # Remove the final two layers (global_pool & classifier)
+ print('Removing last two layers (global_pool & classifier).')
+ basemodel.global_pool = nn.Identity()
+ basemodel.classifier = nn.Identity()
+
+ self.original_model = basemodel
+
+ def forward(self, x):
+ features = [x]
+ for k, v in self.original_model._modules.items():
+ if (k == 'blocks'):
+ for ki, vi in v._modules.items():
+ features.append(vi(features[-1]))
+ else:
+ features.append(v(features[-1]))
+ return features
+
+
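+# Usage sketch (input resolution is illustrative):
+#   encoder = Encoder()
+#   feats = encoder(torch.randn(1, 3, 480, 640))  # list of feature maps, starting with the input tensor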
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/submodules.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/submodules.py
new file mode 100644
index 0000000000000000000000000000000000000000..97bc08078bc2b4ab7b560eed99e4c942375b8459
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/normalbae/nets/submodules/submodules.py
@@ -0,0 +1,140 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+########################################################################################################################
+
+
+# Upsample + BatchNorm
+class UpSampleBN(nn.Module):
+ def __init__(self, skip_input, output_features):
+ super(UpSampleBN, self).__init__()
+
+ self._net = nn.Sequential(nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU(),
+ nn.Conv2d(output_features, output_features, kernel_size=3, stride=1, padding=1),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU())
+
+ def forward(self, x, concat_with):
+ up_x = F.interpolate(x, size=[concat_with.size(2), concat_with.size(3)], mode='bilinear', align_corners=True)
+ f = torch.cat([up_x, concat_with], dim=1)
+ return self._net(f)
+
+
+# Upsample + GroupNorm + Weight Standardization
+class UpSampleGN(nn.Module):
+ def __init__(self, skip_input, output_features):
+ super(UpSampleGN, self).__init__()
+
+ self._net = nn.Sequential(Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1),
+ nn.GroupNorm(8, output_features),
+ nn.LeakyReLU(),
+ Conv2d(output_features, output_features, kernel_size=3, stride=1, padding=1),
+ nn.GroupNorm(8, output_features),
+ nn.LeakyReLU())
+
+ def forward(self, x, concat_with):
+ up_x = F.interpolate(x, size=[concat_with.size(2), concat_with.size(3)], mode='bilinear', align_corners=True)
+ f = torch.cat([up_x, concat_with], dim=1)
+ return self._net(f)
+
+
+# Conv2d with weight standardization
+class Conv2d(nn.Conv2d):
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+ padding=0, dilation=1, groups=1, bias=True):
+ super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, stride,
+ padding, dilation, groups, bias)
+
+ def forward(self, x):
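+        # Standardize the kernel per output channel (subtract its mean, divide by its std plus a
+        # small epsilon) before convolving; the bias term is left unchanged.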
+ weight = self.weight
+ weight_mean = weight.mean(dim=1, keepdim=True).mean(dim=2,
+ keepdim=True).mean(dim=3, keepdim=True)
+ weight = weight - weight_mean
+ std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5
+ weight = weight / std.expand_as(weight)
+ return F.conv2d(x, weight, self.bias, self.stride,
+ self.padding, self.dilation, self.groups)
+
+
+# normalize
+def norm_normalize(norm_out):
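+    # Splits a (B, 4, H, W) prediction into a unit-length surface normal (channels 0-2)
+    # and a positive concentration kappa (channel 3, bounded below by min_kappa).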
+ min_kappa = 0.01
+ norm_x, norm_y, norm_z, kappa = torch.split(norm_out, 1, dim=1)
+ norm = torch.sqrt(norm_x ** 2.0 + norm_y ** 2.0 + norm_z ** 2.0) + 1e-10
+ kappa = F.elu(kappa) + 1.0 + min_kappa
+ final_out = torch.cat([norm_x / norm, norm_y / norm, norm_z / norm, kappa], dim=1)
+ return final_out
+
+
+# uncertainty-guided sampling (only used during training)
+@torch.no_grad()
+def sample_points(init_normal, gt_norm_mask, sampling_ratio, beta):
+ device = init_normal.device
+ B, _, H, W = init_normal.shape
+ N = int(sampling_ratio * H * W)
+ beta = beta
+
+ # uncertainty map
+ uncertainty_map = -1 * init_normal[:, 3, :, :] # B, H, W
+
+ # gt_invalid_mask (B, H, W)
+ if gt_norm_mask is not None:
+ gt_invalid_mask = F.interpolate(gt_norm_mask.float(), size=[H, W], mode='nearest')
+ gt_invalid_mask = gt_invalid_mask[:, 0, :, :] < 0.5
+ uncertainty_map[gt_invalid_mask] = -1e4
+
+ # (B, H*W)
+ _, idx = uncertainty_map.view(B, -1).sort(1, descending=True)
+
+ # importance sampling
+ if int(beta * N) > 0:
+ importance = idx[:, :int(beta * N)] # B, beta*N
+
+ # remaining
+ remaining = idx[:, int(beta * N):] # B, H*W - beta*N
+
+ # coverage
+ num_coverage = N - int(beta * N)
+
+ if num_coverage <= 0:
+ samples = importance
+ else:
+ coverage_list = []
+ for i in range(B):
+ idx_c = torch.randperm(remaining.size()[1]) # shuffles "H*W - beta*N"
+ coverage_list.append(remaining[i, :][idx_c[:num_coverage]].view(1, -1)) # 1, N-beta*N
+ coverage = torch.cat(coverage_list, dim=0) # B, N-beta*N
+ samples = torch.cat((importance, coverage), dim=1) # B, N
+
+ else:
+ # remaining
+ remaining = idx[:, :] # B, H*W
+
+ # coverage
+ num_coverage = N
+
+ coverage_list = []
+ for i in range(B):
+ idx_c = torch.randperm(remaining.size()[1]) # shuffles "H*W - beta*N"
+ coverage_list.append(remaining[i, :][idx_c[:num_coverage]].view(1, -1)) # 1, N-beta*N
+ coverage = torch.cat(coverage_list, dim=0) # B, N-beta*N
+ samples = coverage
+
+ # point coordinates
+ rows_int = samples // W # 0 for first row, H-1 for last row
+ rows_float = rows_int / float(H-1) # 0 to 1.0
+ rows_float = (rows_float * 2.0) - 1.0 # -1.0 to 1.0
+
+ cols_int = samples % W # 0 for first column, W-1 for last column
+ cols_float = cols_int / float(W-1) # 0 to 1.0
+ cols_float = (cols_float * 2.0) - 1.0 # -1.0 to 1.0
+
+ point_coords = torch.zeros(B, 1, N, 2)
+ point_coords[:, 0, :, 0] = cols_float # x coord
+ point_coords[:, 0, :, 1] = rows_float # y coord
+ point_coords = point_coords.to(device)
+ return point_coords, rows_int, cols_int
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a60b71f47f9558ed3b895c1e4b32d0c78d05f12
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/__init__.py
@@ -0,0 +1,48 @@
+import os
+from .api import make_detectron2_model, semantic_run
+from pathlib import Path
+import warnings
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME
+import numpy as np
+import cv2
+from PIL import Image
+
+DEFAULT_CONFIGS = {
+ "coco": {
+ "name": "150_16_swin_l_oneformer_coco_100ep.pth",
+ "config": os.path.join(os.path.dirname(__file__), 'configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml')
+ },
+ "ade20k": {
+ "name": "250_16_swin_l_oneformer_ade20k_160k.pth",
+ "config": os.path.join(os.path.dirname(__file__), 'configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml')
+ }
+}
+
+
+class OneformerSegmentor:
+ def __init__(self, model, metadata):
+ self.model = model
+ self.metadata = metadata
+
+ def to(self, device):
+ self.model.model.to(device)
+ return self
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="250_16_swin_l_oneformer_ade20k_160k.pth", config_path = None):
+ config_path = config_path or DEFAULT_CONFIGS["ade20k" if "ade20k" in filename else "coco"]["config"]
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ model, metadata = make_detectron2_model(config_path, model_path)
+
+ return cls(model, metadata)
+
+ def __call__(self, input_image=None, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ detected_map = semantic_run(input_image, self.model, self.metadata)
+ detected_map = remove_pad(HWC3(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
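+
+
+# Usage sketch (illustrative; the defaults above resolve to the ADE20K checkpoint):
+#   segmentor = OneformerSegmentor.from_pretrained().to("cuda")
+#   seg_map = segmentor(input_image, detect_resolution=512, output_type="pil")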
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/api.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff515b4189bc4324d981be4fb32f1620c3fa922e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/api.py
@@ -0,0 +1,39 @@
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+import torch
+
+from custom_detectron2.config import get_cfg
+from custom_detectron2.projects.deeplab import add_deeplab_config
+from custom_detectron2.data import MetadataCatalog
+
+from custom_oneformer import (
+ add_oneformer_config,
+ add_common_config,
+ add_swin_config,
+ add_dinat_config,
+)
+
+from custom_oneformer.demo.defaults import DefaultPredictor
+from custom_oneformer.demo.visualizer import Visualizer, ColorMode
+
+
+def make_detectron2_model(config_path, ckpt_path):
+ cfg = get_cfg()
+ add_deeplab_config(cfg)
+ add_common_config(cfg)
+ add_swin_config(cfg)
+ add_oneformer_config(cfg)
+ add_dinat_config(cfg)
+ cfg.merge_from_file(config_path)
+ cfg.MODEL.WEIGHTS = ckpt_path
+ cfg.freeze()
+ metadata = MetadataCatalog.get(cfg.DATASETS.TEST_PANOPTIC[0] if len(cfg.DATASETS.TEST_PANOPTIC) else "__unused")
+ return DefaultPredictor(cfg), metadata
+
+
+def semantic_run(img, predictor, metadata):
+ predictions = predictor(img[:, :, ::-1], "semantic") # Predictor of OneFormer must use BGR image !!!
+ visualizer_map = Visualizer(img, is_img=False, metadata=metadata, instance_mode=ColorMode.IMAGE)
+ out_map = visualizer_map.draw_sem_seg(predictions["sem_seg"].argmax(dim=0).cpu(), alpha=1, is_text=False).get_image()
+ return out_map
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a9a308b11fd3722f6ee25a379018f46beed008d1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml
@@ -0,0 +1,68 @@
+MODEL:
+ BACKBONE:
+ FREEZE_AT: 0
+ NAME: "build_resnet_backbone"
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
+ PIXEL_STD: [58.395, 57.120, 57.375]
+ RESNETS:
+ DEPTH: 50
+ STEM_TYPE: "basic" # not used
+ STEM_OUT_CHANNELS: 64
+ STRIDE_IN_1X1: False
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+ # NORM: "SyncBN"
+ RES5_MULTI_GRID: [1, 1, 1] # not used
+DATASETS:
+ TRAIN: ("ade20k_panoptic_train",)
+ TEST_PANOPTIC: ("ade20k_panoptic_val",)
+ TEST_INSTANCE: ("ade20k_instance_val",)
+ TEST_SEMANTIC: ("ade20k_sem_seg_val",)
+SOLVER:
+ IMS_PER_BATCH: 16
+ BASE_LR: 0.0001
+ MAX_ITER: 160000
+ WARMUP_FACTOR: 1.0
+ WARMUP_ITERS: 0
+ WEIGHT_DECAY: 0.05
+ OPTIMIZER: "ADAMW"
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
+ BACKBONE_MULTIPLIER: 0.1
+ CLIP_GRADIENTS:
+ ENABLED: True
+ CLIP_TYPE: "full_model"
+ CLIP_VALUE: 0.01
+ NORM_TYPE: 2.0
+ AMP:
+ ENABLED: True
+INPUT:
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
+ MIN_SIZE_TEST: 512
+ MAX_SIZE_TRAIN: 2048
+ MAX_SIZE_TEST: 2048
+ CROP:
+ ENABLED: True
+ TYPE: "absolute"
+ SIZE: (512, 512)
+ SINGLE_CATEGORY_MAX_AREA: 1.0
+ COLOR_AUG_SSD: True
+ SIZE_DIVISIBILITY: 512 # used in dataset mapper
+ FORMAT: "RGB"
+ DATASET_MAPPER_NAME: "oneformer_unified"
+ MAX_SEQ_LEN: 77
+ TASK_SEQ_LEN: 77
+ TASK_PROB:
+ SEMANTIC: 0.33
+ INSTANCE: 0.66
+TEST:
+ EVAL_PERIOD: 5000
+ AUG:
+ ENABLED: False
+ MIN_SIZES: [256, 384, 512, 640, 768, 896]
+ MAX_SIZE: 3584
+ FLIP: True
+DATALOADER:
+ FILTER_EMPTY_ANNOTATIONS: True
+ NUM_WORKERS: 4
+VERSION: 2
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/oneformer_R50_bs16_160k.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/oneformer_R50_bs16_160k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e168c09a388e136435ed69452621b66effbce599
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/oneformer_R50_bs16_160k.yaml
@@ -0,0 +1,58 @@
+_BASE_: Base-ADE20K-UnifiedSegmentation.yaml
+MODEL:
+ META_ARCHITECTURE: "OneFormer"
+ SEM_SEG_HEAD:
+ NAME: "OneFormerHead"
+ IGNORE_VALUE: 255
+ NUM_CLASSES: 150
+ LOSS_WEIGHT: 1.0
+ CONVS_DIM: 256
+ MASK_DIM: 256
+ NORM: "GN"
+ # pixel decoder
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+ COMMON_STRIDE: 4
+ TRANSFORMER_ENC_LAYERS: 6
+ ONE_FORMER:
+ TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+ DEEP_SUPERVISION: True
+ NO_OBJECT_WEIGHT: 0.1
+ CLASS_WEIGHT: 2.0
+ MASK_WEIGHT: 5.0
+ DICE_WEIGHT: 5.0
+ CONTRASTIVE_WEIGHT: 0.5
+ CONTRASTIVE_TEMPERATURE: 0.07
+ HIDDEN_DIM: 256
+ NUM_OBJECT_QUERIES: 150
+ USE_TASK_NORM: True
+ NHEADS: 8
+ DROPOUT: 0.1
+ DIM_FEEDFORWARD: 2048
+ ENC_LAYERS: 0
+ PRE_NORM: False
+ ENFORCE_INPUT_PROJ: False
+ SIZE_DIVISIBILITY: 32
+ CLASS_DEC_LAYERS: 2
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
+ TRAIN_NUM_POINTS: 12544
+ OVERSAMPLE_RATIO: 3.0
+ IMPORTANCE_SAMPLE_RATIO: 0.75
+ TEXT_ENCODER:
+ WIDTH: 256
+ CONTEXT_LENGTH: 77
+ NUM_LAYERS: 6
+ VOCAB_SIZE: 49408
+ PROJ_NUM_LAYERS: 2
+ N_CTX: 16
+ TEST:
+ SEMANTIC_ON: True
+ INSTANCE_ON: True
+ PANOPTIC_ON: True
+ OVERLAP_THRESHOLD: 0.8
+ OBJECT_MASK_THRESHOLD: 0.8
+ TASK: "panoptic"
+TEST:
+ DETECTIONS_PER_IMAGE: 150
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3a2a8f6983091e582a12d3b276f440a824a1db35
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml
@@ -0,0 +1,40 @@
+_BASE_: oneformer_R50_bs16_160k.yaml
+MODEL:
+ BACKBONE:
+ NAME: "D2SwinTransformer"
+ SWIN:
+ EMBED_DIM: 192
+ DEPTHS: [2, 2, 18, 2]
+ NUM_HEADS: [6, 12, 24, 48]
+ WINDOW_SIZE: 12
+ APE: False
+ DROP_PATH_RATE: 0.3
+ PATCH_NORM: True
+ PRETRAIN_IMG_SIZE: 384
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
+ PIXEL_STD: [58.395, 57.120, 57.375]
+ ONE_FORMER:
+ NUM_OBJECT_QUERIES: 250
+INPUT:
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
+ MIN_SIZE_TEST: 640
+ MAX_SIZE_TRAIN: 2560
+ MAX_SIZE_TEST: 2560
+ CROP:
+ ENABLED: True
+ TYPE: "absolute"
+ SIZE: (640, 640)
+ SINGLE_CATEGORY_MAX_AREA: 1.0
+ COLOR_AUG_SSD: True
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
+ FORMAT: "RGB"
+TEST:
+ DETECTIONS_PER_IMAGE: 250
+ EVAL_PERIOD: 5000
+ AUG:
+ ENABLED: False
+ MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+ MAX_SIZE: 4480
+ FLIP: True
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/Base-COCO-UnifiedSegmentation.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/Base-COCO-UnifiedSegmentation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9fffec379393aead5a258073d7451d6de816c5f8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/Base-COCO-UnifiedSegmentation.yaml
@@ -0,0 +1,54 @@
+MODEL:
+ BACKBONE:
+ FREEZE_AT: 0
+ NAME: "build_resnet_backbone"
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
+ PIXEL_STD: [58.395, 57.120, 57.375]
+ RESNETS:
+ DEPTH: 50
+ STEM_TYPE: "basic" # not used
+ STEM_OUT_CHANNELS: 64
+ STRIDE_IN_1X1: False
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+ # NORM: "SyncBN"
+ RES5_MULTI_GRID: [1, 1, 1] # not used
+DATASETS:
+ TRAIN: ("coco_2017_train_panoptic_with_sem_seg",)
+ TEST_PANOPTIC: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well
+ TEST_INSTANCE: ("coco_2017_val",)
+ TEST_SEMANTIC: ("coco_2017_val_panoptic_with_sem_seg",)
+SOLVER:
+ IMS_PER_BATCH: 16
+ BASE_LR: 0.0001
+ STEPS: (327778, 355092)
+ MAX_ITER: 368750
+ WARMUP_FACTOR: 1.0
+ WARMUP_ITERS: 10
+ WEIGHT_DECAY: 0.05
+ OPTIMIZER: "ADAMW"
+ BACKBONE_MULTIPLIER: 0.1
+ CLIP_GRADIENTS:
+ ENABLED: True
+ CLIP_TYPE: "full_model"
+ CLIP_VALUE: 0.01
+ NORM_TYPE: 2.0
+ AMP:
+ ENABLED: True
+INPUT:
+ IMAGE_SIZE: 1024
+ MIN_SCALE: 0.1
+ MAX_SCALE: 2.0
+ FORMAT: "RGB"
+ DATASET_MAPPER_NAME: "coco_unified_lsj"
+ MAX_SEQ_LEN: 77
+ TASK_SEQ_LEN: 77
+ TASK_PROB:
+ SEMANTIC: 0.33
+ INSTANCE: 0.66
+TEST:
+ EVAL_PERIOD: 5000
+DATALOADER:
+ FILTER_EMPTY_ANNOTATIONS: True
+ NUM_WORKERS: 4
+VERSION: 2
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/oneformer_R50_bs16_50ep.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/oneformer_R50_bs16_50ep.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1590e52f6ef4add3783a114daee0f0eefd4d2582
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/oneformer_R50_bs16_50ep.yaml
@@ -0,0 +1,59 @@
+_BASE_: Base-COCO-UnifiedSegmentation.yaml
+MODEL:
+ META_ARCHITECTURE: "OneFormer"
+ SEM_SEG_HEAD:
+ NAME: "OneFormerHead"
+ IGNORE_VALUE: 255
+ NUM_CLASSES: 133
+ LOSS_WEIGHT: 1.0
+ CONVS_DIM: 256
+ MASK_DIM: 256
+ NORM: "GN"
+ # pixel decoder
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+ COMMON_STRIDE: 4
+ TRANSFORMER_ENC_LAYERS: 6
+ ONE_FORMER:
+ TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+ DEEP_SUPERVISION: True
+ NO_OBJECT_WEIGHT: 0.1
+ CLASS_WEIGHT: 2.0
+ MASK_WEIGHT: 5.0
+ DICE_WEIGHT: 5.0
+ CONTRASTIVE_WEIGHT: 0.5
+ CONTRASTIVE_TEMPERATURE: 0.07
+ HIDDEN_DIM: 256
+ NUM_OBJECT_QUERIES: 150
+ USE_TASK_NORM: True
+ NHEADS: 8
+ DROPOUT: 0.1
+ DIM_FEEDFORWARD: 2048
+ ENC_LAYERS: 0
+ PRE_NORM: False
+ ENFORCE_INPUT_PROJ: False
+ SIZE_DIVISIBILITY: 32
+ CLASS_DEC_LAYERS: 2
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
+ TRAIN_NUM_POINTS: 12544
+ OVERSAMPLE_RATIO: 3.0
+ IMPORTANCE_SAMPLE_RATIO: 0.75
+ TEXT_ENCODER:
+ WIDTH: 256
+ CONTEXT_LENGTH: 77
+ NUM_LAYERS: 6
+ VOCAB_SIZE: 49408
+ PROJ_NUM_LAYERS: 2
+ N_CTX: 16
+ TEST:
+ SEMANTIC_ON: True
+ INSTANCE_ON: True
+ PANOPTIC_ON: True
+ DETECTION_ON: False
+ OVERLAP_THRESHOLD: 0.8
+ OBJECT_MASK_THRESHOLD: 0.8
+ TASK: "panoptic"
+TEST:
+ DETECTIONS_PER_IMAGE: 150
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e569a8d7d10a967ee636e843e457e47fc79fbec9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/oneformer/configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml
@@ -0,0 +1,25 @@
+_BASE_: oneformer_R50_bs16_50ep.yaml
+MODEL:
+ BACKBONE:
+ NAME: "D2SwinTransformer"
+ SWIN:
+ EMBED_DIM: 192
+ DEPTHS: [2, 2, 18, 2]
+ NUM_HEADS: [6, 12, 24, 48]
+ WINDOW_SIZE: 12
+ APE: False
+ DROP_PATH_RATE: 0.3
+ PATCH_NORM: True
+ PRETRAIN_IMG_SIZE: 384
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
+ PIXEL_STD: [58.395, 57.120, 57.375]
+ ONE_FORMER:
+ NUM_OBJECT_QUERIES: 150
+SOLVER:
+ STEPS: (655556, 735184)
+ MAX_ITER: 737500
+ AMP:
+ ENABLED: False
+TEST:
+ DETECTIONS_PER_IMAGE: 150
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..9df4f7f1b9e35ee40d387e765cebdc7d2af06a5e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/LICENSE
@@ -0,0 +1,108 @@
+OPENPOSE: MULTIPERSON KEYPOINT DETECTION
+SOFTWARE LICENSE AGREEMENT
+ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
+
+BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
+
+This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
+
+RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
+Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
+non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
+
+CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
+
+COPYRIGHT: The Software is owned by Licensor and is protected by United
+States copyright laws and applicable international treaties and/or conventions.
+
+PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
+
+DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
+
+BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
+
+USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.
+
+You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
+
+ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
+
+TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
+
+The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
+
+FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
+
+DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
+
+SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
+
+EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
+
+EXPORT REGULATION: Licensee agrees to comply with any and all applicable
+U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
+
+SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
+
+NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
+
+GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.
+
+ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
+
+
+
+************************************************************************
+
+THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
+
+This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
+
+1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
+
+COPYRIGHT
+
+All contributions by the University of California:
+Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014-2017, the respective contributors
+All rights reserved.
+
+Caffe uses a shared copyright model: each contributor holds copyright over
+their contributions to Caffe. The project versioning records all such
+contribution and copyright details. If a contributor wants to further mark
+their specific copyright on a particular contribution, they should indicate
+their copyright solely in the commit message of the change when it is
+committed.
+
+LICENSE
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+CONTRIBUTION AGREEMENT
+
+By contributing to the BVLC/caffe repository through pull-request, comment,
+or otherwise, the contributor releases their content to the
+license and copyright terms herein.
+
+************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..124095a3211426be8a499c255413e6808c644535
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__init__.py
@@ -0,0 +1,238 @@
+# Openpose
+# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
+# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
+# 3rd Edited by ControlNet
+# 4th Edited by ControlNet (added face and correct hands)
+# 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixes)
+# This preprocessor is licensed by CMU for non-commercial use only.
+
+
+import os
+
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+import warnings
+from typing import List, NamedTuple, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME
+from . import util
+from .body import Body, BodyResult, Keypoint
+from .face import Face
+from .hand import Hand
+
+HandResult = List[Keypoint]
+FaceResult = List[Keypoint]
+
+class PoseResult(NamedTuple):
+ body: BodyResult
+ left_hand: Union[HandResult, None]
+ right_hand: Union[HandResult, None]
+ face: Union[FaceResult, None]
+
+def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True, xinsr_stick_scaling=False):
+ """
+ Draw the detected poses on an empty canvas.
+
+ Args:
+ poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
+ H (int): The height of the canvas.
+ W (int): The width of the canvas.
+ draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
+ draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
+        draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.
+        xinsr_stick_scaling (bool, optional): Whether to scale the limb stick width for the xinsr OpenPose ControlNet. Defaults to False.
+
+ Returns:
+ numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
+ """
+ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
+
+ for pose in poses:
+ if draw_body:
+ canvas = util.draw_bodypose(canvas, pose.body.keypoints, xinsr_stick_scaling)
+
+ if draw_hand:
+ canvas = util.draw_handpose(canvas, pose.left_hand)
+ canvas = util.draw_handpose(canvas, pose.right_hand)
+
+ if draw_face:
+ canvas = util.draw_facepose(canvas, pose.face)
+
+ return canvas
+
+def encode_poses_as_dict(poses: List[PoseResult], canvas_height: int, canvas_width: int) -> dict:
+ """ Encode the pose as a dict following openpose JSON output format:
+ https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md
+ """
+ def compress_keypoints(keypoints: Union[List[Keypoint], None]) -> Union[List[float], None]:
+ if not keypoints:
+ return None
+
+ return [
+ value
+ for keypoint in keypoints
+ for value in (
+ [float(keypoint.x), float(keypoint.y), 1.0]
+ if keypoint is not None
+ else [0.0, 0.0, 0.0]
+ )
+ ]
+
+ return {
+ 'people': [
+ {
+                'pose_keypoints_2d': compress_keypoints(pose.body.keypoints),
+                'face_keypoints_2d': compress_keypoints(pose.face),
+                'hand_left_keypoints_2d': compress_keypoints(pose.left_hand),
+                'hand_right_keypoints_2d': compress_keypoints(pose.right_hand),
+ }
+ for pose in poses
+ ],
+ 'canvas_height': canvas_height,
+ 'canvas_width': canvas_width,
+ }
+
+class OpenposeDetector:
+ """
+ A class for detecting human poses in images using the Openpose model.
+
+ Attributes:
+        body_estimation (Body): Body keypoint estimation model.
+        hand_estimation (Hand, optional): Hand keypoint estimation model.
+        face_estimation (Face, optional): Face landmark estimation model.
+ """
+ def __init__(self, body_estimation, hand_estimation=None, face_estimation=None):
+ self.body_estimation = body_estimation
+ self.hand_estimation = hand_estimation
+ self.face_estimation = face_estimation
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="body_pose_model.pth", hand_filename="hand_pose_model.pth", face_filename="facenet.pth"):
+ if pretrained_model_or_path == "lllyasviel/ControlNet":
+ subfolder = "annotator/ckpts"
+ face_pretrained_model_or_path = "lllyasviel/Annotators"
+
+ else:
+ subfolder = ''
+ face_pretrained_model_or_path = pretrained_model_or_path
+
+ body_model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder=subfolder)
+ hand_model_path = custom_hf_download(pretrained_model_or_path, hand_filename, subfolder=subfolder)
+ face_model_path = custom_hf_download(face_pretrained_model_or_path, face_filename, subfolder=subfolder)
+
+ body_estimation = Body(body_model_path)
+ hand_estimation = Hand(hand_model_path)
+ face_estimation = Face(face_model_path)
+
+ return cls(body_estimation, hand_estimation, face_estimation)
+
+ def to(self, device):
+ self.body_estimation.to(device)
+ self.hand_estimation.to(device)
+ self.face_estimation.to(device)
+ return self
+
+ def detect_hands(self, body: BodyResult, oriImg) -> Tuple[Union[HandResult, None], Union[HandResult, None]]:
+ left_hand = None
+ right_hand = None
+ H, W, _ = oriImg.shape
+ for x, y, w, is_left in util.handDetect(body, oriImg):
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]).astype(np.float32)
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
+
+ hand_result = [
+ Keypoint(x=peak[0], y=peak[1])
+ for peak in peaks
+ ]
+
+ if is_left:
+ left_hand = hand_result
+ else:
+ right_hand = hand_result
+
+ return left_hand, right_hand
+
+ def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]:
+ face = util.faceDetect(body, oriImg)
+ if face is None:
+ return None
+
+ x, y, w = face
+ H, W, _ = oriImg.shape
+ heatmaps = self.face_estimation(oriImg[y:y+w, x:x+w, :])
+ peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
+ return [
+ Keypoint(x=peak[0], y=peak[1])
+ for peak in peaks
+ ]
+
+ return None
+
+ def detect_poses(self, oriImg, include_hand=False, include_face=False) -> List[PoseResult]:
+ """
+ Detect poses in the given image.
+ Args:
+ oriImg (numpy.ndarray): The input image for pose detection.
+ include_hand (bool, optional): Whether to include hand detection. Defaults to False.
+ include_face (bool, optional): Whether to include face detection. Defaults to False.
+
+ Returns:
+ List[PoseResult]: A list of PoseResult objects containing the detected poses.
+ """
+ oriImg = oriImg[:, :, ::-1].copy()
+ H, W, C = oriImg.shape
+ with torch.no_grad():
+ candidate, subset = self.body_estimation(oriImg)
+ bodies = self.body_estimation.format_body_result(candidate, subset)
+
+ results = []
+ for body in bodies:
+ left_hand, right_hand, face = (None,) * 3
+ if include_hand:
+ left_hand, right_hand = self.detect_hands(body, oriImg)
+ if include_face:
+ face = self.detect_face(body, oriImg)
+
+ results.append(PoseResult(BodyResult(
+ keypoints=[
+ Keypoint(
+ x=keypoint.x / float(W),
+ y=keypoint.y / float(H)
+ ) if keypoint is not None else None
+ for keypoint in body.keypoints
+ ],
+ total_score=body.total_score,
+ total_parts=body.total_parts
+ ), left_hand, right_hand, face))
+
+ return results
+
+ def __call__(self, input_image, detect_resolution=512, include_body=True, include_hand=False, include_face=False, hand_and_face=None, output_type="pil", image_and_json=False, upscale_method="INTER_CUBIC", xinsr_stick_scaling=False, **kwargs):
+ if hand_and_face is not None:
+ warnings.warn("hand_and_face is deprecated. Use include_hand and include_face instead.", DeprecationWarning)
+ include_hand = hand_and_face
+ include_face = hand_and_face
+
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ poses = self.detect_poses(input_image, include_hand=include_hand, include_face=include_face)
+ canvas = draw_poses(poses, input_image.shape[0], input_image.shape[1], draw_body=include_body, draw_hand=include_hand, draw_face=include_face, xinsr_stick_scaling=xinsr_stick_scaling)
+        detected_map = HWC3(remove_pad(canvas))
+        # Capture the dimensions before the optional PIL conversion, since a
+        # PIL Image has no .shape attribute.
+        out_H, out_W = detected_map.shape[:2]
+
+        if output_type == "pil":
+            detected_map = Image.fromarray(detected_map)
+
+        if image_and_json:
+            return (detected_map, encode_poses_as_dict(poses, out_H, out_W))
+
+        return detected_map
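
A minimal usage sketch for the detector defined above, assuming the package is importable as custom_controlnet_aux, that the default Hugging Face checkpoints can be downloaded, and that common_input_validate accepts the "np" output type; the random array only stands in for a real photo.

import numpy as np
from custom_controlnet_aux.open_pose import OpenposeDetector

detector = OpenposeDetector.from_pretrained().to("cpu")

image = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)  # stand-in for a real H x W x 3 image

pose_map, pose_dict = detector(
    image,
    include_hand=True,
    include_face=True,
    output_type="np",      # keep the rendered pose map as a numpy array
    image_and_json=True,   # also return the OpenPose-format dict
)
print(pose_map.shape, len(pose_dict["people"]))
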
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/__init__.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7eed9bb27054b2e3abf9c2686abaac85c8645515
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/__init__.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/body.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/body.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7a14c3bb2664cacc7c49dda5829d3d819f7a210
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/body.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/face.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/face.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..349258df86cd4475ec27b4592e7469e59370be1c
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/face.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/hand.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/hand.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c17f0cfcadcaa56916a91a7716c6f7a804537060
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/hand.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/model.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/model.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..120e26df867b5c6d226d8e208eb2ca4728dc4b68
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/model.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/util.cpython-312.pyc b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/util.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..309fc414f4562407188ec1341ac3589e01413f9d
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/__pycache__/util.cpython-312.pyc differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/body.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/body.py
new file mode 100644
index 0000000000000000000000000000000000000000..513320df1d8b1e1f18c598c6b44a6b6f45894f24
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/body.py
@@ -0,0 +1,278 @@
+import math
+from typing import List, NamedTuple, Union
+
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from scipy.ndimage import gaussian_filter  # scipy.ndimage.filters is a deprecated alias
+
+from . import util
+from .model import bodypose_model
+
+
+class Keypoint(NamedTuple):
+ x: float
+ y: float
+ score: float = 1.0
+ id: int = -1
+
+
+class BodyResult(NamedTuple):
+    # Note: Using `Union` instead of the `|` operator, as the latter is a
+    # Python 3.10 feature.
+    # Annotator code should be Python 3.8 compatible, as the controlnet repo
+    # uses a Python 3.8 environment.
+ # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
+ keypoints: List[Union[Keypoint, None]]
+ total_score: float
+ total_parts: int
+
+
+class Body(object):
+ def __init__(self, model_path):
+ self.model = bodypose_model()
+ model_dict = util.transfer(self.model, torch.load(model_path))
+ self.model.load_state_dict(model_dict)
+ self.model.eval()
+ self.device = "cpu"
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, oriImg):
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
+ scale_search = [0.5]
+ boxsize = 368
+ stride = 8
+ padValue = 128
+ thre1 = 0.1
+ thre2 = 0.05
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
+
+ for m in range(len(multiplier)):
+ scale = multiplier[m]
+ imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+ im = np.ascontiguousarray(im)
+
+ data = torch.from_numpy(im).float()
+ data = data.to(self.device)
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
+ with torch.no_grad():
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
+
+ # extract outputs, resize, and remove padding
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))
+
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
+ paf = util.smart_resize_k(paf, fx=stride, fy=stride)
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))
+
+            heatmap_avg += heatmap / len(multiplier)
+            paf_avg += paf / len(multiplier)
+
+ all_peaks = []
+ peak_counter = 0
+
+ for part in range(18):
+ map_ori = heatmap_avg[:, :, part]
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
+
+ map_left = np.zeros(one_heatmap.shape)
+ map_left[1:, :] = one_heatmap[:-1, :]
+ map_right = np.zeros(one_heatmap.shape)
+ map_right[:-1, :] = one_heatmap[1:, :]
+ map_up = np.zeros(one_heatmap.shape)
+ map_up[:, 1:] = one_heatmap[:, :-1]
+ map_down = np.zeros(one_heatmap.shape)
+ map_down[:, :-1] = one_heatmap[:, 1:]
+
+ peaks_binary = np.logical_and.reduce(
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
+ peak_id = range(peak_counter, peak_counter + len(peaks))
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
+
+ all_peaks.append(peaks_with_score_and_id)
+ peak_counter += len(peaks)
+
+ # find connection in the specified sequence, center 29 is in the position 15
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
+ [1, 16], [16, 18], [3, 17], [6, 18]]
+        # the middle joints heatmap correspondence
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
+ [55, 56], [37, 38], [45, 46]]
+
+ connection_all = []
+ special_k = []
+ mid_num = 10
+
+ for k in range(len(mapIdx)):
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
+ candA = all_peaks[limbSeq[k][0] - 1]
+ candB = all_peaks[limbSeq[k][1] - 1]
+ nA = len(candA)
+ nB = len(candB)
+ indexA, indexB = limbSeq[k]
+ if (nA != 0 and nB != 0):
+ connection_candidate = []
+ for i in range(nA):
+ for j in range(nB):
+ vec = np.subtract(candB[j][:2], candA[i][:2])
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
+ norm = max(0.001, norm)
+ vec = np.divide(vec, norm)
+
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
+
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
+ for I in range(len(startend))])
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
+ for I in range(len(startend))])
+
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
+ criterion2 = score_with_dist_prior > 0
+ if criterion1 and criterion2:
+ connection_candidate.append(
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
+
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
+ connection = np.zeros((0, 5))
+ for c in range(len(connection_candidate)):
+ i, j, s = connection_candidate[c][0:3]
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
+ if (len(connection) >= min(nA, nB)):
+ break
+
+ connection_all.append(connection)
+ else:
+ special_k.append(k)
+ connection_all.append([])
+
+ # last number in each row is the total parts number of that person
+ # the second last number in each row is the score of the overall configuration
+ subset = -1 * np.ones((0, 20))
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
+
+ for k in range(len(mapIdx)):
+ if k not in special_k:
+ partAs = connection_all[k][:, 0]
+ partBs = connection_all[k][:, 1]
+ indexA, indexB = np.array(limbSeq[k]) - 1
+
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
+ found = 0
+ subset_idx = [-1, -1]
+ for j in range(len(subset)): # 1:size(subset,1):
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
+ subset_idx[found] = j
+ found += 1
+
+ if found == 1:
+ j = subset_idx[0]
+ if subset[j][indexB] != partBs[i]:
+ subset[j][indexB] = partBs[i]
+ subset[j][-1] += 1
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+ elif found == 2: # if found 2 and disjoint, merge them
+ j1, j2 = subset_idx
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
+ subset[j1][-2:] += subset[j2][-2:]
+ subset[j1][-2] += connection_all[k][i][2]
+ subset = np.delete(subset, j2, 0)
+ else: # as like found == 1
+ subset[j1][indexB] = partBs[i]
+ subset[j1][-1] += 1
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+
+ # if find no partA in the subset, create a new subset
+ elif not found and k < 17:
+ row = -1 * np.ones(20)
+ row[indexA] = partAs[i]
+ row[indexB] = partBs[i]
+ row[-1] = 2
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
+ subset = np.vstack([subset, row])
+ # delete some rows of subset which has few parts occur
+ deleteIdx = []
+ for i in range(len(subset)):
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
+ deleteIdx.append(i)
+ subset = np.delete(subset, deleteIdx, axis=0)
+
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
+ # candidate: x, y, score, id
+ return candidate, subset
+
+ @staticmethod
+ def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
+ """
+ Format the body results from the candidate and subset arrays into a list of BodyResult objects.
+
+ Args:
+ candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
+ for each body part.
+ subset (np.ndarray): An array of subsets containing indices to the candidate array for each
+ person detected. The last two columns of each row hold the total score and total parts
+ of the person.
+
+ Returns:
+ List[BodyResult]: A list of BodyResult objects, where each object represents a person with
+ detected keypoints, total score, and total parts.
+ """
+ return [
+ BodyResult(
+ keypoints=[
+ Keypoint(
+ x=candidate[candidate_index][0],
+ y=candidate[candidate_index][1],
+ score=candidate[candidate_index][2],
+ id=candidate[candidate_index][3]
+ ) if candidate_index != -1 else None
+ for candidate_index in person[:18].astype(int)
+ ],
+ total_score=person[18],
+ total_parts=person[19]
+ )
+ for person in subset
+ ]
+
+
+if __name__ == "__main__":
+ body_estimation = Body('../model/body_pose_model.pth')
+
+ test_image = '../images/ski.jpg'
+ oriImg = cv2.imread(test_image) # B,G,R order
+ candidate, subset = body_estimation(oriImg)
+ bodies = body_estimation.format_body_result(candidate, subset)
+
+    canvas = oriImg
+    H, W = oriImg.shape[:2]
+    for body in bodies:
+        # draw_bodypose expects a list of keypoints normalized to [0, 1]
+        canvas = util.draw_bodypose(canvas, [
+            Keypoint(x=keypoint.x / W, y=keypoint.y / H) if keypoint is not None else None
+            for keypoint in body.keypoints
+        ])
+
+ plt.imshow(canvas[:, :, [2, 1, 0]])
+ plt.show()
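
The per-part peak extraction inside Body.__call__ can be demonstrated without the pretrained weights; the sketch below mirrors that logic (Gaussian smoothing, then a comparison against the four shifted neighbours) on a synthetic heatmap, with the threshold lowered because the toy blobs are much flatter than real network output.

import numpy as np
from scipy.ndimage import gaussian_filter

heatmap = np.zeros((64, 64), dtype=np.float32)
heatmap[20, 30] = 1.0   # two synthetic "body part" responses
heatmap[45, 10] = 0.8

one_heatmap = gaussian_filter(heatmap, sigma=3)

# A pixel is a peak if it is >= its four neighbours and above the threshold.
shifted = [np.roll(one_heatmap, shift, axis) for shift, axis in ((1, 0), (-1, 0), (1, 1), (-1, 1))]
peaks_binary = np.logical_and.reduce([one_heatmap >= s for s in shifted] + [one_heatmap > 0.01])
ys, xs = np.nonzero(peaks_binary)
print(list(zip(xs, ys)))  # (x, y) order, as in the detector -> [(30, 20), (10, 45)]
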
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/face.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/face.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb45c338ca6c7577ccef0a4996f8adb63d9fb15b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/face.py
@@ -0,0 +1,365 @@
+import logging
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch.nn import Conv2d, MaxPool2d, Module, ReLU, init
+from torchvision.transforms import ToPILImage, ToTensor
+
+from . import util
+
+
+class FaceNet(Module):
+ """Model the cascading heatmaps. """
+ def __init__(self):
+ super(FaceNet, self).__init__()
+ # cnn to make feature map
+ self.relu = ReLU()
+ self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
+ self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
+ kernel_size=3, stride=1, padding=1)
+ self.conv1_2 = Conv2d(
+ in_channels=64, out_channels=64, kernel_size=3, stride=1,
+ padding=1)
+ self.conv2_1 = Conv2d(
+ in_channels=64, out_channels=128, kernel_size=3, stride=1,
+ padding=1)
+ self.conv2_2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_1 = Conv2d(
+ in_channels=128, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_2 = Conv2d(
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_3 = Conv2d(
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv3_4 = Conv2d(
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_1 = Conv2d(
+ in_channels=256, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_2 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_3 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv4_4 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv5_1 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv5_2 = Conv2d(
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
+ padding=1)
+ self.conv5_3_CPM = Conv2d(
+ in_channels=512, out_channels=128, kernel_size=3, stride=1,
+ padding=1)
+
+ # stage1
+ self.conv6_1_CPM = Conv2d(
+ in_channels=128, out_channels=512, kernel_size=1, stride=1,
+ padding=0)
+ self.conv6_2_CPM = Conv2d(
+ in_channels=512, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage2
+ self.Mconv1_stage2 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage2 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage2 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage3
+ self.Mconv1_stage3 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage3 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage3 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage4
+ self.Mconv1_stage4 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage4 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage4 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage5
+ self.Mconv1_stage5 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage5 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage5 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ # stage6
+ self.Mconv1_stage6 = Conv2d(
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv2_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv3_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv4_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv5_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
+ padding=3)
+ self.Mconv6_stage6 = Conv2d(
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
+ padding=0)
+ self.Mconv7_stage6 = Conv2d(
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
+ padding=0)
+
+ for m in self.modules():
+ if isinstance(m, Conv2d):
+ init.constant_(m.bias, 0)
+
+ def forward(self, x):
+ """Return a list of heatmaps."""
+ heatmaps = []
+
+ h = self.relu(self.conv1_1(x))
+ h = self.relu(self.conv1_2(h))
+ h = self.max_pooling_2d(h)
+ h = self.relu(self.conv2_1(h))
+ h = self.relu(self.conv2_2(h))
+ h = self.max_pooling_2d(h)
+ h = self.relu(self.conv3_1(h))
+ h = self.relu(self.conv3_2(h))
+ h = self.relu(self.conv3_3(h))
+ h = self.relu(self.conv3_4(h))
+ h = self.max_pooling_2d(h)
+ h = self.relu(self.conv4_1(h))
+ h = self.relu(self.conv4_2(h))
+ h = self.relu(self.conv4_3(h))
+ h = self.relu(self.conv4_4(h))
+ h = self.relu(self.conv5_1(h))
+ h = self.relu(self.conv5_2(h))
+ h = self.relu(self.conv5_3_CPM(h))
+ feature_map = h
+
+ # stage1
+ h = self.relu(self.conv6_1_CPM(h))
+ h = self.conv6_2_CPM(h)
+ heatmaps.append(h)
+
+ # stage2
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage2(h))
+ h = self.relu(self.Mconv2_stage2(h))
+ h = self.relu(self.Mconv3_stage2(h))
+ h = self.relu(self.Mconv4_stage2(h))
+ h = self.relu(self.Mconv5_stage2(h))
+ h = self.relu(self.Mconv6_stage2(h))
+ h = self.Mconv7_stage2(h)
+ heatmaps.append(h)
+
+ # stage3
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage3(h))
+ h = self.relu(self.Mconv2_stage3(h))
+ h = self.relu(self.Mconv3_stage3(h))
+ h = self.relu(self.Mconv4_stage3(h))
+ h = self.relu(self.Mconv5_stage3(h))
+ h = self.relu(self.Mconv6_stage3(h))
+ h = self.Mconv7_stage3(h)
+ heatmaps.append(h)
+
+ # stage4
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage4(h))
+ h = self.relu(self.Mconv2_stage4(h))
+ h = self.relu(self.Mconv3_stage4(h))
+ h = self.relu(self.Mconv4_stage4(h))
+ h = self.relu(self.Mconv5_stage4(h))
+ h = self.relu(self.Mconv6_stage4(h))
+ h = self.Mconv7_stage4(h)
+ heatmaps.append(h)
+
+ # stage5
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage5(h))
+ h = self.relu(self.Mconv2_stage5(h))
+ h = self.relu(self.Mconv3_stage5(h))
+ h = self.relu(self.Mconv4_stage5(h))
+ h = self.relu(self.Mconv5_stage5(h))
+ h = self.relu(self.Mconv6_stage5(h))
+ h = self.Mconv7_stage5(h)
+ heatmaps.append(h)
+
+ # stage6
+ h = torch.cat([h, feature_map], dim=1) # channel concat
+ h = self.relu(self.Mconv1_stage6(h))
+ h = self.relu(self.Mconv2_stage6(h))
+ h = self.relu(self.Mconv3_stage6(h))
+ h = self.relu(self.Mconv4_stage6(h))
+ h = self.relu(self.Mconv5_stage6(h))
+ h = self.relu(self.Mconv6_stage6(h))
+ h = self.Mconv7_stage6(h)
+ heatmaps.append(h)
+
+ return heatmaps
+
+
+LOG = logging.getLogger(__name__)
+TOTEN = ToTensor()
+TOPIL = ToPILImage()
+
+
+params = {
+ 'gaussian_sigma': 2.5,
+ 'inference_img_size': 736, # 368, 736, 1312
+ 'heatmap_peak_thresh': 0.1,
+ 'crop_scale': 1.5,
+ 'line_indices': [
+ [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
+ [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
+ [13, 14], [14, 15], [15, 16],
+ [17, 18], [18, 19], [19, 20], [20, 21],
+ [22, 23], [23, 24], [24, 25], [25, 26],
+ [27, 28], [28, 29], [29, 30],
+ [31, 32], [32, 33], [33, 34], [34, 35],
+ [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
+ [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
+ [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
+ [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
+ [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
+ [66, 67], [67, 60]
+ ],
+}
+
+
+class Face(object):
+ """
+ The OpenPose face landmark detector model.
+
+ Args:
+ inference_size: set the size of the inference image size, suggested:
+ 368, 736, 1312, default 736
+ gaussian_sigma: blur the heatmaps, default 2.5
+ heatmap_peak_thresh: return landmark if over threshold, default 0.1
+
+ """
+ def __init__(self, face_model_path,
+ inference_size=None,
+ gaussian_sigma=None,
+ heatmap_peak_thresh=None):
+ self.inference_size = inference_size or params["inference_img_size"]
+ self.sigma = gaussian_sigma or params['gaussian_sigma']
+ self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
+ self.model = FaceNet()
+ self.model.load_state_dict(torch.load(face_model_path))
+ self.model.eval()
+ self.device = "cpu"
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, face_img):
+ H, W, C = face_img.shape
+
+ w_size = 384
+ x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
+
+ x_data = x_data.to(self.device)
+
+ with torch.no_grad():
+ hs = self.model(x_data[None, ...])
+ heatmaps = F.interpolate(
+ hs[-1],
+ (H, W),
+ mode='bilinear', align_corners=True).cpu().numpy()[0]
+ return heatmaps
+
+ def compute_peaks_from_heatmaps(self, heatmaps):
+ all_peaks = []
+ for part in range(heatmaps.shape[0]):
+ map_ori = heatmaps[part].copy()
+ binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
+
+ if np.sum(binary) == 0:
+ continue
+
+ positions = np.where(binary > 0.5)
+ intensities = map_ori[positions]
+ mi = np.argmax(intensities)
+ y, x = positions[0][mi], positions[1][mi]
+ all_peaks.append([x, y])
+
+ return np.array(all_peaks)
\ No newline at end of file
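
A quick shape check for the FaceNet cascade defined above, run with random weights so only the tensor shapes are meaningful (assumes the package is importable): three stride-2 poolings take a 3x384x384 crop down to 48x48, and each of the six stages emits 71 heatmaps, which Face.__call__ then upsamples back to the crop size.

import torch
from custom_controlnet_aux.open_pose.face import FaceNet

net = FaceNet().eval()
with torch.no_grad():
    heatmaps = net(torch.zeros(1, 3, 384, 384))

print(len(heatmaps), heatmaps[-1].shape)  # 6 torch.Size([1, 71, 48, 48])
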
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/hand.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/hand.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1f82b2feef658e085f9a1bdc361f6af6db82b8c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/hand.py
@@ -0,0 +1,91 @@
+import cv2
+import numpy as np
+import torch
+from scipy.ndimage import gaussian_filter  # scipy.ndimage.filters is a deprecated alias
+from skimage.measure import label
+
+from . import util
+from .model import handpose_model
+
+
+class Hand(object):
+ def __init__(self, model_path):
+ self.model = handpose_model()
+ model_dict = util.transfer(self.model, torch.load(model_path))
+ self.model.load_state_dict(model_dict)
+ self.model.eval()
+ self.device = "cpu"
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, oriImgRaw):
+ scale_search = [0.5, 1.0, 1.5, 2.0]
+ # scale_search = [0.5]
+ boxsize = 368
+ stride = 8
+ padValue = 128
+ thre = 0.05
+ multiplier = [x * boxsize for x in scale_search]
+
+ wsize = 128
+ heatmap_avg = np.zeros((wsize, wsize, 22))
+
+ Hr, Wr, Cr = oriImgRaw.shape
+
+ oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)
+
+ for m in range(len(multiplier)):
+ scale = multiplier[m]
+ imageToTest = util.smart_resize(oriImg, (scale, scale))
+
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+ im = np.ascontiguousarray(im)
+
+ data = torch.from_numpy(im).float()
+ data = data.to(self.device)
+
+ with torch.no_grad():
+ output = self.model(data).cpu().numpy()
+
+ # extract outputs, resize, and remove padding
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ heatmap = util.smart_resize(heatmap, (wsize, wsize))
+
+ heatmap_avg += heatmap / len(multiplier)
+
+ all_peaks = []
+ for part in range(21):
+ map_ori = heatmap_avg[:, :, part]
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
+
+ if np.sum(binary) == 0:
+ all_peaks.append([0, 0])
+ continue
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
+ label_img[label_img != max_index] = 0
+ map_ori[label_img == 0] = 0
+
+ y, x = util.npmax(map_ori)
+ y = int(float(y) * float(Hr) / float(wsize))
+ x = int(float(x) * float(Wr) / float(wsize))
+ all_peaks.append([x, y])
+ return np.array(all_peaks)
+
+if __name__ == "__main__":
+ hand_estimation = Hand('../model/hand_pose_model.pth')
+
+    test_image = '../images/hand.jpg'
+ oriImg = cv2.imread(test_image) # B,G,R order
+ peaks = hand_estimation(oriImg)
+ canvas = util.draw_handpose(oriImg, peaks, True)
+ cv2.imshow('', canvas)
+ cv2.waitKey(0)
\ No newline at end of file
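
The distinctive step in Hand.__call__ is the connected-component filtering before the argmax; the toy sketch below reproduces it on a synthetic heatmap (threshold lowered for the flat toy blobs, and np.unravel_index standing in for util.npmax), keeping only the component with the largest total response.

import numpy as np
from scipy.ndimage import gaussian_filter
from skimage.measure import label

heatmap = np.zeros((128, 128), dtype=np.float32)
heatmap[30, 40] = 1.0    # true keypoint blob
heatmap[90, 100] = 0.3   # weaker spurious blob

one_heatmap = gaussian_filter(heatmap, sigma=3)
binary = np.ascontiguousarray(one_heatmap > 0.001, dtype=np.uint8)

label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
max_index = np.argmax([np.sum(one_heatmap[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
one_heatmap[label_img != max_index] = 0

y, x = np.unravel_index(np.argmax(one_heatmap), one_heatmap.shape)
print(x, y)  # -> 40 30 (the weaker blob is discarded)
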
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ed8c60de316f9dd591bfee28f86401012522526
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/model.py
@@ -0,0 +1,217 @@
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+
+def make_layers(block, no_relu_layers):
+ layers = []
+ for layer_name, v in block.items():
+ if 'pool' in layer_name:
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
+ padding=v[2])
+ layers.append((layer_name, layer))
+ else:
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
+ kernel_size=v[2], stride=v[3],
+ padding=v[4])
+ layers.append((layer_name, conv2d))
+ if layer_name not in no_relu_layers:
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
+
+ return nn.Sequential(OrderedDict(layers))
+
+class bodypose_model(nn.Module):
+ def __init__(self):
+ super(bodypose_model, self).__init__()
+
+ # these layers have no relu layer
+        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',
+                          'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',
+                          'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',
+                          'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2']
+ blocks = {}
+ block0 = OrderedDict([
+ ('conv1_1', [3, 64, 3, 1, 1]),
+ ('conv1_2', [64, 64, 3, 1, 1]),
+ ('pool1_stage1', [2, 2, 0]),
+ ('conv2_1', [64, 128, 3, 1, 1]),
+ ('conv2_2', [128, 128, 3, 1, 1]),
+ ('pool2_stage1', [2, 2, 0]),
+ ('conv3_1', [128, 256, 3, 1, 1]),
+ ('conv3_2', [256, 256, 3, 1, 1]),
+ ('conv3_3', [256, 256, 3, 1, 1]),
+ ('conv3_4', [256, 256, 3, 1, 1]),
+ ('pool3_stage1', [2, 2, 0]),
+ ('conv4_1', [256, 512, 3, 1, 1]),
+ ('conv4_2', [512, 512, 3, 1, 1]),
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
+ ])
+
+
+ # Stage 1
+ block1_1 = OrderedDict([
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
+ ])
+
+ block1_2 = OrderedDict([
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
+ ])
+ blocks['block1_1'] = block1_1
+ blocks['block1_2'] = block1_2
+
+ self.model0 = make_layers(block0, no_relu_layers)
+
+ # Stages 2 - 6
+ for i in range(2, 7):
+ blocks['block%d_1' % i] = OrderedDict([
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
+ ])
+
+ blocks['block%d_2' % i] = OrderedDict([
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
+ ])
+
+ for k in blocks.keys():
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+ self.model1_1 = blocks['block1_1']
+ self.model2_1 = blocks['block2_1']
+ self.model3_1 = blocks['block3_1']
+ self.model4_1 = blocks['block4_1']
+ self.model5_1 = blocks['block5_1']
+ self.model6_1 = blocks['block6_1']
+
+ self.model1_2 = blocks['block1_2']
+ self.model2_2 = blocks['block2_2']
+ self.model3_2 = blocks['block3_2']
+ self.model4_2 = blocks['block4_2']
+ self.model5_2 = blocks['block5_2']
+ self.model6_2 = blocks['block6_2']
+
+
+ def forward(self, x):
+
+ out1 = self.model0(x)
+
+ out1_1 = self.model1_1(out1)
+ out1_2 = self.model1_2(out1)
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
+
+ out2_1 = self.model2_1(out2)
+ out2_2 = self.model2_2(out2)
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
+
+ out3_1 = self.model3_1(out3)
+ out3_2 = self.model3_2(out3)
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
+
+ out4_1 = self.model4_1(out4)
+ out4_2 = self.model4_2(out4)
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
+
+ out5_1 = self.model5_1(out5)
+ out5_2 = self.model5_2(out5)
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
+
+ out6_1 = self.model6_1(out6)
+ out6_2 = self.model6_2(out6)
+
+ return out6_1, out6_2
+
+class handpose_model(nn.Module):
+ def __init__(self):
+ super(handpose_model, self).__init__()
+
+ # these layers have no relu layer
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
+ # stage 1
+ block1_0 = OrderedDict([
+ ('conv1_1', [3, 64, 3, 1, 1]),
+ ('conv1_2', [64, 64, 3, 1, 1]),
+ ('pool1_stage1', [2, 2, 0]),
+ ('conv2_1', [64, 128, 3, 1, 1]),
+ ('conv2_2', [128, 128, 3, 1, 1]),
+ ('pool2_stage1', [2, 2, 0]),
+ ('conv3_1', [128, 256, 3, 1, 1]),
+ ('conv3_2', [256, 256, 3, 1, 1]),
+ ('conv3_3', [256, 256, 3, 1, 1]),
+ ('conv3_4', [256, 256, 3, 1, 1]),
+ ('pool3_stage1', [2, 2, 0]),
+ ('conv4_1', [256, 512, 3, 1, 1]),
+ ('conv4_2', [512, 512, 3, 1, 1]),
+ ('conv4_3', [512, 512, 3, 1, 1]),
+ ('conv4_4', [512, 512, 3, 1, 1]),
+ ('conv5_1', [512, 512, 3, 1, 1]),
+ ('conv5_2', [512, 512, 3, 1, 1]),
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
+ ])
+
+ block1_1 = OrderedDict([
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
+ ])
+
+ blocks = {}
+ blocks['block1_0'] = block1_0
+ blocks['block1_1'] = block1_1
+
+ # stage 2-6
+ for i in range(2, 7):
+ blocks['block%d' % i] = OrderedDict([
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
+ ])
+
+ for k in blocks.keys():
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+ self.model1_0 = blocks['block1_0']
+ self.model1_1 = blocks['block1_1']
+ self.model2 = blocks['block2']
+ self.model3 = blocks['block3']
+ self.model4 = blocks['block4']
+ self.model5 = blocks['block5']
+ self.model6 = blocks['block6']
+
+ def forward(self, x):
+ out1_0 = self.model1_0(x)
+ out1_1 = self.model1_1(out1_0)
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
+ out_stage2 = self.model2(concat_stage2)
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
+ out_stage3 = self.model3(concat_stage3)
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
+ out_stage4 = self.model4(concat_stage4)
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
+ out_stage5 = self.model5(concat_stage5)
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
+ out_stage6 = self.model6(concat_stage6)
+ return out_stage6
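
A quick shape check for the two backbones defined above, with random weights (assumes the package is importable): both downsample by a factor of 8, so a 368x368 input yields 46x46 maps, with 38 PAF channels and 19 heatmap channels from the body model and 22 heatmap channels from the hand model.

import torch
from custom_controlnet_aux.open_pose.model import bodypose_model, handpose_model

body = bodypose_model().eval()
hand = handpose_model().eval()

with torch.no_grad():
    pafs, heatmaps = body(torch.zeros(1, 3, 368, 368))
    hand_maps = hand(torch.zeros(1, 3, 368, 368))

print(pafs.shape)       # torch.Size([1, 38, 46, 46])
print(heatmaps.shape)   # torch.Size([1, 19, 46, 46])
print(hand_maps.shape)  # torch.Size([1, 22, 46, 46])
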
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3eb2ff24a9aefb9e4a169375a17dc0c5652f723
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/open_pose/util.py
@@ -0,0 +1,390 @@
+import math
+import numpy as np
+import matplotlib
+import cv2
+from typing import List, Tuple, Union
+
+from .body import BodyResult, Keypoint
+
+eps = 0.01
+
+
+def smart_resize(x, s):
+ Ht, Wt = s
+ if x.ndim == 2:
+ Ho, Wo = x.shape
+ Co = 1
+ else:
+ Ho, Wo, Co = x.shape
+ if Co == 3 or Co == 1:
+ k = float(Ht + Wt) / float(Ho + Wo)
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
+ else:
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
+
+
+def smart_resize_k(x, fx, fy):
+ if x.ndim == 2:
+ Ho, Wo = x.shape
+ Co = 1
+ else:
+ Ho, Wo, Co = x.shape
+ Ht, Wt = Ho * fy, Wo * fx
+ if Co == 3 or Co == 1:
+ k = float(Ht + Wt) / float(Ho + Wo)
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
+ else:
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
+
+
+def padRightDownCorner(img, stride, padValue):
+ h = img.shape[0]
+ w = img.shape[1]
+
+ pad = 4 * [None]
+ pad[0] = 0 # up
+ pad[1] = 0 # left
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
+
+ img_padded = img
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
+
+ return img_padded, pad
+
+
+def transfer(model, model_weights):
+ transfered_model_weights = {}
+ for weights_name in model.state_dict().keys():
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
+ return transfered_model_weights
+
+
+def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint], xinsr_stick_scaling: bool = False) -> np.ndarray:
+ """
+ Draw keypoints and limbs representing body pose on a given canvas.
+
+ Args:
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
+ keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.
+ xinsr_stick_scaling (bool): Whether or not scaling stick width for xinsr ControlNet
+
+ Returns:
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.
+
+ Note:
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
+ """
+ H, W, C = canvas.shape
+ stickwidth = 4
+ # Ref: https://huggingface.co/xinsir/controlnet-openpose-sdxl-1.0
+ max_side = max(H, W)
+ if xinsr_stick_scaling:
+ stick_scale = 1 if max_side < 500 else min(2 + (max_side // 1000), 7)
+ else:
+ stick_scale = 1
+
+ limbSeq = [
+ [2, 3], [2, 6], [3, 4], [4, 5],
+ [6, 7], [7, 8], [2, 9], [9, 10],
+ [10, 11], [2, 12], [12, 13], [13, 14],
+ [2, 1], [1, 15], [15, 17], [1, 16],
+ [16, 18],
+ ]
+
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
+
+ for (k1_index, k2_index), color in zip(limbSeq, colors):
+ keypoint1 = keypoints[k1_index - 1]
+ keypoint2 = keypoints[k2_index - 1]
+
+ if keypoint1 is None or keypoint2 is None:
+ continue
+
+ Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
+ X = np.array([keypoint1.y, keypoint2.y]) * float(H)
+ mX = np.mean(X)
+ mY = np.mean(Y)
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth*stick_scale), int(angle), 0, 360, 1)
+ cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])
+
+ for keypoint, color in zip(keypoints, colors):
+ if keypoint is None:
+ continue
+
+ x, y = keypoint.x, keypoint.y
+ x = int(x * W)
+ y = int(y * H)
+ cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)
+
+ return canvas
+
+
+def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
+ """
+ Draw keypoints and connections representing hand pose on a given canvas.
+
+ Args:
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
+ or None if no keypoints are present.
+
+ Returns:
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.
+
+ Note:
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
+ """
+ if not keypoints:
+ return canvas
+
+ H, W, C = canvas.shape
+
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
+
+ for ie, (e1, e2) in enumerate(edges):
+ k1 = keypoints[e1]
+ k2 = keypoints[e2]
+ if k1 is None or k2 is None:
+ continue
+
+ x1 = int(k1.x * W)
+ y1 = int(k1.y * H)
+ x2 = int(k2.x * W)
+ y2 = int(k2.y * H)
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
+
+    for keypoint in keypoints:
+        if keypoint is None:
+            continue
+
+        x, y = keypoint.x, keypoint.y
+ x = int(x * W)
+ y = int(y * H)
+ if x > eps and y > eps:
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
+ return canvas
+
+
+def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
+ """
+ Draw keypoints representing face pose on a given canvas.
+
+ Args:
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
+ or None if no keypoints are present.
+
+ Returns:
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.
+
+ Note:
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
+ """
+ if not keypoints:
+ return canvas
+
+ H, W, C = canvas.shape
+    for keypoint in keypoints:
+        if keypoint is None:
+            continue
+
+        x, y = keypoint.x, keypoint.y
+ x = int(x * W)
+ y = int(y * H)
+ if x > eps and y > eps:
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
+ return canvas
+
+
+# detect hand according to body pose keypoints
+# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
+def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
+ """
+ Detect hands in the input body pose keypoints and calculate the bounding box for each hand.
+
+ Args:
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
+
+ Returns:
+ List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
+ corner of the bounding box, the width (height) of the bounding box, and
+ a boolean flag indicating whether the hand is a left hand (True) or a
+ right hand (False).
+
+ Notes:
+        - The width and height of the bounding boxes are equal since the network requires a square input.
+ - The minimum bounding box size is 20 pixels.
+ """
+ ratioWristElbow = 0.33
+ detect_result = []
+ image_height, image_width = oriImg.shape[0:2]
+
+ keypoints = body.keypoints
+ # right hand: wrist 4, elbow 3, shoulder 2
+ # left hand: wrist 7, elbow 6, shoulder 5
+ left_shoulder = keypoints[5]
+ left_elbow = keypoints[6]
+ left_wrist = keypoints[7]
+ right_shoulder = keypoints[2]
+ right_elbow = keypoints[3]
+ right_wrist = keypoints[4]
+
+ # if any of three not detected
+ has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
+ has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
+ if not (has_left or has_right):
+ return []
+
+ hands = []
+ #left hand
+ if has_left:
+ hands.append([
+ left_shoulder.x, left_shoulder.y,
+ left_elbow.x, left_elbow.y,
+ left_wrist.x, left_wrist.y,
+ True
+ ])
+ # right hand
+ if has_right:
+ hands.append([
+ right_shoulder.x, right_shoulder.y,
+ right_elbow.x, right_elbow.y,
+ right_wrist.x, right_wrist.y,
+ False
+ ])
+
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
+        # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbow) = (1 + ratio) * pos_wrist - ratio * pos_elbow
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
+ x = x3 + ratioWristElbow * (x3 - x2)
+ y = y3 + ratioWristElbow * (y3 - y2)
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
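+        # Box side length: 1.5x the wrist-elbow distance, but never smaller than
+        # 1.35x (= 1.5 * 0.9) the elbow-shoulder distance.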
+ # x-y refers to the center --> offset to topLeft point
+ # handRectangle.x -= handRectangle.width / 2.f;
+ # handRectangle.y -= handRectangle.height / 2.f;
+ x -= width / 2
+ y -= width / 2 # width = height
+ # overflow the image
+ if x < 0: x = 0
+ if y < 0: y = 0
+ width1 = width
+ width2 = width
+ if x + width > image_width: width1 = image_width - x
+ if y + width > image_height: width2 = image_height - y
+ width = min(width1, width2)
+        # discard hand boxes smaller than 20 pixels
+ if width >= 20:
+ detect_result.append((int(x), int(y), int(width), is_left))
+
+ '''
+ return value: [[x, y, w, True if left hand else False]].
+    width = height since the network requires a square input.
+    x, y are the coordinates of the top-left corner.
+ '''
+ return detect_result
+
+
+# Written by Lvmin
+def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
+ """
+ Detect the face in the input body pose keypoints and calculate the bounding box for the face.
+
+ Args:
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
+
+ Returns:
+ Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
+ bounding box and the width (height) of the bounding box, or None if the
+ face is not detected or the bounding box width is less than 20 pixels.
+
+ Notes:
+ - The width and height of the bounding box are equal.
+ - The minimum bounding box size is 20 pixels.
+ """
+ # left right eye ear 14 15 16 17
+ image_height, image_width = oriImg.shape[0:2]
+
+ keypoints = body.keypoints
+ head = keypoints[0]
+ left_eye = keypoints[14]
+ right_eye = keypoints[15]
+ left_ear = keypoints[16]
+ right_ear = keypoints[17]
+
+ if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
+ return None
+
+ width = 0.0
+ x0, y0 = head.x, head.y
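+    # Estimate the face size from the distance between the head keypoint and each
+    # visible eye (x3.0) or ear (x1.5), keeping the largest estimate.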
+
+ if left_eye is not None:
+ x1, y1 = left_eye.x, left_eye.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 3.0)
+
+ if right_eye is not None:
+ x1, y1 = right_eye.x, right_eye.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 3.0)
+
+ if left_ear is not None:
+ x1, y1 = left_ear.x, left_ear.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 1.5)
+
+ if right_ear is not None:
+ x1, y1 = right_ear.x, right_ear.y
+ d = max(abs(x0 - x1), abs(y0 - y1))
+ width = max(width, d * 1.5)
+
+ x, y = x0, y0
+
+ x -= width
+ y -= width
+
+ if x < 0:
+ x = 0
+
+ if y < 0:
+ y = 0
+
+ width1 = width * 2
+ width2 = width * 2
+
+ if x + width > image_width:
+ width1 = image_width - x
+
+ if y + width > image_height:
+ width2 = image_height - y
+
+ width = min(width1, width2)
+
+ if width >= 20:
+ return int(x), int(y), int(width)
+ else:
+ return None
+
+
+# return the (row, column) index of the largest element of a 2D array
+def npmax(array):
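+    # e.g. npmax(np.array([[1, 9], [3, 2]])) == (0, 1): row 0, column 1 holds the maximum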
+ arrayindex = array.argmax(1)
+ arrayvalue = array.max(1)
+ i = arrayvalue.argmax()
+ j = arrayindex[i]
+ return i, j
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..eb297bb86e63f7bf78f113ad80e68861989d87c2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/LICENSE
@@ -0,0 +1,21 @@
+It is just for research purpose, and commercial use should be contacted with authors first.
+
+Copyright (c) 2021 Zhuo Su
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a45176b1105cdb25caf385f1a639924518d260b1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/__init__.py
@@ -0,0 +1,64 @@
+import os
+import warnings
+
+import cv2
+import numpy as np
+import torch
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, nms, resize_image_with_pad, safe_step,common_input_validate, custom_hf_download, HF_MODEL_NAME
+from .model import pidinet
+
+
+class PidiNetDetector:
+ def __init__(self, netNetwork):
+ self.netNetwork = netNetwork
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="table5_pidinet.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ netNetwork = pidinet()
+ netNetwork.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(model_path)['state_dict'].items()})
+ netNetwork.eval()
+
+ return cls(netNetwork)
+
+ def to(self, device):
+ self.netNetwork.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, safe=False, output_type="pil", scribble=False, apply_filter=False, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ detected_map = detected_map[:, :, ::-1].copy()
+ with torch.no_grad():
+ image_pidi = torch.from_numpy(detected_map).float().to(self.device)
+ image_pidi = image_pidi / 255.0
+ image_pidi = rearrange(image_pidi, 'h w c -> 1 c h w')
+ edge = self.netNetwork(image_pidi)[-1]
+ edge = edge.cpu().numpy()
+ if apply_filter:
+ edge = edge > 0.5
+ if safe:
+ edge = safe_step(edge)
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = edge[0, 0]
+
+ if scribble:
+ detected_map = nms(detected_map, 127, 3.0)
+ detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
+ detected_map[detected_map > 4] = 255
+ detected_map[detected_map < 255] = 0
+
+ detected_map = HWC3(remove_pad(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
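+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only; assumes the default HF checkpoint
+    # "table5_pidinet.pth" can be downloaded and a local test.png exists).
+    detector = PidiNetDetector.from_pretrained()
+    edge_map = detector(Image.open("test.png"), detect_resolution=512, safe=True)
+    edge_map.save("pidi_edges.png")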
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bd642007eb924c76a070ec84cc6630f592435db
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/pidi/model.py
@@ -0,0 +1,681 @@
+"""
+Author: Zhuo Su, Wenzhe Liu
+Date: Feb 18, 2021
+"""
+
+import math
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def img2tensor(imgs, bgr2rgb=True, float32=True):
+ """Numpy array to tensor.
+
+ Args:
+ imgs (list[ndarray] | ndarray): Input images.
+ bgr2rgb (bool): Whether to change bgr to rgb.
+ float32 (bool): Whether to change to float32.
+
+ Returns:
+ list[tensor] | tensor: Tensor images. If returned results only have
+ one element, just return tensor.
+ """
+
+ def _totensor(img, bgr2rgb, float32):
+ if img.shape[2] == 3 and bgr2rgb:
+ if img.dtype == 'float64':
+ img = img.astype('float32')
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ img = torch.from_numpy(img.transpose(2, 0, 1))
+ if float32:
+ img = img.float()
+ return img
+
+ if isinstance(imgs, list):
+ return [_totensor(img, bgr2rgb, float32) for img in imgs]
+ else:
+ return _totensor(imgs, bgr2rgb, float32)
+
+nets = {
+ 'baseline': {
+ 'layer0': 'cv',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'c-v15': {
+ 'layer0': 'cd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'a-v15': {
+ 'layer0': 'ad',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'r-v15': {
+ 'layer0': 'rd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'cvvv4': {
+ 'layer0': 'cd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cd',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cd',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cd',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'avvv4': {
+ 'layer0': 'ad',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'ad',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'ad',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'ad',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'rvvv4': {
+ 'layer0': 'rd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'rd',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'rd',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'rd',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'cccv4': {
+ 'layer0': 'cd',
+ 'layer1': 'cd',
+ 'layer2': 'cd',
+ 'layer3': 'cv',
+ 'layer4': 'cd',
+ 'layer5': 'cd',
+ 'layer6': 'cd',
+ 'layer7': 'cv',
+ 'layer8': 'cd',
+ 'layer9': 'cd',
+ 'layer10': 'cd',
+ 'layer11': 'cv',
+ 'layer12': 'cd',
+ 'layer13': 'cd',
+ 'layer14': 'cd',
+ 'layer15': 'cv',
+ },
+ 'aaav4': {
+ 'layer0': 'ad',
+ 'layer1': 'ad',
+ 'layer2': 'ad',
+ 'layer3': 'cv',
+ 'layer4': 'ad',
+ 'layer5': 'ad',
+ 'layer6': 'ad',
+ 'layer7': 'cv',
+ 'layer8': 'ad',
+ 'layer9': 'ad',
+ 'layer10': 'ad',
+ 'layer11': 'cv',
+ 'layer12': 'ad',
+ 'layer13': 'ad',
+ 'layer14': 'ad',
+ 'layer15': 'cv',
+ },
+ 'rrrv4': {
+ 'layer0': 'rd',
+ 'layer1': 'rd',
+ 'layer2': 'rd',
+ 'layer3': 'cv',
+ 'layer4': 'rd',
+ 'layer5': 'rd',
+ 'layer6': 'rd',
+ 'layer7': 'cv',
+ 'layer8': 'rd',
+ 'layer9': 'rd',
+ 'layer10': 'rd',
+ 'layer11': 'cv',
+ 'layer12': 'rd',
+ 'layer13': 'rd',
+ 'layer14': 'rd',
+ 'layer15': 'cv',
+ },
+ 'c16': {
+ 'layer0': 'cd',
+ 'layer1': 'cd',
+ 'layer2': 'cd',
+ 'layer3': 'cd',
+ 'layer4': 'cd',
+ 'layer5': 'cd',
+ 'layer6': 'cd',
+ 'layer7': 'cd',
+ 'layer8': 'cd',
+ 'layer9': 'cd',
+ 'layer10': 'cd',
+ 'layer11': 'cd',
+ 'layer12': 'cd',
+ 'layer13': 'cd',
+ 'layer14': 'cd',
+ 'layer15': 'cd',
+ },
+ 'a16': {
+ 'layer0': 'ad',
+ 'layer1': 'ad',
+ 'layer2': 'ad',
+ 'layer3': 'ad',
+ 'layer4': 'ad',
+ 'layer5': 'ad',
+ 'layer6': 'ad',
+ 'layer7': 'ad',
+ 'layer8': 'ad',
+ 'layer9': 'ad',
+ 'layer10': 'ad',
+ 'layer11': 'ad',
+ 'layer12': 'ad',
+ 'layer13': 'ad',
+ 'layer14': 'ad',
+ 'layer15': 'ad',
+ },
+ 'r16': {
+ 'layer0': 'rd',
+ 'layer1': 'rd',
+ 'layer2': 'rd',
+ 'layer3': 'rd',
+ 'layer4': 'rd',
+ 'layer5': 'rd',
+ 'layer6': 'rd',
+ 'layer7': 'rd',
+ 'layer8': 'rd',
+ 'layer9': 'rd',
+ 'layer10': 'rd',
+ 'layer11': 'rd',
+ 'layer12': 'rd',
+ 'layer13': 'rd',
+ 'layer14': 'rd',
+ 'layer15': 'rd',
+ },
+ 'carv4': {
+ 'layer0': 'cd',
+ 'layer1': 'ad',
+ 'layer2': 'rd',
+ 'layer3': 'cv',
+ 'layer4': 'cd',
+ 'layer5': 'ad',
+ 'layer6': 'rd',
+ 'layer7': 'cv',
+ 'layer8': 'cd',
+ 'layer9': 'ad',
+ 'layer10': 'rd',
+ 'layer11': 'cv',
+ 'layer12': 'cd',
+ 'layer13': 'ad',
+ 'layer14': 'rd',
+ 'layer15': 'cv',
+ },
+ }
+
+def createConvFunc(op_type):
+ assert op_type in ['cv', 'cd', 'ad', 'rd'], 'unknown op type: %s' % str(op_type)
+ if op_type == 'cv':
+ return F.conv2d
+
+ if op_type == 'cd':
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
+ assert dilation in [1, 2], 'dilation for cd_conv should be in 1 or 2'
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for cd_conv should be 3x3'
+ assert padding == dilation, 'padding for cd_conv set wrong'
+
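+            # Central pixel difference: computes sum_i w_i * (x_i - x_center), i.e. a normal
+            # 3x3 conv minus a 1x1 conv whose kernel is the spatial sum of the weights.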
+ weights_c = weights.sum(dim=[2, 3], keepdim=True)
+ yc = F.conv2d(x, weights_c, stride=stride, padding=0, groups=groups)
+ y = F.conv2d(x, weights, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+ return y - yc
+ return func
+ elif op_type == 'ad':
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
+ assert dilation in [1, 2], 'dilation for ad_conv should be in 1 or 2'
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for ad_conv should be 3x3'
+ assert padding == dilation, 'padding for ad_conv set wrong'
+
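+            # Angular pixel difference: each weight is replaced by its difference with the
+            # clockwise-adjacent weight, so the kernel responds to local angular gradients.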
+ shape = weights.shape
+ weights = weights.view(shape[0], shape[1], -1)
+ weights_conv = (weights - weights[:, :, [3, 0, 1, 6, 4, 2, 7, 8, 5]]).view(shape) # clock-wise
+ y = F.conv2d(x, weights_conv, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+ return y
+ return func
+ elif op_type == 'rd':
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
+ assert dilation in [1, 2], 'dilation for rd_conv should be in 1 or 2'
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for rd_conv should be 3x3'
+ padding = 2 * dilation
+
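+            # Radial pixel difference: the eight non-centre weights are scattered onto the
+            # outer ring of a 5x5 kernel and subtracted at their original 3x3 positions, so
+            # the conv compares each pixel with its radial neighbour two steps further out.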
+ shape = weights.shape
+ if weights.is_cuda:
+ buffer = torch.cuda.FloatTensor(shape[0], shape[1], 5 * 5).fill_(0)
+ else:
+ buffer = torch.zeros(shape[0], shape[1], 5 * 5).to(weights.device)
+ weights = weights.view(shape[0], shape[1], -1)
+ buffer[:, :, [0, 2, 4, 10, 14, 20, 22, 24]] = weights[:, :, 1:]
+ buffer[:, :, [6, 7, 8, 11, 13, 16, 17, 18]] = -weights[:, :, 1:]
+ buffer[:, :, 12] = 0
+ buffer = buffer.view(shape[0], shape[1], 5, 5)
+ y = F.conv2d(x, buffer, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+ return y
+ return func
+ else:
+ print('impossible to be here unless you force that')
+ return None
+
+class Conv2d(nn.Module):
+ def __init__(self, pdc, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False):
+ super(Conv2d, self).__init__()
+ if in_channels % groups != 0:
+ raise ValueError('in_channels must be divisible by groups')
+ if out_channels % groups != 0:
+ raise ValueError('out_channels must be divisible by groups')
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.padding = padding
+ self.dilation = dilation
+ self.groups = groups
+ self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size))
+ if bias:
+ self.bias = nn.Parameter(torch.Tensor(out_channels))
+ else:
+ self.register_parameter('bias', None)
+ self.reset_parameters()
+ self.pdc = pdc
+
+ def reset_parameters(self):
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+ if self.bias is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
+ bound = 1 / math.sqrt(fan_in)
+ nn.init.uniform_(self.bias, -bound, bound)
+
+ def forward(self, input):
+
+ return self.pdc(input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+class CSAM(nn.Module):
+ """
+ Compact Spatial Attention Module
+ """
+ def __init__(self, channels):
+ super(CSAM, self).__init__()
+
+ mid_channels = 4
+ self.relu1 = nn.ReLU()
+ self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, padding=0)
+ self.conv2 = nn.Conv2d(mid_channels, 1, kernel_size=3, padding=1, bias=False)
+ self.sigmoid = nn.Sigmoid()
+ nn.init.constant_(self.conv1.bias, 0)
+
+ def forward(self, x):
+ y = self.relu1(x)
+ y = self.conv1(y)
+ y = self.conv2(y)
+ y = self.sigmoid(y)
+
+ return x * y
+
+class CDCM(nn.Module):
+ """
+ Compact Dilation Convolution based Module
+ """
+ def __init__(self, in_channels, out_channels):
+ super(CDCM, self).__init__()
+
+ self.relu1 = nn.ReLU()
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
+ self.conv2_1 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=5, padding=5, bias=False)
+ self.conv2_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=7, padding=7, bias=False)
+ self.conv2_3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=9, padding=9, bias=False)
+ self.conv2_4 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=11, padding=11, bias=False)
+ nn.init.constant_(self.conv1.bias, 0)
+
+ def forward(self, x):
+ x = self.relu1(x)
+ x = self.conv1(x)
+ x1 = self.conv2_1(x)
+ x2 = self.conv2_2(x)
+ x3 = self.conv2_3(x)
+ x4 = self.conv2_4(x)
+ return x1 + x2 + x3 + x4
+
+
+class MapReduce(nn.Module):
+ """
+ Reduce feature maps into a single edge map
+ """
+ def __init__(self, channels):
+ super(MapReduce, self).__init__()
+ self.conv = nn.Conv2d(channels, 1, kernel_size=1, padding=0)
+ nn.init.constant_(self.conv.bias, 0)
+
+ def forward(self, x):
+ return self.conv(x)
+
+
+class PDCBlock(nn.Module):
+ def __init__(self, pdc, inplane, ouplane, stride=1):
+ super(PDCBlock, self).__init__()
+        self.stride=stride
+
+ if self.stride > 1:
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
+ self.conv1 = Conv2d(pdc, inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
+ self.relu2 = nn.ReLU()
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
+
+ def forward(self, x):
+ if self.stride > 1:
+ x = self.pool(x)
+ y = self.conv1(x)
+ y = self.relu2(y)
+ y = self.conv2(y)
+ if self.stride > 1:
+ x = self.shortcut(x)
+ y = y + x
+ return y
+
+class PDCBlock_converted(nn.Module):
+ """
+ CPDC, APDC can be converted to vanilla 3x3 convolution
+ RPDC can be converted to vanilla 5x5 convolution
+ """
+ def __init__(self, pdc, inplane, ouplane, stride=1):
+ super(PDCBlock_converted, self).__init__()
+ self.stride=stride
+
+ if self.stride > 1:
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
+ if pdc == 'rd':
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=5, padding=2, groups=inplane, bias=False)
+ else:
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
+ self.relu2 = nn.ReLU()
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
+
+ def forward(self, x):
+ if self.stride > 1:
+ x = self.pool(x)
+ y = self.conv1(x)
+ y = self.relu2(y)
+ y = self.conv2(y)
+ if self.stride > 1:
+ x = self.shortcut(x)
+ y = y + x
+ return y
+
+class PiDiNet(nn.Module):
+ def __init__(self, inplane, pdcs, dil=None, sa=False, convert=False):
+ super(PiDiNet, self).__init__()
+ self.sa = sa
+ if dil is not None:
+ assert isinstance(dil, int), 'dil should be an int'
+ self.dil = dil
+
+ self.fuseplanes = []
+
+ self.inplane = inplane
+ if convert:
+ if pdcs[0] == 'rd':
+ init_kernel_size = 5
+ init_padding = 2
+ else:
+ init_kernel_size = 3
+ init_padding = 1
+ self.init_block = nn.Conv2d(3, self.inplane,
+ kernel_size=init_kernel_size, padding=init_padding, bias=False)
+ block_class = PDCBlock_converted
+ else:
+ self.init_block = Conv2d(pdcs[0], 3, self.inplane, kernel_size=3, padding=1)
+ block_class = PDCBlock
+
+ self.block1_1 = block_class(pdcs[1], self.inplane, self.inplane)
+ self.block1_2 = block_class(pdcs[2], self.inplane, self.inplane)
+ self.block1_3 = block_class(pdcs[3], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # C
+
+ inplane = self.inplane
+ self.inplane = self.inplane * 2
+ self.block2_1 = block_class(pdcs[4], inplane, self.inplane, stride=2)
+ self.block2_2 = block_class(pdcs[5], self.inplane, self.inplane)
+ self.block2_3 = block_class(pdcs[6], self.inplane, self.inplane)
+ self.block2_4 = block_class(pdcs[7], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # 2C
+
+ inplane = self.inplane
+ self.inplane = self.inplane * 2
+ self.block3_1 = block_class(pdcs[8], inplane, self.inplane, stride=2)
+ self.block3_2 = block_class(pdcs[9], self.inplane, self.inplane)
+ self.block3_3 = block_class(pdcs[10], self.inplane, self.inplane)
+ self.block3_4 = block_class(pdcs[11], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # 4C
+
+ self.block4_1 = block_class(pdcs[12], self.inplane, self.inplane, stride=2)
+ self.block4_2 = block_class(pdcs[13], self.inplane, self.inplane)
+ self.block4_3 = block_class(pdcs[14], self.inplane, self.inplane)
+ self.block4_4 = block_class(pdcs[15], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # 4C
+
+ self.conv_reduces = nn.ModuleList()
+ if self.sa and self.dil is not None:
+ self.attentions = nn.ModuleList()
+ self.dilations = nn.ModuleList()
+ for i in range(4):
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
+ self.attentions.append(CSAM(self.dil))
+ self.conv_reduces.append(MapReduce(self.dil))
+ elif self.sa:
+ self.attentions = nn.ModuleList()
+ for i in range(4):
+ self.attentions.append(CSAM(self.fuseplanes[i]))
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
+ elif self.dil is not None:
+ self.dilations = nn.ModuleList()
+ for i in range(4):
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
+ self.conv_reduces.append(MapReduce(self.dil))
+ else:
+ for i in range(4):
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
+
+ self.classifier = nn.Conv2d(4, 1, kernel_size=1) # has bias
+ nn.init.constant_(self.classifier.weight, 0.25)
+ nn.init.constant_(self.classifier.bias, 0)
+
+ # print('initialization done')
+
+ def get_weights(self):
+ conv_weights = []
+ bn_weights = []
+ relu_weights = []
+ for pname, p in self.named_parameters():
+ if 'bn' in pname:
+ bn_weights.append(p)
+ elif 'relu' in pname:
+ relu_weights.append(p)
+ else:
+ conv_weights.append(p)
+
+ return conv_weights, bn_weights, relu_weights
+
+ def forward(self, x):
+ H, W = x.size()[2:]
+
+ x = self.init_block(x)
+
+ x1 = self.block1_1(x)
+ x1 = self.block1_2(x1)
+ x1 = self.block1_3(x1)
+
+ x2 = self.block2_1(x1)
+ x2 = self.block2_2(x2)
+ x2 = self.block2_3(x2)
+ x2 = self.block2_4(x2)
+
+ x3 = self.block3_1(x2)
+ x3 = self.block3_2(x3)
+ x3 = self.block3_3(x3)
+ x3 = self.block3_4(x3)
+
+ x4 = self.block4_1(x3)
+ x4 = self.block4_2(x4)
+ x4 = self.block4_3(x4)
+ x4 = self.block4_4(x4)
+
+ x_fuses = []
+ if self.sa and self.dil is not None:
+ for i, xi in enumerate([x1, x2, x3, x4]):
+ x_fuses.append(self.attentions[i](self.dilations[i](xi)))
+ elif self.sa:
+ for i, xi in enumerate([x1, x2, x3, x4]):
+ x_fuses.append(self.attentions[i](xi))
+ elif self.dil is not None:
+ for i, xi in enumerate([x1, x2, x3, x4]):
+ x_fuses.append(self.dilations[i](xi))
+ else:
+ x_fuses = [x1, x2, x3, x4]
+
+ e1 = self.conv_reduces[0](x_fuses[0])
+ e1 = F.interpolate(e1, (H, W), mode="bilinear", align_corners=False)
+
+ e2 = self.conv_reduces[1](x_fuses[1])
+ e2 = F.interpolate(e2, (H, W), mode="bilinear", align_corners=False)
+
+ e3 = self.conv_reduces[2](x_fuses[2])
+ e3 = F.interpolate(e3, (H, W), mode="bilinear", align_corners=False)
+
+ e4 = self.conv_reduces[3](x_fuses[3])
+ e4 = F.interpolate(e4, (H, W), mode="bilinear", align_corners=False)
+
+ outputs = [e1, e2, e3, e4]
+
+ output = self.classifier(torch.cat(outputs, dim=1))
+ #if not self.training:
+ # return torch.sigmoid(output)
+
+ outputs.append(output)
+ outputs = [torch.sigmoid(r) for r in outputs]
+ return outputs
+
+def config_model(model):
+ model_options = list(nets.keys())
+ assert model in model_options, \
+ 'unrecognized model, please choose from %s' % str(model_options)
+
+ # print(str(nets[model]))
+
+ pdcs = []
+ for i in range(16):
+ layer_name = 'layer%d' % i
+ op = nets[model][layer_name]
+ pdcs.append(createConvFunc(op))
+
+ return pdcs
+
+def pidinet():
+ pdcs = config_model('carv4')
+    dil = 24  # the original CLI toggled this via args.dil; fixed to 24 here
+ return PiDiNet(60, pdcs, dil=dil, sa=True)
+
+
+if __name__ == '__main__':
+ model = pidinet()
+ ckp = torch.load('table5_pidinet.pth')['state_dict']
+ model.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})
+ im = cv2.imread('examples/test_my/cat_v4.png')
+ im = img2tensor(im).unsqueeze(0)/255.
+ res = model(im)[-1]
+ res = res>0.5
+ res = res.float()
+ res = (res[0,0].cpu().data.numpy()*255.).astype(np.uint8)
+ print(res.shape)
+ cv2.imwrite('edge.png', res)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/processor.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c8ab9d4d613da78696d1814628533da4747549b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/processor.py
@@ -0,0 +1,147 @@
+"""
+This file contains a Processor that can be used to process images with controlnet aux processors
+"""
+import io
+import logging
+from typing import Dict, Optional, Union
+
+from PIL import Image
+
+from custom_controlnet_aux import (CannyDetector, ContentShuffleDetector, HEDdetector,
+ LeresDetector, LineartAnimeDetector,
+ LineartDetector, MediapipeFaceDetector,
+ MidasDetector, MLSDdetector, NormalBaeDetector,
+ OpenposeDetector, PidiNetDetector, ZoeDetector, TileDetector)
+
+LOGGER = logging.getLogger(__name__)
+
+
+MODELS = {
+ # checkpoint models
+ 'scribble_hed': {'class': HEDdetector, 'checkpoint': True},
+ 'softedge_hed': {'class': HEDdetector, 'checkpoint': True},
+ 'scribble_hedsafe': {'class': HEDdetector, 'checkpoint': True},
+ 'softedge_hedsafe': {'class': HEDdetector, 'checkpoint': True},
+ 'depth_midas': {'class': MidasDetector, 'checkpoint': True},
+ 'mlsd': {'class': MLSDdetector, 'checkpoint': True},
+ 'openpose': {'class': OpenposeDetector, 'checkpoint': True},
+ 'openpose_face': {'class': OpenposeDetector, 'checkpoint': True},
+ 'openpose_faceonly': {'class': OpenposeDetector, 'checkpoint': True},
+ 'openpose_full': {'class': OpenposeDetector, 'checkpoint': True},
+ 'openpose_hand': {'class': OpenposeDetector, 'checkpoint': True},
+ 'scribble_pidinet': {'class': PidiNetDetector, 'checkpoint': True},
+ 'softedge_pidinet': {'class': PidiNetDetector, 'checkpoint': True},
+ 'scribble_pidsafe': {'class': PidiNetDetector, 'checkpoint': True},
+ 'softedge_pidsafe': {'class': PidiNetDetector, 'checkpoint': True},
+ 'normal_bae': {'class': NormalBaeDetector, 'checkpoint': True},
+ 'lineart_coarse': {'class': LineartDetector, 'checkpoint': True},
+ 'lineart_realistic': {'class': LineartDetector, 'checkpoint': True},
+ 'lineart_anime': {'class': LineartAnimeDetector, 'checkpoint': True},
+ 'depth_zoe': {'class': ZoeDetector, 'checkpoint': True},
+ 'depth_leres': {'class': LeresDetector, 'checkpoint': True},
+ 'depth_leres++': {'class': LeresDetector, 'checkpoint': True},
+ # instantiate
+ 'shuffle': {'class': ContentShuffleDetector, 'checkpoint': False},
+ 'mediapipe_face': {'class': MediapipeFaceDetector, 'checkpoint': False},
+ 'canny': {'class': CannyDetector, 'checkpoint': False},
+ 'tile': {'class': TileDetector, 'checkpoint': False},
+}
+
+
+MODEL_PARAMS = {
+ 'scribble_hed': {'scribble': True},
+ 'softedge_hed': {'scribble': False},
+ 'scribble_hedsafe': {'scribble': True, 'safe': True},
+ 'softedge_hedsafe': {'scribble': False, 'safe': True},
+ 'depth_midas': {},
+ 'mlsd': {},
+ 'openpose': {'include_body': True, 'include_hand': False, 'include_face': False},
+ 'openpose_face': {'include_body': True, 'include_hand': False, 'include_face': True},
+ 'openpose_faceonly': {'include_body': False, 'include_hand': False, 'include_face': True},
+ 'openpose_full': {'include_body': True, 'include_hand': True, 'include_face': True},
+ 'openpose_hand': {'include_body': False, 'include_hand': True, 'include_face': False},
+ 'scribble_pidinet': {'safe': False, 'scribble': True},
+ 'softedge_pidinet': {'safe': False, 'scribble': False},
+ 'scribble_pidsafe': {'safe': True, 'scribble': True},
+ 'softedge_pidsafe': {'safe': True, 'scribble': False},
+ 'normal_bae': {},
+ 'lineart_realistic': {'coarse': False},
+ 'lineart_coarse': {'coarse': True},
+ 'lineart_anime': {},
+ 'canny': {},
+ 'shuffle': {},
+ 'depth_zoe': {},
+ 'depth_leres': {'boost': False},
+ 'depth_leres++': {'boost': True},
+ 'mediapipe_face': {},
+ 'tile': {},
+}
+
+CHOICES = f"Choices for the processor are {list(MODELS.keys())}"
+
+
+class Processor:
+ def __init__(self, processor_id: str, params: Optional[Dict] = None) -> None:
+ """Processor that can be used to process images with controlnet aux processors
+
+ Args:
+            processor_id (str): processor name; must be one of the keys of MODELS, e.g.
+                                'canny', 'depth_midas', 'mlsd', 'openpose_full', 'scribble_hed',
+                                'softedge_pidinet', 'normal_bae', 'lineart_anime', 'shuffle',
+                                'depth_zoe', 'mediapipe_face', 'tile' (see CHOICES for the full list)
+ params (Optional[Dict]): parameters for the processor
+ """
+        LOGGER.info("Loading %s", processor_id)
+
+ if processor_id not in MODELS:
+ raise ValueError(f"{processor_id} is not a valid processor id. Please make sure to choose one of {', '.join(MODELS.keys())}")
+
+ self.processor_id = processor_id
+ self.processor = self.load_processor(self.processor_id)
+
+ # load default params
+ self.params = MODEL_PARAMS[self.processor_id]
+ # update with user params
+ if params:
+ self.params.update(params)
+
+ def load_processor(self, processor_id: str) -> 'Processor':
+ """Load controlnet aux processors
+
+ Args:
+ processor_id (str): processor name
+
+ Returns:
+ Processor: controlnet aux processor
+ """
+ processor = MODELS[processor_id]['class']
+
+        # check if the processor is a checkpoint model
+ if MODELS[processor_id]['checkpoint']:
+ processor = processor.from_pretrained("lllyasviel/Annotators")
+ else:
+ processor = processor()
+ return processor
+
+ def __call__(self, image: Union[Image.Image, bytes],
+ to_pil: bool = True) -> Union[Image.Image, bytes]:
+ """processes an image with a controlnet aux processor
+
+ Args:
+ image (Union[Image.Image, bytes]): input image in bytes or PIL Image
+ to_pil (bool): whether to return bytes or PIL Image
+
+ Returns:
+ Union[Image.Image, bytes]: processed image in bytes or PIL Image
+ """
+ # check if bytes or PIL Image
+ if isinstance(image, bytes):
+ image = Image.open(io.BytesIO(image)).convert("RGB")
+
+ processed_image = self.processor(image, **self.params)
+
+ if to_pil:
+ return processed_image
+ else:
+ output_bytes = io.BytesIO()
+ processed_image.save(output_bytes, format='JPEG')
+ return output_bytes.getvalue()
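+
+
+if __name__ == "__main__":
+    # Illustrative usage (assumes a local input.png; "canny" needs no checkpoint download).
+    processor = Processor("canny")
+    with open("input.png", "rb") as f:
+        jpeg_bytes = processor(f.read(), to_pil=False)
+    with open("canny_edges.jpg", "wb") as f:
+        f.write(jpeg_bytes)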
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/pyracanny/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/pyracanny/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f573aae85cd7ed3cd3f4ee07db9975bb32c4d987
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/pyracanny/__init__.py
@@ -0,0 +1,74 @@
+import warnings
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import resize_image_with_pad, common_input_validate, HWC3
+
+def centered_canny(x: np.ndarray, canny_low_threshold, canny_high_threshold):
+ assert isinstance(x, np.ndarray)
+ assert x.ndim == 2 and x.dtype == np.uint8
+
+ y = cv2.Canny(x, int(canny_low_threshold), int(canny_high_threshold))
+ y = y.astype(np.float32) / 255.0
+ return y
+
+def centered_canny_color(x: np.ndarray, canny_low_threshold, canny_high_threshold):
+ assert isinstance(x, np.ndarray)
+ assert x.ndim == 3 and x.shape[2] == 3
+
+ result = [centered_canny(x[..., i], canny_low_threshold, canny_high_threshold) for i in range(3)]
+ result = np.stack(result, axis=2)
+ return result
+
+def pyramid_canny_color(x: np.ndarray, canny_low_threshold, canny_high_threshold):
+ assert isinstance(x, np.ndarray)
+ assert x.ndim == 3 and x.shape[2] == 3
+
+ H, W, C = x.shape
+ acc_edge = None
+
+ for k in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
+ Hs, Ws = int(H * k), int(W * k)
+ small = cv2.resize(x, (Ws, Hs), interpolation=cv2.INTER_AREA)
+ edge = centered_canny_color(small, canny_low_threshold, canny_high_threshold)
+ if acc_edge is None:
+ acc_edge = edge
+ else:
+ acc_edge = cv2.resize(acc_edge, (edge.shape[1], edge.shape[0]), interpolation=cv2.INTER_LINEAR)
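+            # Running blend across scales: the previously accumulated (coarser) edges keep
+            # 75% of the weight, the newly computed finer scale contributes 25%.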
+ acc_edge = acc_edge * 0.75 + edge * 0.25
+
+ return acc_edge
+
+def norm255(x, low=4, high=96):
+ assert isinstance(x, np.ndarray)
+ assert x.ndim == 2 and x.dtype == np.float32
+
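+    # Robust contrast stretch: map the low/high percentiles to 0/255 so a handful of
+    # extreme pixels cannot dominate the normalisation (callers clip afterwards).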
+ v_min = np.percentile(x, low)
+ v_max = np.percentile(x, high)
+
+ x -= v_min
+ x /= v_max - v_min
+
+ return x * 255.0
+
+def canny_pyramid(x, canny_low_threshold, canny_high_threshold):
+    # For some reason, SAI's Control-lora Canny seems to be trained on canny maps with non-standard resolutions.
+    # We therefore run Canny over an image pyramid so that structure visible only at certain resolutions is not missed.
+
+ color_canny = pyramid_canny_color(x, canny_low_threshold, canny_high_threshold)
+ result = np.sum(color_canny, axis=2)
+
+ return norm255(result, low=1, high=99).clip(0, 255).astype(np.uint8)
+
+class PyraCannyDetector:
+ def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ detected_map = canny_pyramid(detected_map, low_threshold, high_threshold)
+ detected_map = HWC3(remove_pad(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/recolor/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/recolor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dec6faf2556363ad0730435a45645758806eb850
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/recolor/__init__.py
@@ -0,0 +1,39 @@
+import warnings
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import resize_image_with_pad, common_input_validate, HWC3
+
+#https://github.com/Mikubill/sd-webui-controlnet/blob/416c345072c9c2066101e225964e3986abe6945e/scripts/processor.py#L639
+def recolor_luminance(img, thr_a=1.0):
+ result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2LAB)
+ result = result[:, :, 0].astype(np.float32) / 255.0
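+    # thr_a acts as a gamma on the normalised channel: values < 1 brighten mid-tones,
+    # values > 1 darken them; thr_a = 1.0 leaves the channel unchanged.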
+ result = result ** thr_a
+ result = (result * 255.0).clip(0, 255).astype(np.uint8)
+ result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB)
+ return result
+
+
+def recolor_intensity(img, thr_a=1.0):
+ result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2HSV)
+ result = result[:, :, 2].astype(np.float32) / 255.0
+ result = result ** thr_a
+ result = (result * 255.0).clip(0, 255).astype(np.uint8)
+ result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB)
+ return result
+
+recolor_methods = {
+ "luminance": recolor_luminance,
+ "intensity": recolor_intensity
+}
+
+class Recolorizer:
+ def __call__(self, input_image=None, mode="luminance", gamma_correction=1.0, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+ assert mode in recolor_methods.keys()
+ detected_map = recolor_methods[mode](input_image, gamma_correction)
+ detected_map = HWC3(remove_pad(detected_map))
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+ return detected_map
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8b8d6865e494fc8318ea9364a909538ffe2bf2a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/__init__.py
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import warnings
+from typing import Union
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, SAM_MODEL_NAME
+from .automatic_mask_generator import SamAutomaticMaskGenerator
+from .build_sam import sam_model_registry
+
+
+class SamDetector:
+ def __init__(self, mask_generator: SamAutomaticMaskGenerator):
+ self.mask_generator = mask_generator
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=SAM_MODEL_NAME, model_type="vit_t", filename="mobile_sam.pt", subfolder=None):
+ """
+        Possible model_type: vit_h, vit_l, vit_b, vit_t
+ download weights from https://github.com/facebookresearch/segment-anything
+ """
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ sam = sam_model_registry[model_type](checkpoint=model_path)
+ mask_generator = SamAutomaticMaskGenerator(sam)
+
+ return cls(mask_generator)
+
+ def to(self, device):
+ model = self.mask_generator.predictor.model.to(device)
+ model.train(False) #Update attention_bias in https://github.com/Fannovel16/comfyui_controlnet_aux/blob/main/src/custom_controlnet_aux/segment_anything/modeling/tiny_vit_sam.py#L251
+ self.mask_generator = SamAutomaticMaskGenerator(model)
+ return self
+
+
+ def show_anns(self, anns):
+ if len(anns) == 0:
+ return
+ sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+ h, w = anns[0]['segmentation'].shape
+ final_img = Image.fromarray(np.zeros((h, w, 3), dtype=np.uint8), mode="RGB")
+ for ann in sorted_anns:
+ m = ann['segmentation']
+ img = np.empty((m.shape[0], m.shape[1], 3), dtype=np.uint8)
+ for i in range(3):
+ img[:,:,i] = np.random.randint(255, dtype=np.uint8)
+ final_img.paste(Image.fromarray(img, mode="RGB"), (0, 0), Image.fromarray(np.uint8(m*255)))
+
+ return np.array(final_img, dtype=np.uint8)
+
+ def __call__(self, input_image: Union[np.ndarray, Image.Image]=None, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", **kwargs) -> Image.Image:
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ # Generate Masks
+ masks = self.mask_generator.generate(input_image)
+ # Create map
+ map = self.show_anns(masks)
+
+ detected_map = HWC3(remove_pad(map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
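+
+
+if __name__ == "__main__":
+    # Illustrative usage (assumes the default MobileSAM checkpoint "mobile_sam.pt" can be
+    # downloaded and a local photo.png exists).
+    detector = SamDetector.from_pretrained()
+    seg_map = detector(Image.open("photo.png"), detect_resolution=512)
+    seg_map.save("sam_segments.png")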
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/automatic_mask_generator.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/automatic_mask_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed13dafb9c0831e22bb0122c0133cf1d8bfcbdf2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/automatic_mask_generator.py
@@ -0,0 +1,372 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area # type: ignore
+
+from typing import Any, Dict, List, Optional, Tuple
+
+from .modeling import Sam
+from .predictor import SamPredictor
+from .utils.amg import (
+ MaskData,
+ area_from_rle,
+ batch_iterator,
+ batched_mask_to_box,
+ box_xyxy_to_xywh,
+ build_all_layer_point_grids,
+ calculate_stability_score,
+ coco_encode_rle,
+ generate_crop_boxes,
+ is_box_near_crop_edge,
+ mask_to_rle_pytorch,
+ remove_small_regions,
+ rle_to_mask,
+ uncrop_boxes_xyxy,
+ uncrop_masks,
+ uncrop_points,
+)
+
+
+class SamAutomaticMaskGenerator:
+ def __init__(
+ self,
+ model: Sam,
+ points_per_side: Optional[int] = 32,
+ points_per_batch: int = 64,
+ pred_iou_thresh: float = 0.88,
+ stability_score_thresh: float = 0.95,
+ stability_score_offset: float = 1.0,
+ box_nms_thresh: float = 0.7,
+ crop_n_layers: int = 0,
+ crop_nms_thresh: float = 0.7,
+ crop_overlap_ratio: float = 512 / 1500,
+ crop_n_points_downscale_factor: int = 1,
+ point_grids: Optional[List[np.ndarray]] = None,
+ min_mask_region_area: int = 0,
+ output_mode: str = "binary_mask",
+ ) -> None:
+ """
+ Using a SAM model, generates masks for the entire image.
+ Generates a grid of point prompts over the image, then filters
+ low quality and duplicate masks. The default settings are chosen
+ for SAM with a ViT-H backbone.
+
+ Arguments:
+ model (Sam): The SAM model to use for mask prediction.
+ points_per_side (int or None): The number of points to be sampled
+ along one side of the image. The total number of points is
+ points_per_side**2. If None, 'point_grids' must provide explicit
+ point sampling.
+ points_per_batch (int): Sets the number of points run simultaneously
+ by the model. Higher numbers may be faster but use more GPU memory.
+ pred_iou_thresh (float): A filtering threshold in [0,1], using the
+ model's predicted mask quality.
+ stability_score_thresh (float): A filtering threshold in [0,1], using
+ the stability of the mask under changes to the cutoff used to binarize
+ the model's mask predictions.
+ stability_score_offset (float): The amount to shift the cutoff when
+            calculating the stability score.
+ box_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks.
+ crop_n_layers (int): If >0, mask prediction will be run again on
+ crops of the image. Sets the number of layers to run, where each
+ layer has 2**i_layer number of image crops.
+ crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks between different crops.
+ crop_overlap_ratio (float): Sets the degree to which crops overlap.
+ In the first crop layer, crops will overlap by this fraction of
+ the image length. Later layers with more crops scale down this overlap.
+ crop_n_points_downscale_factor (int): The number of points-per-side
+ sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ point_grids (list(np.ndarray) or None): A list over explicit grids
+ of points used for sampling, normalized to [0,1]. The nth grid in the
+ list is used in the nth crop layer. Exclusive with points_per_side.
+ min_mask_region_area (int): If >0, postprocessing will be applied
+ to remove disconnected regions and holes in masks with area smaller
+ than min_mask_region_area. Requires opencv.
+ output_mode (str): The form masks are returned in. Can be 'binary_mask',
+ 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+ For large resolutions, 'binary_mask' may consume large amounts of
+ memory.
+ """
+
+ assert (points_per_side is None) != (
+ point_grids is None
+ ), "Exactly one of points_per_side or point_grid must be provided."
+ if points_per_side is not None:
+ self.point_grids = build_all_layer_point_grids(
+ points_per_side,
+ crop_n_layers,
+ crop_n_points_downscale_factor,
+ )
+ elif point_grids is not None:
+ self.point_grids = point_grids
+ else:
+ raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+ assert output_mode in [
+ "binary_mask",
+ "uncompressed_rle",
+ "coco_rle",
+ ], f"Unknown output_mode {output_mode}."
+ if output_mode == "coco_rle":
+ from custom_pycocotools import mask as mask_utils # type: ignore # noqa: F401
+
+ if min_mask_region_area > 0:
+ import cv2 # type: ignore # noqa: F401
+
+ self.predictor = SamPredictor(model)
+ self.points_per_batch = points_per_batch
+ self.pred_iou_thresh = pred_iou_thresh
+ self.stability_score_thresh = stability_score_thresh
+ self.stability_score_offset = stability_score_offset
+ self.box_nms_thresh = box_nms_thresh
+ self.crop_n_layers = crop_n_layers
+ self.crop_nms_thresh = crop_nms_thresh
+ self.crop_overlap_ratio = crop_overlap_ratio
+ self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+ self.min_mask_region_area = min_mask_region_area
+ self.output_mode = output_mode
+
+ @torch.no_grad()
+ def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+ """
+ Generates masks for the given image.
+
+ Arguments:
+ image (np.ndarray): The image to generate masks for, in HWC uint8 format.
+
+ Returns:
+ list(dict(str, any)): A list over records for masks. Each record is
+ a dict containing the following keys:
+ segmentation (dict(str, any) or np.ndarray): The mask. If
+ output_mode='binary_mask', is an array of shape HW. Otherwise,
+ is a dictionary containing the RLE.
+ bbox (list(float)): The box around the mask, in XYWH format.
+ area (int): The area in pixels of the mask.
+ predicted_iou (float): The model's own prediction of the mask's
+ quality. This is filtered by the pred_iou_thresh parameter.
+ point_coords (list(list(float))): The point coordinates input
+ to the model to generate this mask.
+ stability_score (float): A measure of the mask's quality. This
+ is filtered on using the stability_score_thresh parameter.
+ crop_box (list(float)): The crop of the image used to generate
+ the mask, given in XYWH format.
+ """
+
+ # Generate masks
+ mask_data = self._generate_masks(image)
+
+ # Filter small disconnected regions and holes in masks
+ if self.min_mask_region_area > 0:
+ mask_data = self.postprocess_small_regions(
+ mask_data,
+ self.min_mask_region_area,
+ max(self.box_nms_thresh, self.crop_nms_thresh),
+ )
+
+ # Encode masks
+ if self.output_mode == "coco_rle":
+ mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
+ elif self.output_mode == "binary_mask":
+ mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+ else:
+ mask_data["segmentations"] = mask_data["rles"]
+
+ # Write mask records
+ curr_anns = []
+ for idx in range(len(mask_data["segmentations"])):
+ ann = {
+ "segmentation": mask_data["segmentations"][idx],
+ "area": area_from_rle(mask_data["rles"][idx]),
+ "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+ "predicted_iou": mask_data["iou_preds"][idx].item(),
+ "point_coords": [mask_data["points"][idx].tolist()],
+ "stability_score": mask_data["stability_score"][idx].item(),
+ "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+ }
+ curr_anns.append(ann)
+
+ return curr_anns
+
+ def _generate_masks(self, image: np.ndarray) -> MaskData:
+ orig_size = image.shape[:2]
+ crop_boxes, layer_idxs = generate_crop_boxes(
+ orig_size, self.crop_n_layers, self.crop_overlap_ratio
+ )
+
+ # Iterate over image crops
+ data = MaskData()
+ for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
+ crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
+ data.cat(crop_data)
+
+ # Remove duplicate masks between crops
+ if len(crop_boxes) > 1:
+ # Prefer masks from smaller crops
+ scores = 1 / box_area(data["crop_boxes"])
+ scores = scores.to(data["boxes"].device)
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ scores,
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.crop_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ data.to_numpy()
+ return data
+
+ def _process_crop(
+ self,
+ image: np.ndarray,
+ crop_box: List[int],
+ crop_layer_idx: int,
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ # Crop the image and calculate embeddings
+ x0, y0, x1, y1 = crop_box
+ cropped_im = image[y0:y1, x0:x1, :]
+ cropped_im_size = cropped_im.shape[:2]
+ self.predictor.set_image(cropped_im)
+
+ # Get points for this crop
+ points_scale = np.array(cropped_im_size)[None, ::-1]
+ points_for_image = self.point_grids[crop_layer_idx] * points_scale
+
+ # Generate masks for this crop in batches
+ data = MaskData()
+ for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+ batch_data = self._process_batch(points, cropped_im_size, crop_box, orig_size)
+ data.cat(batch_data)
+ del batch_data
+ self.predictor.reset_image()
+
+ # Remove duplicates within this crop.
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ data["iou_preds"],
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.box_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ # Return to the original image frame
+ data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+ data["points"] = uncrop_points(data["points"], crop_box)
+ data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+
+ return data
+
+ def _process_batch(
+ self,
+ points: np.ndarray,
+ im_size: Tuple[int, ...],
+ crop_box: List[int],
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ orig_h, orig_w = orig_size
+
+ # Run model on this batch
+ transformed_points = self.predictor.transform.apply_coords(points, im_size)
+ in_points = torch.as_tensor(transformed_points, device=self.predictor.device)
+ in_labels = torch.ones(in_points.shape[0], dtype=torch.int, device=in_points.device)
+ masks, iou_preds, _ = self.predictor.predict_torch(
+ in_points[:, None, :],
+ in_labels[:, None],
+ multimask_output=True,
+ return_logits=True,
+ )
+
+ # Serialize predictions and store in MaskData
+ data = MaskData(
+ masks=masks.flatten(0, 1),
+ iou_preds=iou_preds.flatten(0, 1),
+ points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
+ )
+ del masks
+
+ # Filter by predicted IoU
+ if self.pred_iou_thresh > 0.0:
+ keep_mask = data["iou_preds"] > self.pred_iou_thresh
+ data.filter(keep_mask)
+
+ # Calculate stability score
+ data["stability_score"] = calculate_stability_score(
+ data["masks"], self.predictor.model.mask_threshold, self.stability_score_offset
+ )
+ if self.stability_score_thresh > 0.0:
+ keep_mask = data["stability_score"] >= self.stability_score_thresh
+ data.filter(keep_mask)
+
+ # Threshold masks and calculate boxes
+ data["masks"] = data["masks"] > self.predictor.model.mask_threshold
+ data["boxes"] = batched_mask_to_box(data["masks"])
+
+ # Filter boxes that touch crop boundaries
+ keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
+ if not torch.all(keep_mask):
+ data.filter(keep_mask)
+
+ # Compress to RLE
+ data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+ data["rles"] = mask_to_rle_pytorch(data["masks"])
+ del data["masks"]
+
+ return data
+
+ @staticmethod
+ def postprocess_small_regions(
+ mask_data: MaskData, min_area: int, nms_thresh: float
+ ) -> MaskData:
+ """
+ Removes small disconnected regions and holes in masks, then reruns
+ box NMS to remove any new duplicates.
+
+ Edits mask_data in place.
+
+ Requires open-cv as a dependency.
+ """
+ if len(mask_data["rles"]) == 0:
+ return mask_data
+
+ # Filter small disconnected regions and holes
+ new_masks = []
+ scores = []
+ for rle in mask_data["rles"]:
+ mask = rle_to_mask(rle)
+
+ mask, changed = remove_small_regions(mask, min_area, mode="holes")
+ unchanged = not changed
+ mask, changed = remove_small_regions(mask, min_area, mode="islands")
+ unchanged = unchanged and not changed
+
+ new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+ # Give score=0 to changed masks and score=1 to unchanged masks
+ # so NMS will prefer ones that didn't need postprocessing
+ scores.append(float(unchanged))
+
+ # Recalculate boxes and remove any new duplicates
+ masks = torch.cat(new_masks, dim=0)
+ boxes = batched_mask_to_box(masks)
+ keep_by_nms = batched_nms(
+ boxes.float(),
+ torch.as_tensor(scores),
+ torch.zeros_like(boxes[:, 0]), # categories
+ iou_threshold=nms_thresh,
+ )
+
+ # Only recalculate RLEs for masks that have changed
+ for i_mask in keep_by_nms:
+ if scores[i_mask] == 0.0:
+ mask_torch = masks[i_mask].unsqueeze(0)
+ mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+ mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
+ mask_data.filter(keep_by_nms)
+
+ return mask_data
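
The scoring trick in `postprocess_small_regions` above is the part worth noting: untouched masks get score 1.0 and cleaned-up masks get 0.0, so when hole/island removal makes two masks collide, box NMS keeps the one that needed no postprocessing. A minimal, self-contained sketch of that behaviour (hypothetical box coordinates, using torchvision's `batched_nms` exactly as the file does):

import torch
from torchvision.ops.boxes import batched_nms

# Two nearly identical boxes: index 0 is an untouched mask (score 1.0),
# index 1 needed hole/island removal (score 0.0).
boxes = torch.tensor([[10.0, 10.0, 50.0, 50.0],
                      [11.0, 10.0, 50.0, 50.0]])
scores = torch.tensor([1.0, 0.0])
categories = torch.zeros_like(boxes[:, 0])  # single category, as in the file

keep = batched_nms(boxes, scores, categories, iou_threshold=0.7)
print(keep)  # tensor([0]) -> NMS prefers the mask that was not modified
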
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/build_sam.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/build_sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..603d6b40568f7ff614891a3257105fffafa14d6f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/build_sam.py
@@ -0,0 +1,159 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from functools import partial
+
+from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer, TinyViT
+
+
+def build_sam_vit_h(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=1280,
+ encoder_depth=32,
+ encoder_num_heads=16,
+ encoder_global_attn_indexes=[7, 15, 23, 31],
+ checkpoint=checkpoint,
+ )
+
+
+build_sam = build_sam_vit_h
+
+
+def build_sam_vit_l(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=1024,
+ encoder_depth=24,
+ encoder_num_heads=16,
+ encoder_global_attn_indexes=[5, 11, 17, 23],
+ checkpoint=checkpoint,
+ )
+
+
+def build_sam_vit_b(checkpoint=None):
+ return _build_sam(
+ encoder_embed_dim=768,
+ encoder_depth=12,
+ encoder_num_heads=12,
+ encoder_global_attn_indexes=[2, 5, 8, 11],
+ checkpoint=checkpoint,
+ )
+
+
+def build_sam_vit_t(checkpoint=None):
+ prompt_embed_dim = 256
+ image_size = 1024
+ vit_patch_size = 16
+ image_embedding_size = image_size // vit_patch_size
+ mobile_sam = Sam(
+ image_encoder=TinyViT(img_size=1024, in_chans=3, num_classes=1000,
+ embed_dims=[64, 128, 160, 320],
+ depths=[2, 2, 6, 2],
+ num_heads=[2, 4, 5, 10],
+ window_sizes=[7, 7, 14, 7],
+ mlp_ratio=4.,
+ drop_rate=0.,
+ drop_path_rate=0.0,
+ use_checkpoint=False,
+ mbconv_expand_ratio=4.0,
+ local_conv_size=3,
+ layer_lr_decay=0.8
+ ),
+ prompt_encoder=PromptEncoder(
+ embed_dim=prompt_embed_dim,
+ image_embedding_size=(image_embedding_size, image_embedding_size),
+ input_image_size=(image_size, image_size),
+ mask_in_chans=16,
+ ),
+ mask_decoder=MaskDecoder(
+ num_multimask_outputs=3,
+ transformer=TwoWayTransformer(
+ depth=2,
+ embedding_dim=prompt_embed_dim,
+ mlp_dim=2048,
+ num_heads=8,
+ ),
+ transformer_dim=prompt_embed_dim,
+ iou_head_depth=3,
+ iou_head_hidden_dim=256,
+ ),
+ pixel_mean=[123.675, 116.28, 103.53],
+ pixel_std=[58.395, 57.12, 57.375],
+ )
+
+ mobile_sam.eval()
+ if checkpoint is not None:
+ with open(checkpoint, "rb") as f:
+ state_dict = torch.load(f)
+ mobile_sam.load_state_dict(state_dict)
+ return mobile_sam
+
+
+sam_model_registry = {
+ "default": build_sam_vit_h,
+ "vit_h": build_sam_vit_h,
+ "vit_l": build_sam_vit_l,
+ "vit_b": build_sam_vit_b,
+ "vit_t": build_sam_vit_t,
+}
+
+
+def _build_sam(
+ encoder_embed_dim,
+ encoder_depth,
+ encoder_num_heads,
+ encoder_global_attn_indexes,
+ checkpoint=None,
+):
+ prompt_embed_dim = 256
+ image_size = 1024
+ vit_patch_size = 16
+ image_embedding_size = image_size // vit_patch_size
+ sam = Sam(
+ image_encoder=ImageEncoderViT(
+ depth=encoder_depth,
+ embed_dim=encoder_embed_dim,
+ img_size=image_size,
+ mlp_ratio=4,
+ norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
+ num_heads=encoder_num_heads,
+ patch_size=vit_patch_size,
+ qkv_bias=True,
+ use_rel_pos=True,
+ global_attn_indexes=encoder_global_attn_indexes,
+ window_size=14,
+ out_chans=prompt_embed_dim,
+ ),
+ prompt_encoder=PromptEncoder(
+ embed_dim=prompt_embed_dim,
+ image_embedding_size=(image_embedding_size, image_embedding_size),
+ input_image_size=(image_size, image_size),
+ mask_in_chans=16,
+ ),
+ mask_decoder=MaskDecoder(
+ num_multimask_outputs=3,
+ transformer=TwoWayTransformer(
+ depth=2,
+ embedding_dim=prompt_embed_dim,
+ mlp_dim=2048,
+ num_heads=8,
+ ),
+ transformer_dim=prompt_embed_dim,
+ iou_head_depth=3,
+ iou_head_hidden_dim=256,
+ ),
+ pixel_mean=[123.675, 116.28, 103.53],
+ pixel_std=[58.395, 57.12, 57.375],
+ )
+ sam.eval()
+ if checkpoint is not None:
+ with open(checkpoint, "rb") as f:
+ state_dict = torch.load(f)
+ sam.load_state_dict(state_dict)
+ return sam
+
+
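
A minimal usage sketch for the builders above, assuming the vendored `src/` layout is importable as `custom_controlnet_aux`; the checkpoint argument is optional and any path passed there would be hypothetical:

import torch
from custom_controlnet_aux.sam.build_sam import sam_model_registry

# Build a randomly initialised ViT-B SAM; pass checkpoint="<path>.pth" to load weights.
sam = sam_model_registry["vit_b"](checkpoint=None)
device = "cuda" if torch.cuda.is_available() else "cpu"
sam.to(device)

# All variants share the 1024x1024 input frame, so the image embedding grid
# is image_size // vit_patch_size = 1024 // 16 = 64 on each side.
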
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ff1a5d6ea19a7d2361c596345a623cbd6791d48
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .sam import Sam
+from .image_encoder import ImageEncoderViT
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+from .transformer import TwoWayTransformer
+from .tiny_vit_sam import TinyViT
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/common.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c92073d1fd6a44d9a7f3abb9ab610d3ccbcac12
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/common.py
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+from typing import Type
+
+
+class MLPBlock(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int,
+ mlp_dim: int,
+ act: Type[nn.Module] = nn.GELU,
+ ) -> None:
+ super().__init__()
+ self.lin1 = nn.Linear(embedding_dim, mlp_dim)
+ self.lin2 = nn.Linear(mlp_dim, embedding_dim)
+ self.act = act()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.lin2(self.act(self.lin1(x)))
+
+
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
+class LayerNorm2d(nn.Module):
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(num_channels))
+ self.bias = nn.Parameter(torch.zeros(num_channels))
+ self.eps = eps
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ u = x.mean(1, keepdim=True)
+ s = (x - u).pow(2).mean(1, keepdim=True)
+ x = (x - u) / torch.sqrt(s + self.eps)
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
+ return x
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/image_encoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/image_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e74d81fd0bd8e7c33c3e323ba16ab81f37a779b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/image_encoder.py
@@ -0,0 +1,395 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from typing import Optional, Tuple, Type
+
+from .common import LayerNorm2d, MLPBlock
+
+
+# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
+class ImageEncoderViT(nn.Module):
+ def __init__(
+ self,
+ img_size: int = 1024,
+ patch_size: int = 16,
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ depth: int = 12,
+ num_heads: int = 12,
+ mlp_ratio: float = 4.0,
+ out_chans: int = 256,
+ qkv_bias: bool = True,
+ norm_layer: Type[nn.Module] = nn.LayerNorm,
+ act_layer: Type[nn.Module] = nn.GELU,
+ use_abs_pos: bool = True,
+ use_rel_pos: bool = False,
+ rel_pos_zero_init: bool = True,
+ window_size: int = 0,
+ global_attn_indexes: Tuple[int, ...] = (),
+ ) -> None:
+ """
+ Args:
+ img_size (int): Input image size.
+ patch_size (int): Patch size.
+ in_chans (int): Number of input image channels.
+ embed_dim (int): Patch embedding dimension.
+ depth (int): Depth of ViT.
+ num_heads (int): Number of attention heads in each ViT block.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
+ norm_layer (nn.Module): Normalization layer.
+ act_layer (nn.Module): Activation layer.
+ use_abs_pos (bool): If True, use absolute positional embeddings.
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+ window_size (int): Window size for window attention blocks.
+ global_attn_indexes (list): Indexes for blocks using global attention.
+ """
+ super().__init__()
+ self.img_size = img_size
+
+ self.patch_embed = PatchEmbed(
+ kernel_size=(patch_size, patch_size),
+ stride=(patch_size, patch_size),
+ in_chans=in_chans,
+ embed_dim=embed_dim,
+ )
+
+ self.pos_embed: Optional[nn.Parameter] = None
+ if use_abs_pos:
+ # Initialize absolute positional embedding with pretrain image size.
+ self.pos_embed = nn.Parameter(
+ torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)
+ )
+
+ self.blocks = nn.ModuleList()
+ for i in range(depth):
+ block = Block(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ norm_layer=norm_layer,
+ act_layer=act_layer,
+ use_rel_pos=use_rel_pos,
+ rel_pos_zero_init=rel_pos_zero_init,
+ window_size=window_size if i not in global_attn_indexes else 0,
+ input_size=(img_size // patch_size, img_size // patch_size),
+ )
+ self.blocks.append(block)
+
+ self.neck = nn.Sequential(
+ nn.Conv2d(
+ embed_dim,
+ out_chans,
+ kernel_size=1,
+ bias=False,
+ ),
+ LayerNorm2d(out_chans),
+ nn.Conv2d(
+ out_chans,
+ out_chans,
+ kernel_size=3,
+ padding=1,
+ bias=False,
+ ),
+ LayerNorm2d(out_chans),
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.patch_embed(x)
+ if self.pos_embed is not None:
+ x = x + self.pos_embed
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x = self.neck(x.permute(0, 3, 1, 2))
+
+ return x
+
+
+class Block(nn.Module):
+ """Transformer blocks with support of window attention and residual propagation blocks"""
+
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ qkv_bias: bool = True,
+ norm_layer: Type[nn.Module] = nn.LayerNorm,
+ act_layer: Type[nn.Module] = nn.GELU,
+ use_rel_pos: bool = False,
+ rel_pos_zero_init: bool = True,
+ window_size: int = 0,
+ input_size: Optional[Tuple[int, int]] = None,
+ ) -> None:
+ """
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads in each ViT block.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
+ norm_layer (nn.Module): Normalization layer.
+ act_layer (nn.Module): Activation layer.
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+ window_size (int): Window size for window attention blocks. If it equals 0, then
+ use global attention.
+ input_size (tuple(int, int) or None): Input resolution for calculating the relative
+ positional parameter size.
+ """
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ use_rel_pos=use_rel_pos,
+ rel_pos_zero_init=rel_pos_zero_init,
+ input_size=input_size if window_size == 0 else (window_size, window_size),
+ )
+
+ self.norm2 = norm_layer(dim)
+ self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
+
+ self.window_size = window_size
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ shortcut = x
+ x = self.norm1(x)
+ # Window partition
+ if self.window_size > 0:
+ H, W = x.shape[1], x.shape[2]
+ x, pad_hw = window_partition(x, self.window_size)
+
+ x = self.attn(x)
+ # Reverse window partition
+ if self.window_size > 0:
+ x = window_unpartition(x, self.window_size, pad_hw, (H, W))
+
+ x = shortcut + x
+ x = x + self.mlp(self.norm2(x))
+
+ return x
+
+
+class Attention(nn.Module):
+ """Multi-head Attention block with relative position embeddings."""
+
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int = 8,
+ qkv_bias: bool = True,
+ use_rel_pos: bool = False,
+ rel_pos_zero_init: bool = True,
+ input_size: Optional[Tuple[int, int]] = None,
+ ) -> None:
+ """
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads.
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+ input_size (tuple(int, int) or None): Input resolution for calculating the relative
+ positional parameter size.
+ """
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = head_dim**-0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.proj = nn.Linear(dim, dim)
+
+ self.use_rel_pos = use_rel_pos
+ if self.use_rel_pos:
+ assert (
+ input_size is not None
+ ), "Input size must be provided if using relative positional encoding."
+ # initialize relative positional embeddings
+ self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+ self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ B, H, W, _ = x.shape
+ # qkv with shape (3, B, nHead, H * W, C)
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+ # q, k, v with shape (B * nHead, H * W, C)
+ q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
+
+ attn = (q * self.scale) @ k.transpose(-2, -1)
+
+ if self.use_rel_pos:
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
+
+ attn = attn.softmax(dim=-1)
+ x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
+ x = self.proj(x)
+
+ return x
+
+
+def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
+ """
+ Partition into non-overlapping windows with padding if needed.
+ Args:
+ x (tensor): input tokens with [B, H, W, C].
+ window_size (int): window size.
+
+ Returns:
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
+ (Hp, Wp): padded height and width before partition
+ """
+ B, H, W, C = x.shape
+
+ pad_h = (window_size - H % window_size) % window_size
+ pad_w = (window_size - W % window_size) % window_size
+ if pad_h > 0 or pad_w > 0:
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+ Hp, Wp = H + pad_h, W + pad_w
+
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+ return windows, (Hp, Wp)
+
+
+def window_unpartition(
+ windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
+) -> torch.Tensor:
+ """
+ Window unpartition into original sequences, removing any padding.
+ Args:
+ windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+ window_size (int): window size.
+ pad_hw (Tuple): padded height and width (Hp, Wp).
+ hw (Tuple): original height and width (H, W) before padding.
+
+ Returns:
+ x: unpartitioned sequences with [B, H, W, C].
+ """
+ Hp, Wp = pad_hw
+ H, W = hw
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+
+ if Hp > H or Wp > W:
+ x = x[:, :H, :W, :].contiguous()
+ return x
+
+
+def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+ """
+ Get relative positional embeddings according to the relative positions of
+ query and key sizes.
+ Args:
+ q_size (int): size of query q.
+ k_size (int): size of key k.
+ rel_pos (Tensor): relative position embeddings (L, C).
+
+ Returns:
+ Extracted positional embeddings according to relative positions.
+ """
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
+ # Interpolate rel pos if needed.
+ if rel_pos.shape[0] != max_rel_dist:
+ # Interpolate rel pos.
+ rel_pos_resized = F.interpolate(
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+ size=max_rel_dist,
+ mode="linear",
+ )
+ rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+ else:
+ rel_pos_resized = rel_pos
+
+ # Scale the coords with short length if shapes for q and k are different.
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+ relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
+
+ return rel_pos_resized[relative_coords.long()]
+
+
+def add_decomposed_rel_pos(
+ attn: torch.Tensor,
+ q: torch.Tensor,
+ rel_pos_h: torch.Tensor,
+ rel_pos_w: torch.Tensor,
+ q_size: Tuple[int, int],
+ k_size: Tuple[int, int],
+) -> torch.Tensor:
+ """
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
+ Args:
+ attn (Tensor): attention map.
+ q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+ rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+ rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+ q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+ k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+
+ Returns:
+ attn (Tensor): attention map with added relative positional embeddings.
+ """
+ q_h, q_w = q_size
+ k_h, k_w = k_size
+ Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+ Rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+ B, _, dim = q.shape
+ r_q = q.reshape(B, q_h, q_w, dim)
+ rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
+ rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
+
+ attn = (
+ attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
+ ).view(B, q_h * q_w, k_h * k_w)
+
+ return attn
+
+
+class PatchEmbed(nn.Module):
+ """
+ Image to Patch Embedding.
+ """
+
+ def __init__(
+ self,
+ kernel_size: Tuple[int, int] = (16, 16),
+ stride: Tuple[int, int] = (16, 16),
+ padding: Tuple[int, int] = (0, 0),
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ ) -> None:
+ """
+ Args:
+ kernel_size (Tuple): kernel size of the projection layer.
+ stride (Tuple): stride of the projection layer.
+ padding (Tuple): padding size of the projection layer.
+ in_chans (int): Number of input image channels.
+ embed_dim (int): Patch embedding dimension.
+ """
+ super().__init__()
+
+ self.proj = nn.Conv2d(
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.proj(x)
+ # B C H W -> B H W C
+ x = x.permute(0, 2, 3, 1)
+ return x
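
The window helpers above pad H and W up to a multiple of `window_size` and crop the padding away on the reverse pass, so partition followed by unpartition is the identity. A small round-trip sketch (random values, import path assumed from the vendored layout):

import torch
from custom_controlnet_aux.sam.modeling.image_encoder import window_partition, window_unpartition

x = torch.randn(1, 10, 13, 32)                   # (B, H, W, C), H and W not multiples of 7
windows, pad_hw = window_partition(x, window_size=7)
print(windows.shape, pad_hw)                     # torch.Size([4, 7, 7, 32]) (14, 14)
x_back = window_unpartition(windows, 7, pad_hw, (10, 13))
print(torch.equal(x, x_back))                    # True
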
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/mask_decoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/mask_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a4fdb868e1b0340d1bb6b1ee84a20eca27be455
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/mask_decoder.py
@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from typing import List, Tuple, Type
+
+from .common import LayerNorm2d
+
+
+class MaskDecoder(nn.Module):
+ def __init__(
+ self,
+ *,
+ transformer_dim: int,
+ transformer: nn.Module,
+ num_multimask_outputs: int = 3,
+ activation: Type[nn.Module] = nn.GELU,
+ iou_head_depth: int = 3,
+ iou_head_hidden_dim: int = 256,
+ ) -> None:
+ """
+ Predicts masks given an image and prompt embeddings, using a
+ transformer architecture.
+
+ Arguments:
+ transformer_dim (int): the channel dimension of the transformer
+ transformer (nn.Module): the transformer used to predict masks
+ num_multimask_outputs (int): the number of masks to predict
+ when disambiguating masks
+ activation (nn.Module): the type of activation to use when
+ upscaling masks
+ iou_head_depth (int): the depth of the MLP used to predict
+ mask quality
+ iou_head_hidden_dim (int): the hidden dimension of the MLP
+ used to predict mask quality
+ """
+ super().__init__()
+ self.transformer_dim = transformer_dim
+ self.transformer = transformer
+
+ self.num_multimask_outputs = num_multimask_outputs
+
+ self.iou_token = nn.Embedding(1, transformer_dim)
+ self.num_mask_tokens = num_multimask_outputs + 1
+ self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
+
+ self.output_upscaling = nn.Sequential(
+ nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
+ LayerNorm2d(transformer_dim // 4),
+ activation(),
+ nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
+ activation(),
+ )
+ self.output_hypernetworks_mlps = nn.ModuleList(
+ [
+ MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
+ for i in range(self.num_mask_tokens)
+ ]
+ )
+
+ self.iou_prediction_head = MLP(
+ transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth
+ )
+
+ def forward(
+ self,
+ image_embeddings: torch.Tensor,
+ image_pe: torch.Tensor,
+ sparse_prompt_embeddings: torch.Tensor,
+ dense_prompt_embeddings: torch.Tensor,
+ multimask_output: bool,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Predict masks given image and prompt embeddings.
+
+ Arguments:
+ image_embeddings (torch.Tensor): the embeddings from the image encoder
+ image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
+ sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
+ dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
+ multimask_output (bool): Whether to return multiple masks or a single
+ mask.
+
+ Returns:
+ torch.Tensor: batched predicted masks
+ torch.Tensor: batched predictions of mask quality
+ """
+ masks, iou_pred = self.predict_masks(
+ image_embeddings=image_embeddings,
+ image_pe=image_pe,
+ sparse_prompt_embeddings=sparse_prompt_embeddings,
+ dense_prompt_embeddings=dense_prompt_embeddings,
+ )
+
+ # Select the correct mask or masks for output
+ if multimask_output:
+ mask_slice = slice(1, None)
+ else:
+ mask_slice = slice(0, 1)
+ masks = masks[:, mask_slice, :, :]
+ iou_pred = iou_pred[:, mask_slice]
+
+ # Prepare output
+ return masks, iou_pred
+
+ def predict_masks(
+ self,
+ image_embeddings: torch.Tensor,
+ image_pe: torch.Tensor,
+ sparse_prompt_embeddings: torch.Tensor,
+ dense_prompt_embeddings: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Predicts masks. See 'forward' for more details."""
+ # Concatenate output tokens
+ output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
+ output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1)
+ tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
+
+ # Expand per-image data in batch direction to be per-mask
+ src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
+ src = src + dense_prompt_embeddings
+ pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
+ b, c, h, w = src.shape
+
+ # Run the transformer
+ hs, src = self.transformer(src, pos_src, tokens)
+ iou_token_out = hs[:, 0, :]
+ mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
+
+ # Upscale mask embeddings and predict masks using the mask tokens
+ src = src.transpose(1, 2).view(b, c, h, w)
+ upscaled_embedding = self.output_upscaling(src)
+ hyper_in_list: List[torch.Tensor] = []
+ for i in range(self.num_mask_tokens):
+ hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]))
+ hyper_in = torch.stack(hyper_in_list, dim=1)
+ b, c, h, w = upscaled_embedding.shape
+ masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
+
+ # Generate mask quality predictions
+ iou_pred = self.iou_prediction_head(iou_token_out)
+
+ return masks, iou_pred
+
+
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
+class MLP(nn.Module):
+ def __init__(
+ self,
+ input_dim: int,
+ hidden_dim: int,
+ output_dim: int,
+ num_layers: int,
+ sigmoid_output: bool = False,
+ ) -> None:
+ super().__init__()
+ self.num_layers = num_layers
+ h = [hidden_dim] * (num_layers - 1)
+ self.layers = nn.ModuleList(
+ nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
+ )
+ self.sigmoid_output = sigmoid_output
+
+ def forward(self, x):
+ for i, layer in enumerate(self.layers):
+ x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+ if self.sigmoid_output:
+ x = F.sigmoid(x)
+ return x
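
A shape-level sketch of the decoder above, using random weights and random stand-in embeddings. `TwoWayTransformer` comes from `transformer.py` later in this diff and is constructed with the same arguments `build_sam.py` uses; the prompt batch of two is arbitrary:

import torch
from custom_controlnet_aux.sam.modeling import MaskDecoder, TwoWayTransformer

dim = 256
decoder = MaskDecoder(
    transformer_dim=dim,
    transformer=TwoWayTransformer(depth=2, embedding_dim=dim, mlp_dim=2048, num_heads=8),
    num_multimask_outputs=3,
)

image_embeddings = torch.randn(1, dim, 64, 64)   # one image embedding
image_pe = torch.randn(1, dim, 64, 64)           # its dense positional encoding
sparse = torch.randn(2, 2, dim)                  # two prompt sets, two sparse tokens each
dense = torch.randn(2, dim, 64, 64)              # dense (mask) embeddings per prompt set

masks, iou_pred = decoder(image_embeddings, image_pe, sparse, dense, multimask_output=True)
print(masks.shape, iou_pred.shape)               # (2, 3, 256, 256) and (2, 3)
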
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/prompt_encoder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/prompt_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f73520ad1318da91f271a623c8497c8b9a31475
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/prompt_encoder.py
@@ -0,0 +1,214 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torch import nn
+
+from typing import Any, Optional, Tuple, Type
+
+from .common import LayerNorm2d
+
+
+class PromptEncoder(nn.Module):
+ def __init__(
+ self,
+ embed_dim: int,
+ image_embedding_size: Tuple[int, int],
+ input_image_size: Tuple[int, int],
+ mask_in_chans: int,
+ activation: Type[nn.Module] = nn.GELU,
+ ) -> None:
+ """
+ Encodes prompts for input to SAM's mask decoder.
+
+ Arguments:
+ embed_dim (int): The prompts' embedding dimension
+ image_embedding_size (tuple(int, int)): The spatial size of the
+ image embedding, as (H, W).
+ input_image_size (tuple(int, int)): The padded size of the image as input
+ to the image encoder, as (H, W).
+ mask_in_chans (int): The number of hidden channels used for
+ encoding input masks.
+ activation (nn.Module): The activation to use when encoding
+ input masks.
+ """
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.input_image_size = input_image_size
+ self.image_embedding_size = image_embedding_size
+ self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
+
+ self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners
+ point_embeddings = [nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)]
+ self.point_embeddings = nn.ModuleList(point_embeddings)
+ self.not_a_point_embed = nn.Embedding(1, embed_dim)
+
+ self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1])
+ self.mask_downscaling = nn.Sequential(
+ nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
+ LayerNorm2d(mask_in_chans // 4),
+ activation(),
+ nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
+ LayerNorm2d(mask_in_chans),
+ activation(),
+ nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
+ )
+ self.no_mask_embed = nn.Embedding(1, embed_dim)
+
+ def get_dense_pe(self) -> torch.Tensor:
+ """
+ Returns the positional encoding used to encode point prompts,
+ applied to a dense set of points the shape of the image encoding.
+
+ Returns:
+ torch.Tensor: Positional encoding with shape
+ 1x(embed_dim)x(embedding_h)x(embedding_w)
+ """
+ return self.pe_layer(self.image_embedding_size).unsqueeze(0)
+
+ def _embed_points(
+ self,
+ points: torch.Tensor,
+ labels: torch.Tensor,
+ pad: bool,
+ ) -> torch.Tensor:
+ """Embeds point prompts."""
+ points = points + 0.5 # Shift to center of pixel
+ if pad:
+ padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
+ padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
+ points = torch.cat([points, padding_point], dim=1)
+ labels = torch.cat([labels, padding_label], dim=1)
+ point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)
+ point_embedding[labels == -1] = 0.0
+ point_embedding[labels == -1] += self.not_a_point_embed.weight
+ point_embedding[labels == 0] += self.point_embeddings[0].weight
+ point_embedding[labels == 1] += self.point_embeddings[1].weight
+ return point_embedding
+
+ def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
+ """Embeds box prompts."""
+ boxes = boxes + 0.5 # Shift to center of pixel
+ coords = boxes.reshape(-1, 2, 2)
+ corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
+ corner_embedding[:, 0, :] += self.point_embeddings[2].weight
+ corner_embedding[:, 1, :] += self.point_embeddings[3].weight
+ return corner_embedding
+
+ def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
+ """Embeds mask inputs."""
+ mask_embedding = self.mask_downscaling(masks)
+ return mask_embedding
+
+ def _get_batch_size(
+ self,
+ points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+ boxes: Optional[torch.Tensor],
+ masks: Optional[torch.Tensor],
+ ) -> int:
+ """
+ Gets the batch size of the output given the batch size of the input prompts.
+ """
+ if points is not None:
+ return points[0].shape[0]
+ elif boxes is not None:
+ return boxes.shape[0]
+ elif masks is not None:
+ return masks.shape[0]
+ else:
+ return 1
+
+ def _get_device(self) -> torch.device:
+ return self.point_embeddings[0].weight.device
+
+ def forward(
+ self,
+ points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+ boxes: Optional[torch.Tensor],
+ masks: Optional[torch.Tensor],
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Embeds different types of prompts, returning both sparse and dense
+ embeddings.
+
+ Arguments:
+ points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
+ and labels to embed.
+ boxes (torch.Tensor or none): boxes to embed
+ masks (torch.Tensor or none): masks to embed
+
+ Returns:
+ torch.Tensor: sparse embeddings for the points and boxes, with shape
+ BxNx(embed_dim), where N is determined by the number of input points
+ and boxes.
+ torch.Tensor: dense embeddings for the masks, in the shape
+ Bx(embed_dim)x(embed_H)x(embed_W)
+ """
+ bs = self._get_batch_size(points, boxes, masks)
+ sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
+ if points is not None:
+ coords, labels = points
+ point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
+ sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
+ if boxes is not None:
+ box_embeddings = self._embed_boxes(boxes)
+ sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
+
+ if masks is not None:
+ dense_embeddings = self._embed_masks(masks)
+ else:
+ dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
+ bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
+ )
+
+ return sparse_embeddings, dense_embeddings
+
+
+class PositionEmbeddingRandom(nn.Module):
+ """
+ Positional encoding using random spatial frequencies.
+ """
+
+ def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+ super().__init__()
+ if scale is None or scale <= 0.0:
+ scale = 1.0
+ self.register_buffer(
+ "positional_encoding_gaussian_matrix",
+ scale * torch.randn((2, num_pos_feats)),
+ )
+
+ def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+ """Positionally encode points that are normalized to [0,1]."""
+ # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+ coords = 2 * coords - 1
+ coords = coords @ self.positional_encoding_gaussian_matrix
+ coords = 2 * np.pi * coords
+ # outputs d_1 x ... x d_n x C shape
+ return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
+
+ def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+ """Generate positional encoding for a grid of the specified size."""
+ h, w = size
+ device: Any = self.positional_encoding_gaussian_matrix.device
+ grid = torch.ones((h, w), device=device, dtype=torch.float32)
+ y_embed = grid.cumsum(dim=0) - 0.5
+ x_embed = grid.cumsum(dim=1) - 0.5
+ y_embed = y_embed / h
+ x_embed = x_embed / w
+
+ pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
+ return pe.permute(2, 0, 1) # C x H x W
+
+ def forward_with_coords(
+ self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+ ) -> torch.Tensor:
+ """Positionally encode points that are not normalized to [0,1]."""
+ coords = coords_input.clone()
+ coords[:, :, 0] = coords[:, :, 0] / image_size[1]
+ coords[:, :, 1] = coords[:, :, 1] / image_size[0]
+ return self._pe_encoding(coords.to(torch.float)) # B x N x C
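
A usage sketch for `PromptEncoder` with random weights and a hypothetical foreground point. Coordinates are given in the 1024x1024 input frame; because no box is supplied, `_embed_points` pads with a "not a point" token, so a single point yields two sparse embeddings:

import torch
from custom_controlnet_aux.sam.modeling import PromptEncoder

pe = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),
    input_image_size=(1024, 1024),
    mask_in_chans=16,
)

coords = torch.tensor([[[512.0, 384.0]]])        # (B=1, N=1, 2), input-frame pixels
labels = torch.tensor([[1]])                     # 1 = foreground, 0 = background
sparse, dense = pe(points=(coords, labels), boxes=None, masks=None)
print(sparse.shape, dense.shape)                 # (1, 2, 256) and (1, 256, 64, 64)
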
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/sam.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee5303e9b5132098214b60e225a7b9a9d96caa4d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/sam.py
@@ -0,0 +1,175 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from typing import Any, Dict, List, Tuple, Union
+
+from .tiny_vit_sam import TinyViT
+from .image_encoder import ImageEncoderViT
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+
+
+class Sam(nn.Module):
+ mask_threshold: float = 0.0
+ image_format: str = "RGB"
+
+ def __init__(
+ self,
+ image_encoder: Union[ImageEncoderViT, TinyViT],
+ prompt_encoder: PromptEncoder,
+ mask_decoder: MaskDecoder,
+ pixel_mean: List[float] = [123.675, 116.28, 103.53],
+ pixel_std: List[float] = [58.395, 57.12, 57.375],
+ ) -> None:
+ """
+ SAM predicts object masks from an image and input prompts.
+
+ Arguments:
+ image_encoder (ImageEncoderViT): The backbone used to encode the
+ image into image embeddings that allow for efficient mask prediction.
+ prompt_encoder (PromptEncoder): Encodes various types of input prompts.
+ mask_decoder (MaskDecoder): Predicts masks from the image embeddings
+ and encoded prompts.
+ pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
+ pixel_std (list(float)): Std values for normalizing pixels in the input image.
+ """
+ super().__init__()
+ self.image_encoder = image_encoder
+ self.prompt_encoder = prompt_encoder
+ self.mask_decoder = mask_decoder
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
+
+ @property
+ def device(self) -> Any:
+ return self.pixel_mean.device
+
+ @torch.no_grad()
+ def forward(
+ self,
+ batched_input: List[Dict[str, Any]],
+ multimask_output: bool,
+ ) -> List[Dict[str, torch.Tensor]]:
+ """
+ Predicts masks end-to-end from provided images and prompts.
+ If prompts are not known in advance, using SamPredictor is
+ recommended over calling the model directly.
+
+ Arguments:
+ batched_input (list(dict)): A list over input images, each a
+ dictionary with the following keys. A prompt key may be
+ omitted if that prompt type is not used for the image.
+ 'image': The image as a torch tensor in 3xHxW format,
+ already transformed for input to the model.
+ 'original_size': (tuple(int, int)) The original size of
+ the image before transformation, as (H, W).
+ 'point_coords': (torch.Tensor) Batched point prompts for
+ this image, with shape BxNx2. Already transformed to the
+ input frame of the model.
+ 'point_labels': (torch.Tensor) Batched labels for point prompts,
+ with shape BxN.
+ 'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
+ Already transformed to the input frame of the model.
+ 'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
+ in the form Bx1xHxW.
+ multimask_output (bool): Whether the model should predict multiple
+ disambiguating masks, or return a single mask.
+
+ Returns:
+ (list(dict)): A list over input images, where each element is
+ a dictionary with the following keys.
+ 'masks': (torch.Tensor) Batched binary mask predictions,
+ with shape BxCxHxW, where B is the number of input prompts,
+ C is determined by multimask_output, and (H, W) is the
+ original size of the image.
+ 'iou_predictions': (torch.Tensor) The model's predictions
+ of mask quality, in shape BxC.
+ 'low_res_logits': (torch.Tensor) Low resolution logits with
+ shape BxCxHxW, where H=W=256. Can be passed as mask input
+ to subsequent iterations of prediction.
+ """
+ input_images = torch.stack([self.preprocess(x["image"]) for x in batched_input], dim=0)
+ image_embeddings = self.image_encoder(input_images)
+
+ outputs = []
+ for image_record, curr_embedding in zip(batched_input, image_embeddings):
+ if "point_coords" in image_record:
+ points = (image_record["point_coords"], image_record["point_labels"])
+ else:
+ points = None
+ sparse_embeddings, dense_embeddings = self.prompt_encoder(
+ points=points,
+ boxes=image_record.get("boxes", None),
+ masks=image_record.get("mask_inputs", None),
+ )
+ low_res_masks, iou_predictions = self.mask_decoder(
+ image_embeddings=curr_embedding.unsqueeze(0),
+ image_pe=self.prompt_encoder.get_dense_pe(),
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ )
+ masks = self.postprocess_masks(
+ low_res_masks,
+ input_size=image_record["image"].shape[-2:],
+ original_size=image_record["original_size"],
+ )
+ masks = masks > self.mask_threshold
+ outputs.append(
+ {
+ "masks": masks,
+ "iou_predictions": iou_predictions,
+ "low_res_logits": low_res_masks,
+ }
+ )
+ return outputs
+
+ def postprocess_masks(
+ self,
+ masks: torch.Tensor,
+ input_size: Tuple[int, ...],
+ original_size: Tuple[int, ...],
+ ) -> torch.Tensor:
+ """
+ Remove padding and upscale masks to the original image size.
+
+ Arguments:
+ masks (torch.Tensor): Batched masks from the mask_decoder,
+ in BxCxHxW format.
+ input_size (tuple(int, int)): The size of the image input to the
+ model, in (H, W) format. Used to remove padding.
+ original_size (tuple(int, int)): The original size of the image
+ before resizing for input to the model, in (H, W) format.
+
+ Returns:
+ (torch.Tensor): Batched masks in BxCxHxW format, where (H, W)
+ is given by original_size.
+ """
+ masks = F.interpolate(
+ masks,
+ (self.image_encoder.img_size, self.image_encoder.img_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+ masks = masks[..., : input_size[0], : input_size[1]]
+ masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False)
+ return masks
+
+ def preprocess(self, x: torch.Tensor) -> torch.Tensor:
+ """Normalize pixel values and pad to a square input."""
+ # Normalize colors
+ x = (x - self.pixel_mean) / self.pixel_std
+
+ # Pad
+ h, w = x.shape[-2:]
+ padh = self.image_encoder.img_size - h
+ padw = self.image_encoder.img_size - w
+ x = F.pad(x, (0, padw, 0, padh))
+ return x
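
An end-to-end sketch of the `batched_input` contract documented in `forward` above. The model is randomly initialised (so the masks are meaningless), the image/point values are hypothetical stand-ins assumed to be already resized to the 1024-pixel input frame, and the forward pass is slow on CPU:

import torch
from custom_controlnet_aux.sam.build_sam import build_sam_vit_b

sam = build_sam_vit_b(checkpoint=None)

batched_input = [{
    "image": torch.randn(3, 1024, 768),                 # 3xHxW, longest side already 1024
    "original_size": (1500, 1125),                      # (H, W) before resizing
    "point_coords": torch.tensor([[[500.0, 400.0]]]),   # BxNx2, in the input frame
    "point_labels": torch.tensor([[1]]),                # BxN, 1 = foreground
}]

outputs = sam(batched_input, multimask_output=True)
print(outputs[0]["masks"].shape)             # torch.Size([1, 3, 1500, 1125]), bool
print(outputs[0]["iou_predictions"].shape)   # torch.Size([1, 3])
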
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/tiny_vit_sam.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/tiny_vit_sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f5ce068ee1576bcfb5cc48ec00f6a3905db143f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/tiny_vit_sam.py
@@ -0,0 +1,716 @@
+# --------------------------------------------------------
+# TinyViT Model Architecture
+# Copyright (c) 2022 Microsoft
+# Adapted from LeViT and Swin Transformer
+# LeViT: (https://github.com/facebookresearch/levit)
+# Swin: (https://github.com/microsoft/swin-transformer)
+# Build the TinyViT Model
+# --------------------------------------------------------
+
+import itertools
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from custom_timm.models.layers import DropPath as TimmDropPath,\
+ to_2tuple, trunc_normal_
+from custom_timm.models.registry import register_model
+from typing import Tuple
+
+
+class Conv2d_BN(torch.nn.Sequential):
+ def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
+ groups=1, bn_weight_init=1):
+ super().__init__()
+ self.add_module('c', torch.nn.Conv2d(
+ a, b, ks, stride, pad, dilation, groups, bias=False))
+ bn = torch.nn.BatchNorm2d(b)
+ torch.nn.init.constant_(bn.weight, bn_weight_init)
+ torch.nn.init.constant_(bn.bias, 0)
+ self.add_module('bn', bn)
+
+ @torch.no_grad()
+ def fuse(self):
+ c, bn = self._modules.values()
+ w = bn.weight / (bn.running_var + bn.eps)**0.5
+ w = c.weight * w[:, None, None, None]
+ b = bn.bias - bn.running_mean * bn.weight / \
+ (bn.running_var + bn.eps)**0.5
+ m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
+ 0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups)
+ m.weight.data.copy_(w)
+ m.bias.data.copy_(b)
+ return m
+
+
+class DropPath(TimmDropPath):
+ def __init__(self, drop_prob=None):
+ super().__init__(drop_prob=drop_prob)
+ self.drop_prob = drop_prob
+
+ def __repr__(self):
+ msg = super().__repr__()
+ msg += f'(drop_prob={self.drop_prob})'
+ return msg
+
+
+class PatchEmbed(nn.Module):
+ def __init__(self, in_chans, embed_dim, resolution, activation):
+ super().__init__()
+ img_size: Tuple[int, int] = to_2tuple(resolution)
+ self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
+ self.num_patches = self.patches_resolution[0] * \
+ self.patches_resolution[1]
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+ n = embed_dim
+ self.seq = nn.Sequential(
+ Conv2d_BN(in_chans, n // 2, 3, 2, 1),
+ activation(),
+ Conv2d_BN(n // 2, n, 3, 2, 1),
+ )
+
+ def forward(self, x):
+ return self.seq(x)
+
+
+class MBConv(nn.Module):
+ def __init__(self, in_chans, out_chans, expand_ratio,
+ activation, drop_path):
+ super().__init__()
+ self.in_chans = in_chans
+ self.hidden_chans = int(in_chans * expand_ratio)
+ self.out_chans = out_chans
+
+ self.conv1 = Conv2d_BN(in_chans, self.hidden_chans, ks=1)
+ self.act1 = activation()
+
+ self.conv2 = Conv2d_BN(self.hidden_chans, self.hidden_chans,
+ ks=3, stride=1, pad=1, groups=self.hidden_chans)
+ self.act2 = activation()
+
+ self.conv3 = Conv2d_BN(
+ self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0)
+ self.act3 = activation()
+
+ self.drop_path = DropPath(
+ drop_path) if drop_path > 0. else nn.Identity()
+
+ def forward(self, x):
+ shortcut = x
+
+ x = self.conv1(x)
+ x = self.act1(x)
+
+ x = self.conv2(x)
+ x = self.act2(x)
+
+ x = self.conv3(x)
+
+ x = self.drop_path(x)
+
+ x += shortcut
+ x = self.act3(x)
+
+ return x
+
+
+class PatchMerging(nn.Module):
+ def __init__(self, input_resolution, dim, out_dim, activation):
+ super().__init__()
+
+ self.input_resolution = input_resolution
+ self.dim = dim
+ self.out_dim = out_dim
+ self.act = activation()
+ self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
+ stride_c = 2
+ if out_dim in (320, 448, 576):
+ stride_c = 1
+ self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
+ self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
+
+ def forward(self, x):
+ if x.ndim == 3:
+ H, W = self.input_resolution
+ B = len(x)
+ # (B, C, H, W)
+ x = x.view(B, H, W, -1).permute(0, 3, 1, 2)
+
+ x = self.conv1(x)
+ x = self.act(x)
+
+ x = self.conv2(x)
+ x = self.act(x)
+ x = self.conv3(x)
+ x = x.flatten(2).transpose(1, 2)
+ return x
+
+
+class ConvLayer(nn.Module):
+ def __init__(self, dim, input_resolution, depth,
+ activation,
+ drop_path=0., downsample=None, use_checkpoint=False,
+ out_dim=None,
+ conv_expand_ratio=4.,
+ ):
+
+ super().__init__()
+ self.dim = dim
+ self.input_resolution = input_resolution
+ self.depth = depth
+ self.use_checkpoint = use_checkpoint
+
+ # build blocks
+ self.blocks = nn.ModuleList([
+ MBConv(dim, dim, conv_expand_ratio, activation,
+ drop_path[i] if isinstance(drop_path, list) else drop_path,
+ )
+ for i in range(depth)])
+
+ # patch merging layer
+ if downsample is not None:
+ self.downsample = downsample(
+ input_resolution, dim=dim, out_dim=out_dim, activation=activation)
+ else:
+ self.downsample = None
+
+ def forward(self, x):
+ for blk in self.blocks:
+ if self.use_checkpoint:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ if self.downsample is not None:
+ x = self.downsample(x)
+ return x
+
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None,
+ out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.norm = nn.LayerNorm(in_features)
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.act = act_layer()
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.norm(x)
+
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class Attention(torch.nn.Module):
+ def __init__(self, dim, key_dim, num_heads=8,
+ attn_ratio=4,
+ resolution=(14, 14),
+ ):
+ super().__init__()
+ # (h, w)
+ assert isinstance(resolution, tuple) and len(resolution) == 2
+ self.num_heads = num_heads
+ self.scale = key_dim ** -0.5
+ self.key_dim = key_dim
+ self.nh_kd = nh_kd = key_dim * num_heads
+ self.d = int(attn_ratio * key_dim)
+ self.dh = int(attn_ratio * key_dim) * num_heads
+ self.attn_ratio = attn_ratio
+ h = self.dh + nh_kd * 2
+
+ self.norm = nn.LayerNorm(dim)
+ self.qkv = nn.Linear(dim, h)
+ self.proj = nn.Linear(self.dh, dim)
+
+ points = list(itertools.product(
+ range(resolution[0]), range(resolution[1])))
+ N = len(points)
+ attention_offsets = {}
+ idxs = []
+ for p1 in points:
+ for p2 in points:
+ offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+ if offset not in attention_offsets:
+ attention_offsets[offset] = len(attention_offsets)
+ idxs.append(attention_offsets[offset])
+ self.attention_biases = torch.nn.Parameter(
+ torch.zeros(num_heads, len(attention_offsets)))
+ self.register_buffer('attention_bias_idxs',
+ torch.LongTensor(idxs).view(N, N),
+ persistent=False)
+
+ @torch.no_grad()
+ def train(self, mode=True):
+ super().train(mode)
+ if mode and hasattr(self, 'ab'):
+ del self.ab
+ else:
+ self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+ def forward(self, x): # x (B,N,C)
+ B, N, _ = x.shape
+
+ # Normalization
+ x = self.norm(x)
+
+ qkv = self.qkv(x)
+ # (B, N, num_heads, d)
+ q, k, v = qkv.view(B, N, self.num_heads, -1).split(
+ [self.key_dim, self.key_dim, self.d], dim=3)
+ # (B, num_heads, N, d)
+ q = q.permute(0, 2, 1, 3)
+ k = k.permute(0, 2, 1, 3)
+ v = v.permute(0, 2, 1, 3)
+
+ attn = (
+ (q @ k.transpose(-2, -1)) * self.scale
+ +
+ (self.attention_biases[:, self.attention_bias_idxs]
+ if self.training else self.ab)
+ )
+ attn = attn.softmax(dim=-1)
+ x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
+ x = self.proj(x)
+ return x
+
+
+class TinyViTBlock(nn.Module):
+ r""" TinyViT Block.
+
+ Args:
+ dim (int): Number of input channels.
+ input_resolution (tuple[int, int]): Input resolution.
+ num_heads (int): Number of attention heads.
+ window_size (int): Window size.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ drop (float, optional): Dropout rate. Default: 0.0
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
+ local_conv_size (int): the kernel size of the convolution between
+ Attention and MLP. Default: 3
+ activation: the activation function. Default: nn.GELU
+ """
+
+ def __init__(self, dim, input_resolution, num_heads, window_size=7,
+ mlp_ratio=4., drop=0., drop_path=0.,
+ local_conv_size=3,
+ activation=nn.GELU,
+ ):
+ super().__init__()
+ self.dim = dim
+ self.input_resolution = input_resolution
+ self.num_heads = num_heads
+ assert window_size > 0, 'window_size must be greater than 0'
+ self.window_size = window_size
+ self.mlp_ratio = mlp_ratio
+
+ self.drop_path = DropPath(
+ drop_path) if drop_path > 0. else nn.Identity()
+
+ assert dim % num_heads == 0, 'dim must be divisible by num_heads'
+ head_dim = dim // num_heads
+
+ window_resolution = (window_size, window_size)
+ self.attn = Attention(dim, head_dim, num_heads,
+ attn_ratio=1, resolution=window_resolution)
+
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ mlp_activation = activation
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
+ act_layer=mlp_activation, drop=drop)
+
+ pad = local_conv_size // 2
+ self.local_conv = Conv2d_BN(
+ dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim)
+
+ def forward(self, x):
+ H, W = self.input_resolution
+ B, L, C = x.shape
+ assert L == H * W, "input feature has wrong size"
+ res_x = x
+ if H == self.window_size and W == self.window_size:
+ x = self.attn(x)
+ else:
+ x = x.view(B, H, W, C)
+ pad_b = (self.window_size - H %
+ self.window_size) % self.window_size
+ pad_r = (self.window_size - W %
+ self.window_size) % self.window_size
+ padding = pad_b > 0 or pad_r > 0
+
+ if padding:
+ x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
+
+ pH, pW = H + pad_b, W + pad_r
+ nH = pH // self.window_size
+ nW = pW // self.window_size
+ # window partition
+ x = x.view(B, nH, self.window_size, nW, self.window_size, C).transpose(2, 3).reshape(
+ B * nH * nW, self.window_size * self.window_size, C)
+ x = self.attn(x)
+ # window reverse
+ x = x.view(B, nH, nW, self.window_size, self.window_size,
+ C).transpose(2, 3).reshape(B, pH, pW, C)
+
+ if padding:
+ x = x[:, :H, :W].contiguous()
+
+ x = x.view(B, L, C)
+
+ x = res_x + self.drop_path(x)
+
+ x = x.transpose(1, 2).reshape(B, C, H, W)
+ x = self.local_conv(x)
+ x = x.view(B, C, L).transpose(1, 2)
+
+ x = x + self.drop_path(self.mlp(x))
+ return x
+
+ def extra_repr(self) -> str:
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
+ f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}"
+
+
+class BasicLayer(nn.Module):
+ """ A basic TinyViT layer for one stage.
+
+ Args:
+ dim (int): Number of input channels.
+ input_resolution (tuple[int]): Input resolution.
+ depth (int): Number of blocks.
+ num_heads (int): Number of attention heads.
+ window_size (int): Local window size.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ drop (float, optional): Dropout rate. Default: 0.0
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+ local_conv_size: the kernel size of the depthwise convolution between attention and MLP. Default: 3
+ activation: the activation function. Default: nn.GELU
+ out_dim: the output dimension of the layer. Default: dim
+ """
+
+ def __init__(self, dim, input_resolution, depth, num_heads, window_size,
+ mlp_ratio=4., drop=0.,
+ drop_path=0., downsample=None, use_checkpoint=False,
+ local_conv_size=3,
+ activation=nn.GELU,
+ out_dim=None,
+ ):
+
+ super().__init__()
+ self.dim = dim
+ self.input_resolution = input_resolution
+ self.depth = depth
+ self.use_checkpoint = use_checkpoint
+
+ # build blocks
+ self.blocks = nn.ModuleList([
+ TinyViTBlock(dim=dim, input_resolution=input_resolution,
+ num_heads=num_heads, window_size=window_size,
+ mlp_ratio=mlp_ratio,
+ drop=drop,
+ drop_path=drop_path[i] if isinstance(
+ drop_path, list) else drop_path,
+ local_conv_size=local_conv_size,
+ activation=activation,
+ )
+ for i in range(depth)])
+
+ # patch merging layer
+ if downsample is not None:
+ self.downsample = downsample(
+ input_resolution, dim=dim, out_dim=out_dim, activation=activation)
+ else:
+ self.downsample = None
+
+ def forward(self, x):
+ for blk in self.blocks:
+ if self.use_checkpoint:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ if self.downsample is not None:
+ x = self.downsample(x)
+ return x
+
+ def extra_repr(self) -> str:
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
+
+class LayerNorm2d(nn.Module):
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(num_channels))
+ self.bias = nn.Parameter(torch.zeros(num_channels))
+ self.eps = eps
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ u = x.mean(1, keepdim=True)
+ s = (x - u).pow(2).mean(1, keepdim=True)
+ x = (x - u) / torch.sqrt(s + self.eps)
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
+ return x
+
+
+class TinyViT(nn.Module):
+ def __init__(self, img_size=224, in_chans=3, num_classes=1000,
+ embed_dims=[96, 192, 384, 768], depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 24],
+ window_sizes=[7, 7, 14, 7],
+ mlp_ratio=4.,
+ drop_rate=0.,
+ drop_path_rate=0.1,
+ use_checkpoint=False,
+ mbconv_expand_ratio=4.0,
+ local_conv_size=3,
+ layer_lr_decay=1.0,
+ ):
+ super().__init__()
+ self.img_size = img_size
+ self.num_classes = num_classes
+ self.depths = depths
+ self.num_layers = len(depths)
+ self.mlp_ratio = mlp_ratio
+
+ activation = nn.GELU
+
+ self.patch_embed = PatchEmbed(in_chans=in_chans,
+ embed_dim=embed_dims[0],
+ resolution=img_size,
+ activation=activation)
+
+ patches_resolution = self.patch_embed.patches_resolution
+ self.patches_resolution = patches_resolution
+
+ # stochastic depth
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate,
+ sum(depths))] # stochastic depth decay rule
+
+ # build layers
+ self.layers = nn.ModuleList()
+ for i_layer in range(self.num_layers):
+ kwargs = dict(dim=embed_dims[i_layer],
+ input_resolution=(patches_resolution[0] // (2 ** (i_layer-1 if i_layer == 3 else i_layer)),
+ patches_resolution[1] // (2 ** (i_layer-1 if i_layer == 3 else i_layer))),
+ # input_resolution=(patches_resolution[0] // (2 ** i_layer),
+ # patches_resolution[1] // (2 ** i_layer)),
+ depth=depths[i_layer],
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+ downsample=PatchMerging if (
+ i_layer < self.num_layers - 1) else None,
+ use_checkpoint=use_checkpoint,
+ out_dim=embed_dims[min(
+ i_layer + 1, len(embed_dims) - 1)],
+ activation=activation,
+ )
+ if i_layer == 0:
+ layer = ConvLayer(
+ conv_expand_ratio=mbconv_expand_ratio,
+ **kwargs,
+ )
+ else:
+ layer = BasicLayer(
+ num_heads=num_heads[i_layer],
+ window_size=window_sizes[i_layer],
+ mlp_ratio=self.mlp_ratio,
+ drop=drop_rate,
+ local_conv_size=local_conv_size,
+ **kwargs)
+ self.layers.append(layer)
+
+ # Classifier head
+ self.norm_head = nn.LayerNorm(embed_dims[-1])
+ self.head = nn.Linear(
+ embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
+
+ # init weights
+ self.apply(self._init_weights)
+ self.set_layer_lr_decay(layer_lr_decay)
+ self.neck = nn.Sequential(
+ nn.Conv2d(
+ embed_dims[-1],
+ 256,
+ kernel_size=1,
+ bias=False,
+ ),
+ LayerNorm2d(256),
+ nn.Conv2d(
+ 256,
+ 256,
+ kernel_size=3,
+ padding=1,
+ bias=False,
+ ),
+ LayerNorm2d(256),
+ )
+ def set_layer_lr_decay(self, layer_lr_decay):
+ decay_rate = layer_lr_decay
+
+ # layers -> blocks (depth)
+ depth = sum(self.depths)
+ lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]
+ #print("LR SCALES:", lr_scales)
+
+ def _set_lr_scale(m, scale):
+ for p in m.parameters():
+ p.lr_scale = scale
+
+ self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0]))
+ i = 0
+ for layer in self.layers:
+ for block in layer.blocks:
+ block.apply(lambda x: _set_lr_scale(x, lr_scales[i]))
+ i += 1
+ if layer.downsample is not None:
+ layer.downsample.apply(
+ lambda x: _set_lr_scale(x, lr_scales[i - 1]))
+ assert i == depth
+ for m in [self.norm_head, self.head]:
+ m.apply(lambda x: _set_lr_scale(x, lr_scales[-1]))
+
+ for k, p in self.named_parameters():
+ p.param_name = k
+
+ def _check_lr_scale(m):
+ for p in m.parameters():
+ assert hasattr(p, 'lr_scale'), p.param_name
+
+ self.apply(_check_lr_scale)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ @torch.jit.ignore
+ def no_weight_decay_keywords(self):
+ return {'attention_biases'}
+
+ def forward_features(self, x):
+ # x: (N, C, H, W)
+ x = self.patch_embed(x)
+
+ x = self.layers[0](x)
+ start_i = 1
+
+ for i in range(start_i, len(self.layers)):
+ layer = self.layers[i]
+ x = layer(x)
+ B, _, C = x.size()
+ x = x.view(B, 64, 64, C)
+ x = x.permute(0, 3, 1, 2)
+ x = self.neck(x)
+ return x
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ #x = self.norm_head(x)
+ #x = self.head(x)
+ return x
+
+
+_checkpoint_url_format = \
+ 'https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/{}.pth'
+_provided_checkpoints = {
+ 'tiny_vit_5m_224': 'tiny_vit_5m_22kto1k_distill',
+ 'tiny_vit_11m_224': 'tiny_vit_11m_22kto1k_distill',
+ 'tiny_vit_21m_224': 'tiny_vit_21m_22kto1k_distill',
+ 'tiny_vit_21m_384': 'tiny_vit_21m_22kto1k_384_distill',
+ 'tiny_vit_21m_512': 'tiny_vit_21m_22kto1k_512_distill',
+}
+
+
+def register_tiny_vit_model(fn):
+ '''Register a TinyViT model
+ A wrapper around `register_model` that also loads the pretrained checkpoint when requested.
+ '''
+ def fn_wrapper(pretrained=False, **kwargs):
+ model = fn()
+ if pretrained:
+ model_name = fn.__name__
+ assert model_name in _provided_checkpoints, \
+ f'Sorry that the checkpoint `{model_name}` is not provided yet.'
+ url = _checkpoint_url_format.format(
+ _provided_checkpoints[model_name])
+ checkpoint = torch.hub.load_state_dict_from_url(
+ url=url,
+ map_location='cpu', check_hash=False,
+ )
+ model.load_state_dict(checkpoint['model'])
+
+ return model
+
+ # rename the name of fn_wrapper
+ fn_wrapper.__name__ = fn.__name__
+ return register_model(fn_wrapper)
+
+
+@register_tiny_vit_model
+def tiny_vit_5m_224(pretrained=False, num_classes=1000, drop_path_rate=0.0):
+ return TinyViT(
+ num_classes=num_classes,
+ embed_dims=[64, 128, 160, 320],
+ depths=[2, 2, 6, 2],
+ num_heads=[2, 4, 5, 10],
+ window_sizes=[7, 7, 14, 7],
+ drop_path_rate=drop_path_rate,
+ )
+
+
+@register_tiny_vit_model
+def tiny_vit_11m_224(pretrained=False, num_classes=1000, drop_path_rate=0.1):
+ return TinyViT(
+ num_classes=num_classes,
+ embed_dims=[64, 128, 256, 448],
+ depths=[2, 2, 6, 2],
+ num_heads=[2, 4, 8, 14],
+ window_sizes=[7, 7, 14, 7],
+ drop_path_rate=drop_path_rate,
+ )
+
+
+@register_tiny_vit_model
+def tiny_vit_21m_224(pretrained=False, num_classes=1000, drop_path_rate=0.2):
+ return TinyViT(
+ num_classes=num_classes,
+ embed_dims=[96, 192, 384, 576],
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 18],
+ window_sizes=[7, 7, 14, 7],
+ drop_path_rate=drop_path_rate,
+ )
+
+
+@register_tiny_vit_model
+def tiny_vit_21m_384(pretrained=False, num_classes=1000, drop_path_rate=0.1):
+ return TinyViT(
+ img_size=384,
+ num_classes=num_classes,
+ embed_dims=[96, 192, 384, 576],
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 18],
+ window_sizes=[12, 12, 24, 12],
+ drop_path_rate=drop_path_rate,
+ )
+
+
+@register_tiny_vit_model
+def tiny_vit_21m_512(pretrained=False, num_classes=1000, drop_path_rate=0.1):
+ return TinyViT(
+ img_size=512,
+ num_classes=num_classes,
+ embed_dims=[96, 192, 384, 576],
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 18],
+ window_sizes=[16, 16, 32, 16],
+ drop_path_rate=drop_path_rate,
+ )
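+
+
+# --- Hedged usage sketch (illustrative comment, not part of the upstream file) ---
+# forward_features() above reshapes tokens to a fixed 64x64 grid before the
+# SAM-style neck, which appears to assume an overall stride of 16 on a
+# 1024x1024 input (1024 / 16 = 64), as in the MobileSAM image encoder:
+#   model = TinyViT(img_size=1024, embed_dims=[64, 128, 160, 320],
+#                   depths=[2, 2, 6, 2], num_heads=[2, 4, 5, 10],
+#                   window_sizes=[7, 7, 14, 7])
+#   feats = model(torch.randn(1, 3, 1024, 1024))   # -> (1, 256, 64, 64)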
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/transformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d99f8e8265b5780dd3be1d8c6bbd33156ac1d8f4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/modeling/transformer.py
@@ -0,0 +1,240 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import Tensor, nn
+
+import math
+from typing import Tuple, Type
+
+from .common import MLPBlock
+
+
+class TwoWayTransformer(nn.Module):
+ def __init__(
+ self,
+ depth: int,
+ embedding_dim: int,
+ num_heads: int,
+ mlp_dim: int,
+ activation: Type[nn.Module] = nn.ReLU,
+ attention_downsample_rate: int = 2,
+ ) -> None:
+ """
+ A transformer decoder that attends to an input image using
+ queries whose positional embedding is supplied.
+
+ Args:
+ depth (int): number of layers in the transformer
+ embedding_dim (int): the channel dimension for the input embeddings
+ num_heads (int): the number of heads for multihead attention. Must
+ divide embedding_dim
+ mlp_dim (int): the channel dimension internal to the MLP block
+ activation (nn.Module): the activation to use in the MLP block
+ """
+ super().__init__()
+ self.depth = depth
+ self.embedding_dim = embedding_dim
+ self.num_heads = num_heads
+ self.mlp_dim = mlp_dim
+ self.layers = nn.ModuleList()
+
+ for i in range(depth):
+ self.layers.append(
+ TwoWayAttentionBlock(
+ embedding_dim=embedding_dim,
+ num_heads=num_heads,
+ mlp_dim=mlp_dim,
+ activation=activation,
+ attention_downsample_rate=attention_downsample_rate,
+ skip_first_layer_pe=(i == 0),
+ )
+ )
+
+ self.final_attn_token_to_image = Attention(
+ embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+ )
+ self.norm_final_attn = nn.LayerNorm(embedding_dim)
+
+ def forward(
+ self,
+ image_embedding: Tensor,
+ image_pe: Tensor,
+ point_embedding: Tensor,
+ ) -> Tuple[Tensor, Tensor]:
+ """
+ Args:
+ image_embedding (torch.Tensor): image to attend to. Should be shape
+ B x embedding_dim x h x w for any h and w.
+ image_pe (torch.Tensor): the positional encoding to add to the image. Must
+ have the same shape as image_embedding.
+ point_embedding (torch.Tensor): the embedding to add to the query points.
+ Must have shape B x N_points x embedding_dim for any N_points.
+
+ Returns:
+ torch.Tensor: the processed point_embedding
+ torch.Tensor: the processed image_embedding
+ """
+ # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+ bs, c, h, w = image_embedding.shape
+ image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+ image_pe = image_pe.flatten(2).permute(0, 2, 1)
+
+ # Prepare queries
+ queries = point_embedding
+ keys = image_embedding
+
+ # Apply transformer blocks and final layernorm
+ for layer in self.layers:
+ queries, keys = layer(
+ queries=queries,
+ keys=keys,
+ query_pe=point_embedding,
+ key_pe=image_pe,
+ )
+
+ # Apply the final attention layer from the points to the image
+ q = queries + point_embedding
+ k = keys + image_pe
+ attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+ queries = queries + attn_out
+ queries = self.norm_final_attn(queries)
+
+ return queries, keys
+
+
+class TwoWayAttentionBlock(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int,
+ num_heads: int,
+ mlp_dim: int = 2048,
+ activation: Type[nn.Module] = nn.ReLU,
+ attention_downsample_rate: int = 2,
+ skip_first_layer_pe: bool = False,
+ ) -> None:
+ """
+ A transformer block with four layers: (1) self-attention of sparse
+ inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+ block on sparse inputs, and (4) cross attention of dense inputs to sparse
+ inputs.
+
+ Arguments:
+ embedding_dim (int): the channel dimension of the embeddings
+ num_heads (int): the number of heads in the attention layers
+ mlp_dim (int): the hidden dimension of the mlp block
+ activation (nn.Module): the activation of the mlp block
+ skip_first_layer_pe (bool): skip the PE on the first layer
+ """
+ super().__init__()
+ self.self_attn = Attention(embedding_dim, num_heads)
+ self.norm1 = nn.LayerNorm(embedding_dim)
+
+ self.cross_attn_token_to_image = Attention(
+ embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+ )
+ self.norm2 = nn.LayerNorm(embedding_dim)
+
+ self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
+ self.norm3 = nn.LayerNorm(embedding_dim)
+
+ self.norm4 = nn.LayerNorm(embedding_dim)
+ self.cross_attn_image_to_token = Attention(
+ embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+ )
+
+ self.skip_first_layer_pe = skip_first_layer_pe
+
+ def forward(
+ self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+ ) -> Tuple[Tensor, Tensor]:
+ # Self attention block
+ if self.skip_first_layer_pe:
+ queries = self.self_attn(q=queries, k=queries, v=queries)
+ else:
+ q = queries + query_pe
+ attn_out = self.self_attn(q=q, k=q, v=queries)
+ queries = queries + attn_out
+ queries = self.norm1(queries)
+
+ # Cross attention block, tokens attending to image embedding
+ q = queries + query_pe
+ k = keys + key_pe
+ attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
+ queries = queries + attn_out
+ queries = self.norm2(queries)
+
+ # MLP block
+ mlp_out = self.mlp(queries)
+ queries = queries + mlp_out
+ queries = self.norm3(queries)
+
+ # Cross attention block, image embedding attending to tokens
+ q = queries + query_pe
+ k = keys + key_pe
+ attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
+ keys = keys + attn_out
+ keys = self.norm4(keys)
+
+ return queries, keys
+
+
+class Attention(nn.Module):
+ """
+ An attention layer that allows for downscaling the size of the embedding
+ after projection to queries, keys, and values.
+ """
+
+ def __init__(
+ self,
+ embedding_dim: int,
+ num_heads: int,
+ downsample_rate: int = 1,
+ ) -> None:
+ super().__init__()
+ self.embedding_dim = embedding_dim
+ self.internal_dim = embedding_dim // downsample_rate
+ self.num_heads = num_heads
+ assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."
+
+ self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+ self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
+ self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
+ self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+
+ def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+ b, n, c = x.shape
+ x = x.reshape(b, n, num_heads, c // num_heads)
+ return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head
+
+ def _recombine_heads(self, x: Tensor) -> Tensor:
+ b, n_heads, n_tokens, c_per_head = x.shape
+ x = x.transpose(1, 2)
+ return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C
+
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+ # Input projections
+ q = self.q_proj(q)
+ k = self.k_proj(k)
+ v = self.v_proj(v)
+
+ # Separate into heads
+ q = self._separate_heads(q, self.num_heads)
+ k = self._separate_heads(k, self.num_heads)
+ v = self._separate_heads(v, self.num_heads)
+
+ # Attention
+ _, _, _, c_per_head = q.shape
+ attn = q @ k.permute(0, 1, 3, 2) # B x N_heads x N_tokens x N_tokens
+ attn = attn / math.sqrt(c_per_head)
+ attn = torch.softmax(attn, dim=-1)
+
+ # Get output
+ out = attn @ v
+ out = self._recombine_heads(out)
+ out = self.out_proj(out)
+
+ return out
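+
+
+# --- Hedged usage sketch (illustrative comment, not part of the upstream file) ---
+# Attention here is plain multi-head attention over token sequences; with
+# downsample_rate=2 the internal width is halved before the output projection:
+#   attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
+#   q = k = v = torch.randn(1, 10, 256)        # B x N_tokens x C
+#   out = attn(q, k, v)                        # -> (1, 10, 256)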
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/predictor.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..3014d3db5112bd6b2ea408872fa7fbf0b6592902
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/predictor.py
@@ -0,0 +1,269 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from .modeling import Sam
+
+from typing import Optional, Tuple
+
+from .utils.transforms import ResizeLongestSide
+
+
+class SamPredictor:
+ def __init__(
+ self,
+ sam_model: Sam,
+ ) -> None:
+ """
+ Uses SAM to calculate the image embedding for an image, and then
+ allows repeated, efficient mask prediction given prompts.
+
+ Arguments:
+ sam_model (Sam): The model to use for mask prediction.
+ """
+ super().__init__()
+ self.model = sam_model
+ self.transform = ResizeLongestSide(sam_model.image_encoder.img_size)
+ self.reset_image()
+
+ def set_image(
+ self,
+ image: np.ndarray,
+ image_format: str = "RGB",
+ ) -> None:
+ """
+ Calculates the image embeddings for the provided image, allowing
+ masks to be predicted with the 'predict' method.
+
+ Arguments:
+ image (np.ndarray): The image for calculating masks. Expects an
+ image in HWC uint8 format, with pixel values in [0, 255].
+ image_format (str): The color format of the image, in ['RGB', 'BGR'].
+ """
+ assert image_format in [
+ "RGB",
+ "BGR",
+ ], f"image_format must be in ['RGB', 'BGR'], is {image_format}."
+ if image_format != self.model.image_format:
+ image = image[..., ::-1]
+
+ # Transform the image to the form expected by the model
+ input_image = self.transform.apply_image(image)
+ input_image_torch = torch.as_tensor(input_image, device=self.device)
+ input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :]
+
+ self.set_torch_image(input_image_torch, image.shape[:2])
+
+ @torch.no_grad()
+ def set_torch_image(
+ self,
+ transformed_image: torch.Tensor,
+ original_image_size: Tuple[int, ...],
+ ) -> None:
+ """
+ Calculates the image embeddings for the provided image, allowing
+ masks to be predicted with the 'predict' method. Expects the input
+ image to be already transformed to the format expected by the model.
+
+ Arguments:
+ transformed_image (torch.Tensor): The input image, with shape
+ 1x3xHxW, which has been transformed with ResizeLongestSide.
+ original_image_size (tuple(int, int)): The size of the image
+ before transformation, in (H, W) format.
+ """
+ assert (
+ len(transformed_image.shape) == 4
+ and transformed_image.shape[1] == 3
+ and max(*transformed_image.shape[2:]) == self.model.image_encoder.img_size
+ ), f"set_torch_image input must be BCHW with long side {self.model.image_encoder.img_size}."
+ self.reset_image()
+
+ self.original_size = original_image_size
+ self.input_size = tuple(transformed_image.shape[-2:])
+ input_image = self.model.preprocess(transformed_image)
+ self.features = self.model.image_encoder(input_image)
+ self.is_image_set = True
+
+ def predict(
+ self,
+ point_coords: Optional[np.ndarray] = None,
+ point_labels: Optional[np.ndarray] = None,
+ box: Optional[np.ndarray] = None,
+ mask_input: Optional[np.ndarray] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """
+ Predict masks for the given input prompts, using the currently set image.
+
+ Arguments:
+ point_coords (np.ndarray or None): A Nx2 array of point prompts to the
+ model. Each point is in (X,Y) in pixels.
+ point_labels (np.ndarray or None): A length N array of labels for the
+ point prompts. 1 indicates a foreground point and 0 indicates a
+ background point.
+ box (np.ndarray or None): A length 4 array given a box prompt to the
+ model, in XYXY format.
+ mask_input (np.ndarray): A low resolution mask input to the model, typically
+ coming from a previous prediction iteration. Has form 1xHxW, where
+ for SAM, H=W=256.
+ multimask_output (bool): If true, the model will return three masks.
+ For ambiguous input prompts (such as a single click), this will often
+ produce better masks than a single prediction. If only a single
+ mask is needed, the model's predicted quality score can be used
+ to select the best mask. For non-ambiguous prompts, such as multiple
+ input prompts, multimask_output=False can give better results.
+ return_logits (bool): If true, returns un-thresholded masks logits
+ instead of a binary mask.
+
+ Returns:
+ (np.ndarray): The output masks in CxHxW format, where C is the
+ number of masks, and (H, W) is the original image size.
+ (np.ndarray): An array of length C containing the model's
+ predictions for the quality of each mask.
+ (np.ndarray): An array of shape CxHxW, where C is the number
+ of masks and H=W=256. These low resolution logits can be passed to
+ a subsequent iteration as mask input.
+ """
+ if not self.is_image_set:
+ raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")
+
+ # Transform input prompts
+ coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None
+ if point_coords is not None:
+ assert (
+ point_labels is not None
+ ), "point_labels must be supplied if point_coords is supplied."
+ point_coords = self.transform.apply_coords(point_coords, self.original_size)
+ coords_torch = torch.as_tensor(point_coords, dtype=torch.float, device=self.device)
+ labels_torch = torch.as_tensor(point_labels, dtype=torch.int, device=self.device)
+ coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :]
+ if box is not None:
+ box = self.transform.apply_boxes(box, self.original_size)
+ box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device)
+ box_torch = box_torch[None, :]
+ if mask_input is not None:
+ mask_input_torch = torch.as_tensor(mask_input, dtype=torch.float, device=self.device)
+ mask_input_torch = mask_input_torch[None, :, :, :]
+
+ masks, iou_predictions, low_res_masks = self.predict_torch(
+ coords_torch,
+ labels_torch,
+ box_torch,
+ mask_input_torch,
+ multimask_output,
+ return_logits=return_logits,
+ )
+
+ masks_np = masks[0].detach().cpu().numpy()
+ iou_predictions_np = iou_predictions[0].detach().cpu().numpy()
+ low_res_masks_np = low_res_masks[0].detach().cpu().numpy()
+ return masks_np, iou_predictions_np, low_res_masks_np
+
+ @torch.no_grad()
+ def predict_torch(
+ self,
+ point_coords: Optional[torch.Tensor],
+ point_labels: Optional[torch.Tensor],
+ boxes: Optional[torch.Tensor] = None,
+ mask_input: Optional[torch.Tensor] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Predict masks for the given input prompts, using the currently set image.
+ Input prompts are batched torch tensors and are expected to already be
+ transformed to the input frame using ResizeLongestSide.
+
+ Arguments:
+ point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+ model. Each point is in (X,Y) in pixels.
+ point_labels (torch.Tensor or None): A BxN array of labels for the
+ point prompts. 1 indicates a foreground point and 0 indicates a
+ background point.
+ boxes (np.ndarray or None): A Bx4 array given a box prompt to the
+ model, in XYXY format.
+ mask_input (np.ndarray): A low resolution mask input to the model, typically
+ coming from a previous prediction iteration. Has form Bx1xHxW, where
+ for SAM, H=W=256. Masks returned by a previous iteration of the
+ predict method do not need further transformation.
+ multimask_output (bool): If true, the model will return three masks.
+ For ambiguous input prompts (such as a single click), this will often
+ produce better masks than a single prediction. If only a single
+ mask is needed, the model's predicted quality score can be used
+ to select the best mask. For non-ambiguous prompts, such as multiple
+ input prompts, multimask_output=False can give better results.
+ return_logits (bool): If true, returns un-thresholded masks logits
+ instead of a binary mask.
+
+ Returns:
+ (torch.Tensor): The output masks in BxCxHxW format, where C is the
+ number of masks, and (H, W) is the original image size.
+ (torch.Tensor): An array of shape BxC containing the model's
+ predictions for the quality of each mask.
+ (torch.Tensor): An array of shape BxCxHxW, where C is the number
+ of masks and H=W=256. These low res logits can be passed to
+ a subsequent iteration as mask input.
+ """
+ if not self.is_image_set:
+ raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")
+
+ if point_coords is not None:
+ points = (point_coords, point_labels)
+ else:
+ points = None
+
+ # Embed prompts
+ sparse_embeddings, dense_embeddings = self.model.prompt_encoder(
+ points=points,
+ boxes=boxes,
+ masks=mask_input,
+ )
+
+ # Predict masks
+ low_res_masks, iou_predictions = self.model.mask_decoder(
+ image_embeddings=self.features,
+ image_pe=self.model.prompt_encoder.get_dense_pe(),
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ )
+
+ # Upscale the masks to the original image resolution
+ masks = self.model.postprocess_masks(low_res_masks, self.input_size, self.original_size)
+
+ if not return_logits:
+ masks = masks > self.model.mask_threshold
+
+ return masks, iou_predictions, low_res_masks
+
+ def get_image_embedding(self) -> torch.Tensor:
+ """
+ Returns the image embeddings for the currently set image, with
+ shape 1xCxHxW, where C is the embedding dimension and (H,W) are
+ the embedding spatial dimension of SAM (typically C=256, H=W=64).
+ """
+ if not self.is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image(...) to generate an embedding."
+ )
+ assert self.features is not None, "Features must exist if an image has been set."
+ return self.features
+
+ @property
+ def device(self) -> torch.device:
+ return self.model.device
+
+ def reset_image(self) -> None:
+ """Resets the currently set image."""
+ self.is_image_set = False
+ self.features = None
+ self.orig_h = None
+ self.orig_w = None
+ self.input_h = None
+ self.input_w = None
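+
+
+# --- Hedged usage sketch (illustrative comment; assumes a `Sam` instance `sam` built elsewhere) ---
+#   predictor = SamPredictor(sam)
+#   predictor.set_image(image_rgb)                      # HxWx3 uint8, values in [0, 255]
+#   masks, scores, low_res = predictor.predict(
+#       point_coords=np.array([[320, 240]]),            # one (X, Y) click
+#       point_labels=np.array([1]),                     # 1 = foreground
+#       multimask_output=True,
+#   )                                                   # masks: 3xHxW bool, scores: (3,)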
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4547e070da2f3ddc5bf2f466cb2242e6135c7dc3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/amg.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/amg.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea84055a7dd3ad0d55096a3ea434080ea0151089
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/amg.py
@@ -0,0 +1,346 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+import math
+from copy import deepcopy
+from itertools import product
+from typing import Any, Dict, Generator, ItemsView, List, Tuple
+
+
+class MaskData:
+ """
+ A structure for storing masks and their related data in batched format.
+ Implements basic filtering and concatenation.
+ """
+
+ def __init__(self, **kwargs) -> None:
+ for v in kwargs.values():
+ assert isinstance(
+ v, (list, np.ndarray, torch.Tensor)
+ ), "MaskData only supports list, numpy arrays, and torch tensors."
+ self._stats = dict(**kwargs)
+
+ def __setitem__(self, key: str, item: Any) -> None:
+ assert isinstance(
+ item, (list, np.ndarray, torch.Tensor)
+ ), "MaskData only supports list, numpy arrays, and torch tensors."
+ self._stats[key] = item
+
+ def __delitem__(self, key: str) -> None:
+ del self._stats[key]
+
+ def __getitem__(self, key: str) -> Any:
+ return self._stats[key]
+
+ def items(self) -> ItemsView[str, Any]:
+ return self._stats.items()
+
+ def filter(self, keep: torch.Tensor) -> None:
+ for k, v in self._stats.items():
+ if v is None:
+ self._stats[k] = None
+ elif isinstance(v, torch.Tensor):
+ self._stats[k] = v[torch.as_tensor(keep, device=v.device)]
+ elif isinstance(v, np.ndarray):
+ self._stats[k] = v[keep.detach().cpu().numpy()]
+ elif isinstance(v, list) and keep.dtype == torch.bool:
+ self._stats[k] = [a for i, a in enumerate(v) if keep[i]]
+ elif isinstance(v, list):
+ self._stats[k] = [v[i] for i in keep]
+ else:
+ raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.")
+
+ def cat(self, new_stats: "MaskData") -> None:
+ for k, v in new_stats.items():
+ if k not in self._stats or self._stats[k] is None:
+ self._stats[k] = deepcopy(v)
+ elif isinstance(v, torch.Tensor):
+ self._stats[k] = torch.cat([self._stats[k], v], dim=0)
+ elif isinstance(v, np.ndarray):
+ self._stats[k] = np.concatenate([self._stats[k], v], axis=0)
+ elif isinstance(v, list):
+ self._stats[k] = self._stats[k] + deepcopy(v)
+ else:
+ raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.")
+
+ def to_numpy(self) -> None:
+ for k, v in self._stats.items():
+ if isinstance(v, torch.Tensor):
+ self._stats[k] = v.detach().cpu().numpy()
+
+
+def is_box_near_crop_edge(
+ boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
+) -> torch.Tensor:
+ """Filter masks at the edge of a crop, but not at the edge of the original image."""
+ crop_box_torch = torch.as_tensor(crop_box, dtype=torch.float, device=boxes.device)
+ orig_box_torch = torch.as_tensor(orig_box, dtype=torch.float, device=boxes.device)
+ boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
+ near_crop_edge = torch.isclose(boxes, crop_box_torch[None, :], atol=atol, rtol=0)
+ near_image_edge = torch.isclose(boxes, orig_box_torch[None, :], atol=atol, rtol=0)
+ near_crop_edge = torch.logical_and(near_crop_edge, ~near_image_edge)
+ return torch.any(near_crop_edge, dim=1)
+
+
+def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor:
+ box_xywh = deepcopy(box_xyxy)
+ box_xywh[2] = box_xywh[2] - box_xywh[0]
+ box_xywh[3] = box_xywh[3] - box_xywh[1]
+ return box_xywh
+
+
+def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
+ assert len(args) > 0 and all(
+ len(a) == len(args[0]) for a in args
+ ), "Batched iteration must have inputs of all the same size."
+ n_batches = len(args[0]) // batch_size + int(len(args[0]) % batch_size != 0)
+ for b in range(n_batches):
+ yield [arg[b * batch_size : (b + 1) * batch_size] for arg in args]
+
+
+def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]:
+ """
+ Encodes masks to an uncompressed RLE, in the format expected by
+ pycoco tools.
+ """
+ # Put in fortran order and flatten h,w
+ b, h, w = tensor.shape
+ tensor = tensor.permute(0, 2, 1).flatten(1)
+
+ # Compute change indices
+ diff = tensor[:, 1:] ^ tensor[:, :-1]
+ change_indices = diff.nonzero()
+
+ # Encode run length
+ out = []
+ for i in range(b):
+ cur_idxs = change_indices[change_indices[:, 0] == i, 1]
+ cur_idxs = torch.cat(
+ [
+ torch.tensor([0], dtype=cur_idxs.dtype, device=cur_idxs.device),
+ cur_idxs + 1,
+ torch.tensor([h * w], dtype=cur_idxs.dtype, device=cur_idxs.device),
+ ]
+ )
+ btw_idxs = cur_idxs[1:] - cur_idxs[:-1]
+ counts = [] if tensor[i, 0] == 0 else [0]
+ counts.extend(btw_idxs.detach().cpu().tolist())
+ out.append({"size": [h, w], "counts": counts})
+ return out
+
+
+def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray:
+ """Compute a binary mask from an uncompressed RLE."""
+ h, w = rle["size"]
+ mask = np.empty(h * w, dtype=bool)
+ idx = 0
+ parity = False
+ for count in rle["counts"]:
+ mask[idx : idx + count] = parity
+ idx += count
+ parity ^= True
+ mask = mask.reshape(w, h)
+ return mask.transpose() # Put in C order
+
+
+def area_from_rle(rle: Dict[str, Any]) -> int:
+ return sum(rle["counts"][1::2])
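+
+# --- Hedged worked example (illustrative comment, not part of the upstream file) ---
+# For a 2x3 boolean mask [[0, 1, 1], [0, 0, 1]] (b=1, h=2, w=3), the column-major RLE is
+#   mask_to_rle_pytorch(m) -> [{"size": [2, 3], "counts": [2, 1, 1, 2]}]
+# i.e. 2 background pixels, 1 foreground, 1 background, 2 foreground when the mask is
+# scanned column by column; area_from_rle() sums the odd-indexed counts (here 1 + 2 = 3)
+# and rle_to_mask() reproduces the original boolean mask.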
+
+
+def calculate_stability_score(
+ masks: torch.Tensor, mask_threshold: float, threshold_offset: float
+) -> torch.Tensor:
+ """
+ Computes the stability score for a batch of masks. The stability
+ score is the IoU between the binary masks obtained by thresholding
+ the predicted mask logits at high and low values.
+ """
+ # One mask is always contained inside the other.
+ # Save memory by preventing unnecessary cast to torch.int64
+ intersections = (
+ (masks > (mask_threshold + threshold_offset))
+ .sum(-1, dtype=torch.int16)
+ .sum(-1, dtype=torch.int32)
+ )
+ unions = (
+ (masks > (mask_threshold - threshold_offset))
+ .sum(-1, dtype=torch.int16)
+ .sum(-1, dtype=torch.int32)
+ )
+ return intersections / unions
+
+
+def build_point_grid(n_per_side: int) -> np.ndarray:
+ """Generates a 2D grid of points evenly spaced in [0,1]x[0,1]."""
+ offset = 1 / (2 * n_per_side)
+ points_one_side = np.linspace(offset, 1 - offset, n_per_side)
+ points_x = np.tile(points_one_side[None, :], (n_per_side, 1))
+ points_y = np.tile(points_one_side[:, None], (1, n_per_side))
+ points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2)
+ return points
+
+
+def build_all_layer_point_grids(
+ n_per_side: int, n_layers: int, scale_per_layer: int
+) -> List[np.ndarray]:
+ """Generates point grids for all crop layers."""
+ points_by_layer = []
+ for i in range(n_layers + 1):
+ n_points = int(n_per_side / (scale_per_layer**i))
+ points_by_layer.append(build_point_grid(n_points))
+ return points_by_layer
+
+
+def generate_crop_boxes(
+ im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
+) -> Tuple[List[List[int]], List[int]]:
+ """
+ Generates a list of crop boxes of different sizes. Each layer
+ has (2**i)**2 boxes for the ith layer.
+ """
+ crop_boxes, layer_idxs = [], []
+ im_h, im_w = im_size
+ short_side = min(im_h, im_w)
+
+ # Original image
+ crop_boxes.append([0, 0, im_w, im_h])
+ layer_idxs.append(0)
+
+ def crop_len(orig_len, n_crops, overlap):
+ return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops))
+
+ for i_layer in range(n_layers):
+ n_crops_per_side = 2 ** (i_layer + 1)
+ overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side))
+
+ crop_w = crop_len(im_w, n_crops_per_side, overlap)
+ crop_h = crop_len(im_h, n_crops_per_side, overlap)
+
+ crop_box_x0 = [int((crop_w - overlap) * i) for i in range(n_crops_per_side)]
+ crop_box_y0 = [int((crop_h - overlap) * i) for i in range(n_crops_per_side)]
+
+ # Crops in XYWH format
+ for x0, y0 in product(crop_box_x0, crop_box_y0):
+ box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)]
+ crop_boxes.append(box)
+ layer_idxs.append(i_layer + 1)
+
+ return crop_boxes, layer_idxs
+
+
+def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
+ # Check if boxes has a channel dimension
+ if len(boxes.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return boxes + offset
+
+
+def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ x0, y0, _, _ = crop_box
+ offset = torch.tensor([[x0, y0]], device=points.device)
+ # Check if points has a channel dimension
+ if len(points.shape) == 3:
+ offset = offset.unsqueeze(1)
+ return points + offset
+
+
+def uncrop_masks(
+ masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int
+) -> torch.Tensor:
+ x0, y0, x1, y1 = crop_box
+ if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
+ return masks
+ # Coordinate transform masks
+ pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0)
+ pad = (x0, pad_x - x0, y0, pad_y - y0)
+ return torch.nn.functional.pad(masks, pad, value=0)
+
+
+def remove_small_regions(
+ mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+ """
+ Removes small disconnected regions and holes in a mask. Returns the
+ mask and an indicator of if the mask has been modified.
+ """
+ import cv2 # type: ignore
+
+ assert mode in ["holes", "islands"]
+ correct_holes = mode == "holes"
+ working_mask = (correct_holes ^ mask).astype(np.uint8)
+ n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+ sizes = stats[:, -1][1:] # Row 0 is background label
+ small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+ if len(small_regions) == 0:
+ return mask, False
+ fill_labels = [0] + small_regions
+ if not correct_holes:
+ fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+ # If every region is below threshold, keep largest
+ if len(fill_labels) == 0:
+ fill_labels = [int(np.argmax(sizes)) + 1]
+ mask = np.isin(regions, fill_labels)
+ return mask, True
+
+
+def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]:
+ from custom_pycocotools import mask as mask_utils # type: ignore
+
+ h, w = uncompressed_rle["size"]
+ rle = mask_utils.frPyObjects(uncompressed_rle, h, w)
+ rle["counts"] = rle["counts"].decode("utf-8") # Necessary to serialize with json
+ return rle
+
+
+def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
+ """
+ Calculates boxes in XYXY format around masks. Return [0,0,0,0] for
+ an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4.
+ """
+ # torch.max below raises an error on empty inputs, just skip in this case
+ if torch.numel(masks) == 0:
+ return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
+
+ # Normalize shape to CxHxW
+ shape = masks.shape
+ h, w = shape[-2:]
+ if len(shape) > 2:
+ masks = masks.flatten(0, -3)
+ else:
+ masks = masks.unsqueeze(0)
+
+ # Get top and bottom edges
+ in_height, _ = torch.max(masks, dim=-1)
+ in_height_coords = in_height * torch.arange(h, device=in_height.device)[None, :]
+ bottom_edges, _ = torch.max(in_height_coords, dim=-1)
+ in_height_coords = in_height_coords + h * (~in_height)
+ top_edges, _ = torch.min(in_height_coords, dim=-1)
+
+ # Get left and right edges
+ in_width, _ = torch.max(masks, dim=-2)
+ in_width_coords = in_width * torch.arange(w, device=in_width.device)[None, :]
+ right_edges, _ = torch.max(in_width_coords, dim=-1)
+ in_width_coords = in_width_coords + w * (~in_width)
+ left_edges, _ = torch.min(in_width_coords, dim=-1)
+
+ # If the mask is empty the right edge will be to the left of the left edge.
+ # Replace these boxes with [0, 0, 0, 0]
+ empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges)
+ out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
+ out = out * (~empty_filter).unsqueeze(-1)
+
+ # Return to original shape
+ if len(shape) > 2:
+ out = out.reshape(*shape[:-2], 4)
+ else:
+ out = out[0]
+
+ return out
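+
+
+# --- Hedged worked example (illustrative comment, not part of the upstream file) ---
+# For a 4x5 mask with True pixels at rows 1-2, columns 2-3:
+#   m = torch.zeros(4, 5, dtype=torch.bool); m[1:3, 2:4] = True
+#   batched_mask_to_box(m[None]) -> tensor([[2, 1, 3, 2]])   # XYXY, inclusive indices
+# and an all-False mask maps to [0, 0, 0, 0].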
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/onnx.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9a9d9e2f1c5990f6b279ef7d1bb847063c68e5e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/onnx.py
@@ -0,0 +1,144 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from typing import Tuple
+
+from ..modeling import Sam
+from .amg import calculate_stability_score
+
+
+class SamOnnxModel(nn.Module):
+ """
+ This model should not be called directly, but is used in ONNX export.
+ It combines the prompt encoder, mask decoder, and mask postprocessing of Sam,
+ with some functions modified to enable model tracing. Also supports extra
+ options controlling what information is returned. See the ONNX export
+ script for details.
+ """
+
+ def __init__(
+ self,
+ model: Sam,
+ return_single_mask: bool,
+ use_stability_score: bool = False,
+ return_extra_metrics: bool = False,
+ ) -> None:
+ super().__init__()
+ self.mask_decoder = model.mask_decoder
+ self.model = model
+ self.img_size = model.image_encoder.img_size
+ self.return_single_mask = return_single_mask
+ self.use_stability_score = use_stability_score
+ self.stability_score_offset = 1.0
+ self.return_extra_metrics = return_extra_metrics
+
+ @staticmethod
+ def resize_longest_image_size(
+ input_image_size: torch.Tensor, longest_side: int
+ ) -> torch.Tensor:
+ input_image_size = input_image_size.to(torch.float32)
+ scale = longest_side / torch.max(input_image_size)
+ transformed_size = scale * input_image_size
+ transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64)
+ return transformed_size
+
+ def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor:
+ point_coords = point_coords + 0.5
+ point_coords = point_coords / self.img_size
+ point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords)
+ point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding)
+
+ point_embedding = point_embedding * (point_labels != -1)
+ point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * (
+ point_labels == -1
+ )
+
+ for i in range(self.model.prompt_encoder.num_point_embeddings):
+ point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[
+ i
+ ].weight * (point_labels == i)
+
+ return point_embedding
+
+ def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor:
+ mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask)
+ mask_embedding = mask_embedding + (
+ 1 - has_mask_input
+ ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1)
+ return mask_embedding
+
+ def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor:
+ masks = F.interpolate(
+ masks,
+ size=(self.img_size, self.img_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size).to(torch.int64)
+ masks = masks[..., : prepadded_size[0], : prepadded_size[1]] # type: ignore
+
+ orig_im_size = orig_im_size.to(torch.int64)
+ h, w = orig_im_size[0], orig_im_size[1]
+ masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False)
+ return masks
+
+ def select_masks(
+ self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ # Determine if we should return the multiclick mask or not from the number of points.
+ # The reweighting is used to avoid control flow.
+ score_reweight = torch.tensor(
+ [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)]
+ ).to(iou_preds.device)
+ score = iou_preds + (num_points - 2.5) * score_reweight
+ best_idx = torch.argmax(score, dim=1)
+ masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1)
+ iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1)
+
+ return masks, iou_preds
+
+ @torch.no_grad()
+ def forward(
+ self,
+ image_embeddings: torch.Tensor,
+ point_coords: torch.Tensor,
+ point_labels: torch.Tensor,
+ mask_input: torch.Tensor,
+ has_mask_input: torch.Tensor,
+ orig_im_size: torch.Tensor,
+ ):
+ sparse_embedding = self._embed_points(point_coords, point_labels)
+ dense_embedding = self._embed_masks(mask_input, has_mask_input)
+
+ masks, scores = self.model.mask_decoder.predict_masks(
+ image_embeddings=image_embeddings,
+ image_pe=self.model.prompt_encoder.get_dense_pe(),
+ sparse_prompt_embeddings=sparse_embedding,
+ dense_prompt_embeddings=dense_embedding,
+ )
+
+ if self.use_stability_score:
+ scores = calculate_stability_score(
+ masks, self.model.mask_threshold, self.stability_score_offset
+ )
+
+ if self.return_single_mask:
+ masks, scores = self.select_masks(masks, scores, point_coords.shape[1])
+
+ upscaled_masks = self.mask_postprocessing(masks, orig_im_size)
+
+ if self.return_extra_metrics:
+ stability_scores = calculate_stability_score(
+ upscaled_masks, self.model.mask_threshold, self.stability_score_offset
+ )
+ areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1)
+ return upscaled_masks, scores, stability_scores, areas, masks
+
+ return upscaled_masks, scores, masks
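+
+
+# --- Hedged export sketch (illustrative comment; the real export script lives upstream) ---
+# Assuming `sam` is a Sam instance with the standard 1x256x64x64 image embedding:
+#   onnx_model = SamOnnxModel(sam, return_single_mask=True)
+#   dummy = {
+#       "image_embeddings": torch.randn(1, 256, 64, 64),
+#       "point_coords": torch.randint(0, 1024, (1, 5, 2), dtype=torch.float),
+#       "point_labels": torch.randint(0, 4, (1, 5), dtype=torch.float),
+#       "mask_input": torch.randn(1, 1, 256, 256),
+#       "has_mask_input": torch.tensor([1.0]),
+#       "orig_im_size": torch.tensor([1080, 1920], dtype=torch.float),
+#   }
+#   torch.onnx.export(onnx_model, tuple(dummy.values()), "sam_decoder.onnx",
+#                     input_names=list(dummy.keys()),
+#                     output_names=["masks", "iou_predictions", "low_res_masks"],
+#                     dynamic_axes={"point_coords": {1: "num_points"},
+#                                   "point_labels": {1: "num_points"}})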
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/transforms.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..f07693952bbffcd23c5226255d1f649476ca7ce6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/sam/utils/transforms.py
@@ -0,0 +1,102 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torchvision.transforms.functional import resize, to_pil_image # type: ignore
+
+from copy import deepcopy
+from typing import Tuple
+
+
+class ResizeLongestSide:
+ """
+ Resizes images to the longest side 'target_length', as well as provides
+ methods for resizing coordinates and boxes. Provides methods for
+ transforming both numpy arrays and batched torch tensors.
+ """
+
+ def __init__(self, target_length: int) -> None:
+ self.target_length = target_length
+
+ def apply_image(self, image: np.ndarray) -> np.ndarray:
+ """
+ Expects a numpy array with shape HxWxC in uint8 format.
+ """
+ target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
+ return np.array(resize(to_pil_image(image), target_size))
+
+ def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+ """
+ Expects a numpy array of length 2 in the final dimension. Requires the
+ original image size in (H, W) format.
+ """
+ old_h, old_w = original_size
+ new_h, new_w = self.get_preprocess_shape(
+ original_size[0], original_size[1], self.target_length
+ )
+ coords = deepcopy(coords).astype(float)
+ coords[..., 0] = coords[..., 0] * (new_w / old_w)
+ coords[..., 1] = coords[..., 1] * (new_h / old_h)
+ return coords
+
+ def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+ """
+ Expects a numpy array shape Bx4. Requires the original image size
+ in (H, W) format.
+ """
+ boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
+ return boxes.reshape(-1, 4)
+
+ def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
+ """
+ Expects batched images with shape BxCxHxW and float format. This
+ transformation may not exactly match apply_image. apply_image is
+ the transformation expected by the model.
+ """
+ # Expects an image in BCHW format. May not exactly match apply_image.
+ target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length)
+ return F.interpolate(
+ image, target_size, mode="bilinear", align_corners=False, antialias=True
+ )
+
+ def apply_coords_torch(
+ self, coords: torch.Tensor, original_size: Tuple[int, ...]
+ ) -> torch.Tensor:
+ """
+ Expects a torch tensor with length 2 in the last dimension. Requires the
+ original image size in (H, W) format.
+ """
+ old_h, old_w = original_size
+ new_h, new_w = self.get_preprocess_shape(
+ original_size[0], original_size[1], self.target_length
+ )
+ coords = deepcopy(coords).to(torch.float)
+ coords[..., 0] = coords[..., 0] * (new_w / old_w)
+ coords[..., 1] = coords[..., 1] * (new_h / old_h)
+ return coords
+
+ def apply_boxes_torch(
+ self, boxes: torch.Tensor, original_size: Tuple[int, ...]
+ ) -> torch.Tensor:
+ """
+ Expects a torch tensor with shape Bx4. Requires the original image
+ size in (H, W) format.
+ """
+ boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
+ return boxes.reshape(-1, 4)
+
+ @staticmethod
+ def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
+ """
+ Compute the output size given input size and target long side length.
+ """
+ scale = long_side_length * 1.0 / max(oldh, oldw)
+ newh, neww = oldh * scale, oldw * scale
+ neww = int(neww + 0.5)
+ newh = int(newh + 0.5)
+ return (newh, neww)
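+
+
+# --- Hedged worked example (illustrative comment, not part of the upstream file) ---
+#   ResizeLongestSide.get_preprocess_shape(600, 800, 1024) -> (768, 1024)
+#   i.e. scale = 1024 / 800 = 1.28, so 600 -> 768 and 800 -> 1024.
+# apply_coords() rescales prompt coordinates with the same per-axis factors.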
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/scribble/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/scribble/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5aff80df8eda472c68f29622392c8cb3d993fcf
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/scribble/__init__.py
@@ -0,0 +1,41 @@
+import warnings
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate
+
+# Not to be confused with the "scribble" output from HED; that is "fake scribble", which is more accurate and less picky than this.
+class ScribbleDetector:
+ def __call__(self, input_image=None, detect_resolution=512, output_type=None, upscale_method="INTER_AREA", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ detected_map = np.zeros_like(input_image, dtype=np.uint8)
+ detected_map[np.min(input_image, axis=2) < 127] = 255
+ detected_map = 255 - detected_map
+
+ detected_map = remove_pad(detected_map)
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
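+# --- Hedged usage sketch (illustrative comment; assumes an HxWx3 uint8 RGB array `img`) ---
+#   scribble = ScribbleDetector()(img, detect_resolution=512, output_type="pil")
+# Dark strokes (min channel < 127) are rendered as black lines on a white background.
+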
+class ScribbleXDog_Detector:
+ def __call__(self, input_image=None, detect_resolution=512, thr_a=32, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ g1 = cv2.GaussianBlur(input_image.astype(np.float32), (0, 0), 0.5)
+ g2 = cv2.GaussianBlur(input_image.astype(np.float32), (0, 0), 5.0)
+ dog = (255 - np.min(g2 - g1, axis=2)).clip(0, 255).astype(np.uint8)
+ result = np.zeros_like(input_image, dtype=np.uint8)
+ result[2 * (255 - dog) > thr_a] = 255
+ #result = 255 - result
+
+ detected_map = HWC3(remove_pad(result))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/shuffle/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/shuffle/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..db9e6fb85c08392c6b6816bd3eea91733841321f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/shuffle/__init__.py
@@ -0,0 +1,87 @@
+import warnings
+
+import cv2
+import numpy as np
+from PIL import Image
+import random
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, img2mask, make_noise_disk, resize_image_with_pad
+
+
+class ContentShuffleDetector:
+ def __call__(self, input_image, h=None, w=None, f=None, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", seed=-1, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ H, W, C = input_image.shape
+ if h is None:
+ h = H
+ if w is None:
+ w = W
+ if f is None:
+ f = 256
+ rng = np.random.default_rng(seed) if seed else None
+ x = make_noise_disk(h, w, 1, f, rng=rng) * float(W - 1)
+ y = make_noise_disk(h, w, 1, f, rng=rng) * float(H - 1)
+ flow = np.concatenate([x, y], axis=2).astype(np.float32)
+ detected_map = cv2.remap(input_image, flow, None, cv2.INTER_LINEAR)
+ detected_map = remove_pad(detected_map)
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
+
+class ColorShuffleDetector:
+ def __call__(self, img):
+ H, W, C = img.shape
+ F = np.random.randint(64, 384)
+ A = make_noise_disk(H, W, 3, F)
+ B = make_noise_disk(H, W, 3, F)
+ C = (A + B) / 2.0
+ A = (C + (A - C) * 3.0).clip(0, 1)
+ B = (C + (B - C) * 3.0).clip(0, 1)
+ L = img.astype(np.float32) / 255.0
+ Y = A * L + B * (1 - L)
+ Y -= np.min(Y, axis=(0, 1), keepdims=True)
+ Y /= np.maximum(np.max(Y, axis=(0, 1), keepdims=True), 1e-5)
+ Y *= 255.0
+ return Y.clip(0, 255).astype(np.uint8)
+
+
+class GrayDetector:
+ def __call__(self, img):
+ eps = 1e-5
+ X = img.astype(np.float32)
+ r, g, b = X[:, :, 0], X[:, :, 1], X[:, :, 2]
+ kr, kg, kb = [random.random() + eps for _ in range(3)]
+ ks = kr + kg + kb
+ kr /= ks
+ kg /= ks
+ kb /= ks
+ Y = r * kr + g * kg + b * kb
+ Y = np.stack([Y] * 3, axis=2)
+ return Y.clip(0, 255).astype(np.uint8)
+
+
+class DownSampleDetector:
+ def __call__(self, img, level=3, k=16.0):
+ h = img.astype(np.float32)
+ for _ in range(level):
+ h += np.random.normal(loc=0.0, scale=k, size=h.shape)
+ h = cv2.pyrDown(h)
+ for _ in range(level):
+ h = cv2.pyrUp(h)
+ h += np.random.normal(loc=0.0, scale=k, size=h.shape)
+ return h.clip(0, 255).astype(np.uint8)
+
+
+class Image2MaskShuffleDetector:
+ def __init__(self, resolution=(640, 512)):
+ self.H, self.W = resolution
+
+ def __call__(self, img):
+ m = img2mask(img, self.H, self.W)
+ m *= 255.0
+ return m.clip(0, 255).astype(np.uint8)
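+
+
+# --- Hedged usage sketch (illustrative comment; assumes an HxWx3 uint8 RGB array `img`) ---
+#   shuffled = ContentShuffleDetector()(img, detect_resolution=512, seed=42)
+# A non-negative integer seed is expected here, since it is forwarded to
+# np.random.default_rng; the warped "content shuffle" image is returned as PIL by default.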
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Fmish.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Fmish.py
new file mode 100644
index 0000000000000000000000000000000000000000..e015d2952b208da35d9a053ef115b92e7c08528b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Fmish.py
@@ -0,0 +1,17 @@
+"""
+Script provides functional interface for Mish activation function.
+"""
+
+# import pytorch
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def mish(input):
+ """
+ Applies the mish function element-wise:
+ mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+ See additional documentation for mish class.
+ """
+ return input * torch.tanh(F.softplus(input))
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Fsmish.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Fsmish.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1f5ff99b53451fadce54b88d2ba46952ba3bb7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Fsmish.py
@@ -0,0 +1,20 @@
+"""
+Script based on:
+Wang, Xueliang, Honge Ren, and Achuan Wang.
+ "Smish: A Novel Activation Function for Deep Learning Methods.
+ " Electronics 11.4 (2022): 540.
+"""
+
+# import pytorch
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def smish(input):
+ """
+ Applies the smish function element-wise:
+ smish(x) = x * tanh(ln(1 + sigmoid(x)))
+ See additional documentation for the Smish class.
+ """
+ return input * torch.tanh(torch.log(1 + torch.sigmoid(input)))
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/LICENSE.txt b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4046cc58457b8c5b4ac7d4b4dadf18630bf1c9b4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Xavier Soria Poma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Xmish.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Xmish.py
new file mode 100644
index 0000000000000000000000000000000000000000..f783519bbb58a65049a98a5ccdecad2b65379b0d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Xmish.py
@@ -0,0 +1,43 @@
+"""
+Applies the mish function element-wise:
+mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+"""
+
+# import pytorch
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+# import activation functions
+from .Fmish import mish
+
+
+class Mish(nn.Module):
+ """
+ Applies the mish function element-wise:
+ mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+ Shape:
+ - Input: (N, *) where * means, any number of additional
+ dimensions
+ - Output: (N, *), same shape as the input
+ Examples:
+ >>> m = Mish()
+ >>> input = torch.randn(2)
+ >>> output = m(input)
+ Reference: https://pytorch.org/docs/stable/generated/torch.nn.Mish.html
+ """
+
+ def __init__(self):
+ """
+ Init method.
+ """
+ super().__init__()
+
+ def forward(self, input):
+ """
+ Forward pass of the function.
+ """
+ if torch.__version__ >= "1.9":
+ return F.mish(input)
+ else:
+ return mish(input)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Xsmish.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Xsmish.py
new file mode 100644
index 0000000000000000000000000000000000000000..760861f034923356dbce20687fee5cfbde6830d3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/Xsmish.py
@@ -0,0 +1,43 @@
+"""
+Script based on:
+Wang, Xueliang, Honge Ren, and Achuan Wang.
+ "Smish: A Novel Activation Function for Deep Learning Methods.
+ " Electronics 11.4 (2022): 540.
+smish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + sigmoid(x)))
+"""
+
+# import pytorch
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+# import activation functions
+from .Fsmish import smish
+
+
+class Smish(nn.Module):
+ """
+ Applies the smish function element-wise:
+ smish(x) = x * tanh(ln(1 + sigmoid(x)))
+ Shape:
+ - Input: (N, *) where * means, any number of additional
+ dimensions
+ - Output: (N, *), same shape as the input
+ Examples:
+ >>> m = Smish()
+ >>> input = torch.randn(2)
+ >>> output = m(input)
+ Reference (for the related Mish activation): https://pytorch.org/docs/stable/generated/torch.nn.Mish.html
+ """
+
+ def __init__(self):
+ """
+ Init method.
+ """
+ super().__init__()
+
+ def forward(self, input):
+ """
+ Forward pass of the function.
+ """
+ return smish(input)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f83ca70195afffafdab963bfee6c377670fc73
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/__init__.py
@@ -0,0 +1,58 @@
+"""
+TEED (Tiny but Efficient Edge Detection) detector wrapper for custom_controlnet_aux.
+"""
+from __future__ import print_function
+
+import os
+import cv2
+import numpy as np
+
+import torch
+
+from .ted import TED # TEED architecture
+from einops import rearrange
+from custom_controlnet_aux.util import safe_step, custom_hf_download, BDS_MODEL_NAME, common_input_validate, resize_image_with_pad, HWC3
+from PIL import Image
+
+
+class TEDDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=BDS_MODEL_NAME, filename="7_model.pth", subfolder="Annotators"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder=subfolder)
+ model = TED()
+ model.load_state_dict(torch.load(model_path, map_location='cpu'))
+ model.eval()
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+
+ def __call__(self, input_image, detect_resolution=512, safe_steps=2, upscale_method="INTER_CUBIC", output_type="pil", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ H, W, _ = input_image.shape
+ with torch.no_grad():
+ image_teed = torch.from_numpy(input_image.copy()).float().to(self.device)
+ image_teed = rearrange(image_teed, 'h w c -> 1 c h w')
+ edges = self.model(image_teed)
+ edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
+ edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
+ edges = np.stack(edges, axis=2)
+ edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
+ if safe_steps != 0:
+ edge = safe_step(edge, safe_steps)
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = remove_pad(HWC3(edge))
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map[..., :3])
+
+ return detected_map
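+
+
+# --- Hedged usage sketch (illustrative comment; weights are fetched via custom_hf_download) ---
+#   detector = TEDDetector.from_pretrained().to("cuda")   # or "cpu"
+#   edge_map = detector(img, detect_resolution=512, safe_steps=2)   # PIL image of soft edges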
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/ted.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/ted.py
new file mode 100644
index 0000000000000000000000000000000000000000..be799ed5ceb6ab2ab55ac0943afed610e13a5cf3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/teed/ted.py
@@ -0,0 +1,296 @@
+# TEED: Tiny and Efficient Edge Detector, derived from LDC-B3
+# with a slight modification.
+# Parameter counts: LDC 155,665; TEED ~58K.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .Fsmish import smish as Fsmish
+from .Xsmish import Smish
+
+
+def weight_init(m):
+ if isinstance(m, (nn.Conv2d,)):
+ torch.nn.init.xavier_normal_(m.weight, gain=1.0)
+
+ if m.bias is not None:
+ torch.nn.init.zeros_(m.bias)
+
+ # for fusion layer
+ if isinstance(m, (nn.ConvTranspose2d,)):
+ torch.nn.init.xavier_normal_(m.weight, gain=1.0)
+ if m.bias is not None:
+ torch.nn.init.zeros_(m.bias)
+
+class CoFusion(nn.Module):
+ # from LDC
+
+ def __init__(self, in_ch, out_ch):
+ super(CoFusion, self).__init__()
+ self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=3,
+ stride=1, padding=1) # before 64
+ self.conv3= nn.Conv2d(32, out_ch, kernel_size=3,
+ stride=1, padding=1)# before 64 instead of 32
+ self.relu = nn.ReLU()
+ self.norm_layer1 = nn.GroupNorm(4, 32) # before 64
+
+ def forward(self, x):
+ # fusecat = torch.cat(x, dim=1)
+ attn = self.relu(self.norm_layer1(self.conv1(x)))
+ attn = F.softmax(self.conv3(attn), dim=1)
+ return ((x * attn).sum(1)).unsqueeze(1)
+
+
+class CoFusion2(nn.Module):
+ # TEDv14-3
+ def __init__(self, in_ch, out_ch):
+ super(CoFusion2, self).__init__()
+ self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=3,
+ stride=1, padding=1) # before 64
+ # self.conv2 = nn.Conv2d(32, 32, kernel_size=3,
+ # stride=1, padding=1)# before 64
+ self.conv3 = nn.Conv2d(32, out_ch, kernel_size=3,
+ stride=1, padding=1)# before 64 instead of 32
+ self.smish= Smish()#nn.ReLU(inplace=True)
+
+
+ def forward(self, x):
+ # fusecat = torch.cat(x, dim=1)
+ attn = self.conv1(self.smish(x))
+ attn = self.conv3(self.smish(attn)) # before , )dim=1)
+
+ # return ((fusecat * attn).sum(1)).unsqueeze(1)
+ return ((x * attn).sum(1)).unsqueeze(1)
+
+class DoubleFusion(nn.Module):
+ # TED fusion before the final edge map prediction
+ def __init__(self, in_ch, out_ch):
+ super(DoubleFusion, self).__init__()
+ self.DWconv1 = nn.Conv2d(in_ch, in_ch*8, kernel_size=3,
+ stride=1, padding=1, groups=in_ch) # before 64
+ self.PSconv1 = nn.PixelShuffle(1)
+
+ self.DWconv2 = nn.Conv2d(24, 24*1, kernel_size=3,
+ stride=1, padding=1,groups=24)# before 64 instead of 32
+
+ self.AF= Smish()#XAF() #nn.Tanh()# XAF() # # Smish()#
+
+
+ def forward(self, x):
+ # fusecat = torch.cat(x, dim=1)
+ attn = self.PSconv1(self.DWconv1(self.AF(x))) # #TEED best res TEDv14 [8, 32, 352, 352]
+
+ attn2 = self.PSconv1(self.DWconv2(self.AF(attn))) # #TEED best res TEDv14[8, 3, 352, 352]
+
+ return Fsmish(((attn2 +attn).sum(1)).unsqueeze(1)) #TED best res
+
+class _DenseLayer(nn.Sequential):
+ def __init__(self, input_features, out_features):
+ super(_DenseLayer, self).__init__()
+
+ self.add_module('conv1', nn.Conv2d(input_features, out_features,
+ kernel_size=3, stride=1, padding=2, bias=True)),
+ self.add_module('smish1', Smish()),
+ self.add_module('conv2', nn.Conv2d(out_features, out_features,
+ kernel_size=3, stride=1, bias=True))
+ def forward(self, x):
+ x1, x2 = x
+
+ new_features = super(_DenseLayer, self).forward(Fsmish(x1)) # F.relu()
+
+ return 0.5 * (new_features + x2), x2
+
+
+class _DenseBlock(nn.Sequential):
+ def __init__(self, num_layers, input_features, out_features):
+ super(_DenseBlock, self).__init__()
+ for i in range(num_layers):
+ layer = _DenseLayer(input_features, out_features)
+ self.add_module('denselayer%d' % (i + 1), layer)
+ input_features = out_features
+
+
+class UpConvBlock(nn.Module):
+ def __init__(self, in_features, up_scale):
+ super(UpConvBlock, self).__init__()
+ self.up_factor = 2
+ self.constant_features = 16
+
+ layers = self.make_deconv_layers(in_features, up_scale)
+ assert layers is not None, layers
+ self.features = nn.Sequential(*layers)
+
+ def make_deconv_layers(self, in_features, up_scale):
+ layers = []
+ all_pads=[0,0,1,3,7]
+ for i in range(up_scale):
+ kernel_size = 2 ** up_scale
+ pad = all_pads[up_scale] # kernel_size-1
+ out_features = self.compute_out_features(i, up_scale)
+ layers.append(nn.Conv2d(in_features, out_features, 1))
+ layers.append(Smish())
+ layers.append(nn.ConvTranspose2d(
+ out_features, out_features, kernel_size, stride=2, padding=pad))
+ in_features = out_features
+ return layers
+
+ def compute_out_features(self, idx, up_scale):
+ return 1 if idx == up_scale - 1 else self.constant_features
+
+ def forward(self, x):
+ return self.features(x)
+
+
+class SingleConvBlock(nn.Module):
+ def __init__(self, in_features, out_features, stride, use_ac=False):
+ super(SingleConvBlock, self).__init__()
+ # self.use_bn = use_bs
+ self.use_ac=use_ac
+ self.conv = nn.Conv2d(in_features, out_features, 1, stride=stride,
+ bias=True)
+ if self.use_ac:
+ self.smish = Smish()
+
+ def forward(self, x):
+ x = self.conv(x)
+ if self.use_ac:
+ return self.smish(x)
+ else:
+ return x
+
+class DoubleConvBlock(nn.Module):
+ def __init__(self, in_features, mid_features,
+ out_features=None,
+ stride=1,
+ use_act=True):
+ super(DoubleConvBlock, self).__init__()
+
+ self.use_act = use_act
+ if out_features is None:
+ out_features = mid_features
+ self.conv1 = nn.Conv2d(in_features, mid_features,
+ 3, padding=1, stride=stride)
+ self.conv2 = nn.Conv2d(mid_features, out_features, 3, padding=1)
+ self.smish= Smish()#nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.smish(x)
+ x = self.conv2(x)
+ if self.use_act:
+ x = self.smish(x)
+ return x
+
+
+class TED(nn.Module):
+ """ Definition of Tiny and Efficient Edge Detector
+ model
+ """
+
+ def __init__(self):
+ super(TED, self).__init__()
+ self.block_1 = DoubleConvBlock(3, 16, 16, stride=2,)
+ self.block_2 = DoubleConvBlock(16, 32, use_act=False)
+ self.dblock_3 = _DenseBlock(1, 32, 48) # [32,48,100,100] before (2, 32, 64)
+
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+ # skip1 connection, see fig. 2
+ self.side_1 = SingleConvBlock(16, 32, 2)
+
+ # skip2 connection, see fig. 2
+ self.pre_dense_3 = SingleConvBlock(32, 48, 1) # before (32, 64, 1)
+
+ # USNet
+ self.up_block_1 = UpConvBlock(16, 1)
+ self.up_block_2 = UpConvBlock(32, 1)
+ self.up_block_3 = UpConvBlock(48, 2) # (32, 64, 1)
+
+ self.block_cat = DoubleFusion(3,3) # TEED: DoubleFusion
+
+ self.apply(weight_init)
+
+ def slice(self, tensor, slice_shape):
+ t_shape = tensor.shape
+ img_h, img_w = slice_shape
+ if img_w!=t_shape[-1] or img_h!=t_shape[2]:
+ new_tensor = F.interpolate(
+ tensor, size=(img_h, img_w), mode='bicubic',align_corners=False)
+
+ else:
+ new_tensor=tensor
+ # tensor[..., :height, :width]
+ return new_tensor
+ def resize_input(self,tensor):
+ t_shape = tensor.shape
+ if t_shape[2] % 8 != 0 or t_shape[3] % 8 != 0:
+ img_w= ((t_shape[3]// 8) + 1) * 8
+ img_h = ((t_shape[2] // 8) + 1) * 8
+ new_tensor = F.interpolate(
+ tensor, size=(img_h, img_w), mode='bicubic', align_corners=False)
+ else:
+ new_tensor = tensor
+ return new_tensor
+
+    @staticmethod
+    def crop_bdcn(data1, h, w, crop_h, crop_w):
+ # Based on BDCN Implementation @ https://github.com/pkuCactus/BDCN
+ _, _, h1, w1 = data1.size()
+ assert (h <= h1 and w <= w1)
+ data = data1[:, :, crop_h:crop_h + h, crop_w:crop_w + w]
+ return data
+
+
+ def forward(self, x, single_test=False):
+ assert x.ndim == 4, x.shape
+        # suppose the image size is 352x352
+
+ # Block 1
+ block_1 = self.block_1(x) # [8,16,176,176]
+ block_1_side = self.side_1(block_1) # 16 [8,32,88,88]
+
+ # Block 2
+ block_2 = self.block_2(block_1) # 32 # [8,32,176,176]
+ block_2_down = self.maxpool(block_2) # [8,32,88,88]
+ block_2_add = block_2_down + block_1_side # [8,32,88,88]
+
+ # Block 3
+        block_3_pre_dense = self.pre_dense_3(block_2_down)  # [8,48,88,88] block 3 L connection
+        block_3, _ = self.dblock_3([block_2_add, block_3_pre_dense])  # [8,48,88,88]
+
+ # upsampling blocks
+ out_1 = self.up_block_1(block_1)
+ out_2 = self.up_block_2(block_2)
+ out_3 = self.up_block_3(block_3)
+
+ results = [out_1, out_2, out_3]
+
+ # concatenate multiscale outputs
+        block_cat = torch.cat(results, dim=1)  # Bx3xHxW
+ block_cat = self.block_cat(block_cat) # Bx1xHxW DoubleFusion
+
+ results.append(block_cat)
+ return results
+
+
+if __name__ == '__main__':
+ batch_size = 8
+ img_height = 352
+ img_width = 352
+
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
+ device = "cpu"
+ input = torch.rand(batch_size, 3, img_height, img_width).to(device)
+ # target = torch.rand(batch_size, 1, img_height, img_width).to(device)
+ print(f"input shape: {input.shape}")
+ model = TED().to(device)
+ output = model(input)
+ print(f"output shapes: {[t.shape for t in output]}")
+
+ # for i in range(20000):
+ # print(i)
+ # output = model(input)
+ # loss = nn.MSELoss()(output[-1], target)
+ # loss.backward()
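
A small sketch to sanity-check the parameter-count note at the top of this file (the exact total may differ slightly from the quoted figures):

from custom_controlnet_aux.teed.ted import TED

model = TED()
n_params = sum(p.numel() for p in model.parameters())
print(f"TED parameter count: {n_params}")  # expected to be on the order of tens of thousands
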
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/requirements.txt b/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_image.png b/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..c4a751e31da45af83c8a3d5ec02cf8c22c7bb8e9
Binary files /dev/null and b/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_image.png differ
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_processor.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..d48775e1d29b8813eb5940c88d3527cb79270e07
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_processor.py
@@ -0,0 +1,95 @@
+"""Test the Processor class."""
+import unittest
+from PIL import Image
+
+from custom_controlnet_aux.processor import Processor
+
+
+class TestProcessor(unittest.TestCase):
+ def test_hed(self):
+ processor = Processor('hed')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_midas(self):
+ processor = Processor('midas')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_mlsd(self):
+ processor = Processor('mlsd')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_openpose(self):
+ processor = Processor('openpose')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_pidinet(self):
+ processor = Processor('pidinet')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_normalbae(self):
+ processor = Processor('normalbae')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_lineart(self):
+ processor = Processor('lineart')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_lineart_coarse(self):
+ processor = Processor('lineart_coarse')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_lineart_anime(self):
+ processor = Processor('lineart_anime')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_canny(self):
+ processor = Processor('canny')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_content_shuffle(self):
+ processor = Processor('content_shuffle')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_zoe(self):
+ processor = Processor('zoe')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_mediapipe_face(self):
+ processor = Processor('mediapipe_face')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+ def test_tile(self):
+ processor = Processor('tile')
+ image = Image.open('test_image.png')
+ processed_image = processor(image)
+ self.assertIsInstance(processed_image, bytes)
+
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_processor_pytest.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_processor_pytest.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74dd4a95abab8bab3dc34af2142b21d46bd0d9d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/tests/test_processor_pytest.py
@@ -0,0 +1,78 @@
+import io
+
+import numpy as np
+import pytest
+from PIL import Image
+
+from custom_controlnet_aux.processor import MODELS, Processor
+
+
+@pytest.fixture(params=[
+ 'scribble_hed',
+ 'softedge_hed',
+ 'scribble_hedsafe',
+ 'softedge_hedsafe',
+ 'depth_midas',
+ 'mlsd',
+ 'openpose',
+ 'openpose_hand',
+ 'openpose_face',
+ 'openpose_faceonly',
+ 'openpose_full',
+ 'scribble_pidinet',
+ 'softedge_pidinet',
+ 'scribble_pidsafe',
+ 'softedge_pidsafe',
+ 'normal_bae',
+ 'lineart_coarse',
+ 'lineart_realistic',
+ 'lineart_anime',
+ 'canny',
+ 'shuffle',
+ 'depth_zoe',
+ 'depth_leres',
+ 'depth_leres++',
+ 'mediapipe_face',
+ 'tile'
+])
+def processor(request):
+ return Processor(request.param)
+
+
+def test_processor_init(processor):
+ assert isinstance(processor.processor, MODELS[processor.processor_id]['class'])
+ assert isinstance(processor.params, dict)
+
+
+def test_processor_call(processor):
+ # Load test image
+ with open('test_image.png', 'rb') as f:
+ image_bytes = f.read()
+ image = Image.open(io.BytesIO(image_bytes))
+
+ # Output size
+ resolution = 512
+ W, H = image.size
+ H = float(H)
+ W = float(W)
+ k = float(resolution) / min(H, W)
+ H *= k
+ W *= k
+ H = int(np.round(H / 64.0)) * 64
+ W = int(np.round(W / 64.0)) * 64
+
+ # Test processing
+ processed_image = processor(image)
+ assert isinstance(processed_image, Image.Image)
+ assert processed_image.size == (W, H)
+
+
+def test_processor_call_bytes(processor):
+ # Load test image
+ with open('test_image.png', 'rb') as f:
+ image_bytes = f.read()
+
+ # Test processing
+ processed_image_bytes = processor(image_bytes, to_pil=False)
+ assert isinstance(processed_image_bytes, bytes)
+ assert len(processed_image_bytes) > 0
\ No newline at end of file
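
For reference, the output-size arithmetic exercised by test_processor_call above scales the short side to the target resolution and snaps both sides to multiples of 64; a small worked sketch (the helper name is illustrative):

import numpy as np

def expected_size(width, height, resolution=512):
    # Scale so the short side matches `resolution`, then round both sides to multiples of 64.
    k = float(resolution) / min(height, width)
    w = int(np.round(width * k / 64.0)) * 64
    h = int(np.round(height * k / 64.0)) * 64
    return w, h

print(expected_size(640, 480))  # (704, 512): 480 -> 512 exactly, 640 -> ~683 -> 704
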
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/tile/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/tile/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f91fe2696d836769cd3d2531fddaa07cf5579e29
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/tile/__init__.py
@@ -0,0 +1,82 @@
+import warnings
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import get_upscale_method, common_input_validate, HWC3
+from .guided_filter import FastGuidedFilter
+
+class TileDetector:
+ def __call__(self, input_image=None, pyrUp_iters=3, output_type=None, upscale_method="INTER_AREA", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ H, W, _ = input_image.shape
+ H = int(np.round(H / 64.0)) * 64
+ W = int(np.round(W / 64.0)) * 64
+ detected_map = cv2.resize(input_image, (W // (2 ** pyrUp_iters), H // (2 ** pyrUp_iters)),
+ interpolation=get_upscale_method(upscale_method))
+ detected_map = HWC3(detected_map)
+
+ for _ in range(pyrUp_iters):
+ detected_map = cv2.pyrUp(detected_map)
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
+
+# Source: https://huggingface.co/TTPlanet/TTPLanet_SDXL_Controlnet_Tile_Realistic/blob/main/TTP_tile_preprocessor_v5.py
+
+def apply_gaussian_blur(image_np, ksize=5, sigmaX=1.0):
+ if ksize % 2 == 0:
+ ksize += 1 # ksize must be odd
+ blurred_image = cv2.GaussianBlur(image_np, (ksize, ksize), sigmaX=sigmaX)
+ return blurred_image
+
+def apply_guided_filter(image_np, radius, eps, scale):
+ filter = FastGuidedFilter(image_np, radius, eps, scale)
+ return filter.filter(image_np)
+
+class TTPlanet_Tile_Detector_GF:
+ def __call__(self, input_image, scale_factor, blur_strength, radius, eps, output_type=None, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ img_np = input_image[:, :, ::-1] # RGB to BGR
+
+ # Apply Gaussian blur
+ img_np = apply_gaussian_blur(img_np, ksize=int(blur_strength), sigmaX=blur_strength / 2)
+
+ # Apply Guided Filter
+ img_np = apply_guided_filter(img_np, radius, eps, scale_factor)
+
+ # Resize image
+ height, width = img_np.shape[:2]
+ new_width = int(width / scale_factor)
+ new_height = int(height / scale_factor)
+ resized_down = cv2.resize(img_np, (new_width, new_height), interpolation=cv2.INTER_AREA)
+ resized_img = cv2.resize(resized_down, (width, height), interpolation=cv2.INTER_CUBIC)
+ detected_map = HWC3(resized_img[:, :, ::-1]) # BGR to RGB
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
+class TTPLanet_Tile_Detector_Simple:
+ def __call__(self, input_image, scale_factor, blur_strength, output_type=None, **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ img_np = input_image[:, :, ::-1] # RGB to BGR
+
+ # Resize image first if you want blur to apply after resizing
+ height, width = img_np.shape[:2]
+ new_width = int(width / scale_factor)
+ new_height = int(height / scale_factor)
+ resized_down = cv2.resize(img_np, (new_width, new_height), interpolation=cv2.INTER_AREA)
+ resized_img = cv2.resize(resized_down, (width, height), interpolation=cv2.INTER_LANCZOS4)
+
+ # Apply Gaussian blur after resizing
+ img_np = apply_gaussian_blur(resized_img, ksize=int(blur_strength), sigmaX=blur_strength / 2)
+ detected_map = HWC3(img_np[:, :, ::-1]) # BGR to RGB
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
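
A hedged usage sketch for the tile preprocessors above; the parameter values (scale_factor, blur_strength, radius, eps) are illustrative, and a random array stands in for a real RGB image:

import numpy as np

from custom_controlnet_aux.tile import TileDetector, TTPlanet_Tile_Detector_GF

rgb = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)  # stand-in for a real HWC uint8 image

blurry = TileDetector()(rgb, pyrUp_iters=3, output_type="pil")
guided = TTPlanet_Tile_Detector_GF()(rgb, scale_factor=2.0, blur_strength=3.0,
                                     radius=7, eps=0.01, output_type="pil")
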
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/tile/guided_filter.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/tile/guided_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef079dcfd11be9609044143504dcd83a29e53e07
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/tile/guided_filter.py
@@ -0,0 +1,281 @@
+
+# -*- coding: utf-8 -*-
+## @package guided_filter.core.filters
+#
+# Implementation of guided filter.
+# * GuidedFilter: Original guided filter.
+# * FastGuidedFilter: Fast version of the guided filter.
+# @author tody
+# @date 2015/08/26
+
+import numpy as np
+import cv2
+
+## Convert image into float32 type.
+def to32F(img):
+ if img.dtype == np.float32:
+ return img
+ return (1.0 / 255.0) * np.float32(img)
+
+## Convert image into uint8 type.
+def to8U(img):
+ if img.dtype == np.uint8:
+ return img
+ return np.clip(np.uint8(255.0 * img), 0, 255)
+
+## Return True if the input image is grayscale.
+def _isGray(I):
+ return len(I.shape) == 2
+
+
+## Return a down-sampled image.
+# @param scale Down-sampling factor; a (w/scale, h/scale) image is created.
+# @param shape Target shape (h, w) = I.shape[:2]; overrides scale when given.
+def _downSample(I, scale=4, shape=None):
+ if shape is not None:
+ h, w = shape
+ return cv2.resize(I, (w, h), interpolation=cv2.INTER_NEAREST)
+
+ h, w = I.shape[:2]
+ return cv2.resize(I, (int(w / scale), int(h / scale)), interpolation=cv2.INTER_NEAREST)
+
+
+## Return an up-sampled image.
+# @param scale Up-sampling factor; a (w*scale, h*scale) image is created.
+# @param shape Target shape (h, w) = I.shape[:2]; overrides scale when given.
+def _upSample(I, scale=2, shape=None):
+ if shape is not None:
+ h, w = shape
+ return cv2.resize(I, (w, h), interpolation=cv2.INTER_LINEAR)
+
+ h, w = I.shape[:2]
+ return cv2.resize(I, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
+
+## Fast guided filter.
+class FastGuidedFilter:
+ ## Constructor.
+ # @param I Input guidance image. Color or gray.
+ # @param radius Radius of Guided Filter.
+ # @param epsilon Regularization term of Guided Filter.
+ # @param scale Down sampled scale.
+ def __init__(self, I, radius=5, epsilon=0.4, scale=4):
+ I_32F = to32F(I)
+ self._I = I_32F
+ h, w = I.shape[:2]
+
+ I_sub = _downSample(I_32F, scale)
+
+ self._I_sub = I_sub
+ radius = int(radius / scale)
+
+ if _isGray(I):
+ self._guided_filter = GuidedFilterGray(I_sub, radius, epsilon)
+ else:
+ self._guided_filter = GuidedFilterColor(I_sub, radius, epsilon)
+
+ ## Apply filter for the input image.
+ # @param p Input image for the filtering.
+ def filter(self, p):
+ p_32F = to32F(p)
+ shape_original = p.shape[:2]
+
+ p_sub = _downSample(p_32F, shape=self._I_sub.shape[:2])
+
+ if _isGray(p_sub):
+ return self._filterGray(p_sub, shape_original)
+
+ cs = p.shape[2]
+ q = np.array(p_32F)
+
+ for ci in range(cs):
+ q[:, :, ci] = self._filterGray(p_sub[:, :, ci], shape_original)
+ return to8U(q)
+
+ def _filterGray(self, p_sub, shape_original):
+ ab_sub = self._guided_filter._computeCoefficients(p_sub)
+ ab = [_upSample(abi, shape=shape_original) for abi in ab_sub]
+ return self._guided_filter._computeOutput(ab, self._I)
+
+
+## Guided filter.
+class GuidedFilter:
+ ## Constructor.
+ # @param I Input guidance image. Color or gray.
+ # @param radius Radius of Guided Filter.
+ # @param epsilon Regularization term of Guided Filter.
+ def __init__(self, I, radius=5, epsilon=0.4):
+ I_32F = to32F(I)
+
+ if _isGray(I):
+ self._guided_filter = GuidedFilterGray(I_32F, radius, epsilon)
+ else:
+ self._guided_filter = GuidedFilterColor(I_32F, radius, epsilon)
+
+ ## Apply filter for the input image.
+ # @param p Input image for the filtering.
+ def filter(self, p):
+ return to8U(self._guided_filter.filter(p))
+
+
+## Common parts of the guided filter.
+#
+# Used by GuidedFilterGray and GuidedFilterColor: GuidedFilterCommon.filter
+# computes the filtered image for gray or color inputs via the wrapped
+# filter's _computeCoefficients and _computeOutput methods.
+class GuidedFilterCommon:
+ def __init__(self, guided_filter):
+ self._guided_filter = guided_filter
+
+ ## Apply filter for the input image.
+ # @param p Input image for the filtering.
+ def filter(self, p):
+ p_32F = to32F(p)
+ if _isGray(p_32F):
+ return self._filterGray(p_32F)
+
+ cs = p.shape[2]
+ q = np.array(p_32F)
+
+ for ci in range(cs):
+ q[:, :, ci] = self._filterGray(p_32F[:, :, ci])
+ return q
+
+ def _filterGray(self, p):
+ ab = self._guided_filter._computeCoefficients(p)
+ return self._guided_filter._computeOutput(ab, self._guided_filter._I)
+
+
+## Guided filter for gray guidance image.
+class GuidedFilterGray:
+ # @param I Input gray guidance image.
+ # @param radius Radius of Guided Filter.
+ # @param epsilon Regularization term of Guided Filter.
+ def __init__(self, I, radius=5, epsilon=0.4):
+ self._radius = 2 * radius + 1
+ self._epsilon = epsilon
+ self._I = to32F(I)
+ self._initFilter()
+ self._filter_common = GuidedFilterCommon(self)
+
+ ## Apply filter for the input image.
+ # @param p Input image for the filtering.
+ def filter(self, p):
+ return self._filter_common.filter(p)
+
+ def _initFilter(self):
+ I = self._I
+ r = self._radius
+ self._I_mean = cv2.blur(I, (r, r))
+ I_mean_sq = cv2.blur(I ** 2, (r, r))
+ self._I_var = I_mean_sq - self._I_mean ** 2
+
+ def _computeCoefficients(self, p):
+ r = self._radius
+ p_mean = cv2.blur(p, (r, r))
+ p_cov = p_mean - self._I_mean * p_mean
+ a = p_cov / (self._I_var + self._epsilon)
+ b = p_mean - a * self._I_mean
+ a_mean = cv2.blur(a, (r, r))
+ b_mean = cv2.blur(b, (r, r))
+ return a_mean, b_mean
+
+ def _computeOutput(self, ab, I):
+ a_mean, b_mean = ab
+ return a_mean * I + b_mean
+
+
+## Guided filter for color guidance image.
+class GuidedFilterColor:
+ # @param I Input color guidance image.
+ # @param radius Radius of Guided Filter.
+ # @param epsilon Regularization term of Guided Filter.
+ def __init__(self, I, radius=5, epsilon=0.2):
+ self._radius = 2 * radius + 1
+ self._epsilon = epsilon
+ self._I = to32F(I)
+ self._initFilter()
+ self._filter_common = GuidedFilterCommon(self)
+
+ ## Apply filter for the input image.
+ # @param p Input image for the filtering.
+ def filter(self, p):
+ return self._filter_common.filter(p)
+
+ def _initFilter(self):
+ I = self._I
+ r = self._radius
+ eps = self._epsilon
+
+ Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]
+
+ self._Ir_mean = cv2.blur(Ir, (r, r))
+ self._Ig_mean = cv2.blur(Ig, (r, r))
+ self._Ib_mean = cv2.blur(Ib, (r, r))
+
+ Irr_var = cv2.blur(Ir ** 2, (r, r)) - self._Ir_mean ** 2 + eps
+ Irg_var = cv2.blur(Ir * Ig, (r, r)) - self._Ir_mean * self._Ig_mean
+ Irb_var = cv2.blur(Ir * Ib, (r, r)) - self._Ir_mean * self._Ib_mean
+ Igg_var = cv2.blur(Ig * Ig, (r, r)) - self._Ig_mean * self._Ig_mean + eps
+ Igb_var = cv2.blur(Ig * Ib, (r, r)) - self._Ig_mean * self._Ib_mean
+ Ibb_var = cv2.blur(Ib * Ib, (r, r)) - self._Ib_mean * self._Ib_mean + eps
+
+ Irr_inv = Igg_var * Ibb_var - Igb_var * Igb_var
+ Irg_inv = Igb_var * Irb_var - Irg_var * Ibb_var
+ Irb_inv = Irg_var * Igb_var - Igg_var * Irb_var
+ Igg_inv = Irr_var * Ibb_var - Irb_var * Irb_var
+ Igb_inv = Irb_var * Irg_var - Irr_var * Igb_var
+ Ibb_inv = Irr_var * Igg_var - Irg_var * Irg_var
+
+ I_cov = Irr_inv * Irr_var + Irg_inv * Irg_var + Irb_inv * Irb_var
+ Irr_inv /= I_cov
+ Irg_inv /= I_cov
+ Irb_inv /= I_cov
+ Igg_inv /= I_cov
+ Igb_inv /= I_cov
+ Ibb_inv /= I_cov
+
+ self._Irr_inv = Irr_inv
+ self._Irg_inv = Irg_inv
+ self._Irb_inv = Irb_inv
+ self._Igg_inv = Igg_inv
+ self._Igb_inv = Igb_inv
+ self._Ibb_inv = Ibb_inv
+
+ def _computeCoefficients(self, p):
+ r = self._radius
+ I = self._I
+ Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]
+
+ p_mean = cv2.blur(p, (r, r))
+
+ Ipr_mean = cv2.blur(Ir * p, (r, r))
+ Ipg_mean = cv2.blur(Ig * p, (r, r))
+ Ipb_mean = cv2.blur(Ib * p, (r, r))
+
+ Ipr_cov = Ipr_mean - self._Ir_mean * p_mean
+ Ipg_cov = Ipg_mean - self._Ig_mean * p_mean
+ Ipb_cov = Ipb_mean - self._Ib_mean * p_mean
+
+ ar = self._Irr_inv * Ipr_cov + self._Irg_inv * Ipg_cov + self._Irb_inv * Ipb_cov
+ ag = self._Irg_inv * Ipr_cov + self._Igg_inv * Ipg_cov + self._Igb_inv * Ipb_cov
+ ab = self._Irb_inv * Ipr_cov + self._Igb_inv * Ipg_cov + self._Ibb_inv * Ipb_cov
+ b = p_mean - ar * self._Ir_mean - ag * self._Ig_mean - ab * self._Ib_mean
+
+ ar_mean = cv2.blur(ar, (r, r))
+ ag_mean = cv2.blur(ag, (r, r))
+ ab_mean = cv2.blur(ab, (r, r))
+ b_mean = cv2.blur(b, (r, r))
+
+ return ar_mean, ag_mean, ab_mean, b_mean
+
+ def _computeOutput(self, ab, I):
+ ar_mean, ag_mean, ab_mean, b_mean = ab
+
+ Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]
+
+ q = (ar_mean * Ir +
+ ag_mean * Ig +
+ ab_mean * Ib +
+ b_mean)
+
+ return q
\ No newline at end of file
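
A hedged usage sketch for the guided filter classes above (the radius/epsilon/scale values are illustrative; a random array stands in for a real guidance image):

import numpy as np

from custom_controlnet_aux.tile.guided_filter import GuidedFilter, FastGuidedFilter

guide = (np.random.rand(256, 256, 3) * 255).astype(np.uint8)  # guidance image I
target = guide.copy()                                         # filter the image by itself (edge-preserving smoothing)

smoothed = GuidedFilter(guide, radius=5, epsilon=0.2).filter(target)
smoothed_fast = FastGuidedFilter(guide, radius=8, epsilon=0.2, scale=4).filter(target)
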
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5884c61967b0dca3b1d61dbfc457c579bd05375d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/__init__.py
@@ -0,0 +1,68 @@
+import os
+from .inference import init_segmentor, inference_segmentor, show_result_pyplot
+import warnings
+import cv2
+import numpy as np
+from PIL import Image
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME
+import torch
+
+from custom_mmpkg.custom_mmseg.core.evaluation import get_palette
+
+config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "upernet_global_small.py")
+
+
+
+class UniformerSegmentor:
+ def __init__(self, netNetwork):
+ self.model = netNetwork
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="upernet_global_small.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ netNetwork = init_segmentor(config_file, model_path, device="cpu")
+ netNetwork.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(model_path)['state_dict'].items()})
+ netNetwork.eval()
+
+ return cls(netNetwork)
+
+ def to(self, device):
+ self.model.to(device)
+ return self
+
+ def _inference(self, img):
+ if next(self.model.parameters()).device.type == 'mps':
+ # adaptive_avg_pool2d can fail on MPS, workaround with CPU
+ import torch.nn.functional
+
+ orig_adaptive_avg_pool2d = torch.nn.functional.adaptive_avg_pool2d
+ def cpu_if_exception(input, *args, **kwargs):
+ try:
+ return orig_adaptive_avg_pool2d(input, *args, **kwargs)
+                except Exception:
+ return orig_adaptive_avg_pool2d(input.cpu(), *args, **kwargs).to(input.device)
+
+ try:
+ torch.nn.functional.adaptive_avg_pool2d = cpu_if_exception
+ result = inference_segmentor(self.model, img)
+ finally:
+ torch.nn.functional.adaptive_avg_pool2d = orig_adaptive_avg_pool2d
+ else:
+ result = inference_segmentor(self.model, img)
+
+ res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1)
+ return res_img
+
+ def __call__(self, input_image=None, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ detected_map = self._inference(input_image)
+ detected_map = remove_pad(HWC3(detected_map))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
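
A hedged usage sketch for UniformerSegmentor; it assumes the default checkpoint (upernet_global_small.pth under HF_MODEL_NAME) is reachable and that the custom_mmpkg dependencies are installed:

import numpy as np

from custom_controlnet_aux.uniformer import UniformerSegmentor

segmentor = UniformerSegmentor.from_pretrained().to("cpu")
rgb = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)  # stand-in for a real HWC uint8 image
seg = segmentor(rgb, detect_resolution=512, output_type="pil")
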
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/ade20k.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/ade20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eae91063c11086a58d140f8a53ff4195da0f312
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/ade20k.py
@@ -0,0 +1,54 @@
+# dataset settings
+dataset_type = 'ADE20KDataset'
+data_root = 'data/ade/ADEChallengeData2016'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 512)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', reduce_zero_label=True),
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2048, 512),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/chase_db1.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/chase_db1.py
new file mode 100644
index 0000000000000000000000000000000000000000..f436bb0c6ba0fa1378a8a5cadfa64244a85e023d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/chase_db1.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'ChaseDB1Dataset'
+data_root = 'data/CHASE_DB1'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (960, 999)
+crop_size = (128, 128)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/cityscapes.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5c15aea051caf2a82289dc86294a5bb3b274f57
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/cityscapes.py
@@ -0,0 +1,54 @@
+# dataset settings
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 1024)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2048, 1024),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='leftImg8bit/train',
+ ann_dir='gtFine/train',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='leftImg8bit/val',
+ ann_dir='gtFine/val',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='leftImg8bit/val',
+ ann_dir='gtFine/val',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/cityscapes_769x769.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/cityscapes_769x769.py
new file mode 100644
index 0000000000000000000000000000000000000000..da7bbefafa9a3a84f70b95c4f4db208421a7fe3e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/cityscapes_769x769.py
@@ -0,0 +1,35 @@
+_base_ = './cityscapes.py'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (769, 769)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2049, 1025),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ train=dict(pipeline=train_pipeline),
+ val=dict(pipeline=test_pipeline),
+ test=dict(pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/drive.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/drive.py
new file mode 100644
index 0000000000000000000000000000000000000000..510b0c9bbbdeb54d569d3d89ecc1bdae3dd0e90b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/drive.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'DRIVEDataset'
+data_root = 'data/DRIVE'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (584, 565)
+crop_size = (64, 64)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/hrf.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/hrf.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a195d859af8cde4dd77c141a9d875b3c7f9196
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/hrf.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'HRFDataset'
+data_root = 'data/HRF'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (2336, 3504)
+crop_size = (256, 256)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_context.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8404d6e360873e49d769db8b4b9092b613b412f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_context.py
@@ -0,0 +1,60 @@
+# dataset settings
+dataset_type = 'PascalContextDataset'
+data_root = 'data/VOCdevkit/VOC2010/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+img_scale = (520, 520)
+crop_size = (480, 480)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/train.txt',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_context_59.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_context_59.py
new file mode 100644
index 0000000000000000000000000000000000000000..c846de3adb3f5d0f5716601d2b0d4b0f4759af41
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_context_59.py
@@ -0,0 +1,60 @@
+# dataset settings
+dataset_type = 'PascalContextDataset59'
+data_root = 'data/VOCdevkit/VOC2010/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+img_scale = (520, 520)
+crop_size = (480, 480)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', reduce_zero_label=True),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/train.txt',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_voc12.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_voc12.py
new file mode 100644
index 0000000000000000000000000000000000000000..a51ab620e050f48dc6422626e7322f9ca991d47c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_voc12.py
@@ -0,0 +1,57 @@
+# dataset settings
+dataset_type = 'PascalVOCDataset'
+data_root = 'data/VOCdevkit/VOC2012'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 512)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2048, 512),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClass',
+ split='ImageSets/Segmentation/train.txt',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClass',
+ split='ImageSets/Segmentation/val.txt',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClass',
+ split='ImageSets/Segmentation/val.txt',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_voc12_aug.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_voc12_aug.py
new file mode 100644
index 0000000000000000000000000000000000000000..497563d2c1d2a3587c846f8e4584a4d233ccaa6d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/pascal_voc12_aug.py
@@ -0,0 +1,9 @@
+_base_ = './pascal_voc12.py'
+# dataset settings
+data = dict(
+ train=dict(
+ ann_dir=['SegmentationClass', 'SegmentationClassAug'],
+ split=[
+ 'ImageSets/Segmentation/train.txt',
+ 'ImageSets/Segmentation/aug.txt'
+ ]))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/stare.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/stare.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa118cfa213c91aa31cb7c96324508c910f40ac1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/datasets/stare.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'STAREDataset'
+data_root = 'data/STARE'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (605, 700)
+crop_size = (128, 128)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/default_runtime.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..87955b3010369c48839d3bbdcabdd4b632f97821
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/default_runtime.py
@@ -0,0 +1,14 @@
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook', by_epoch=False),
+ # dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+cudnn_benchmark = True
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ann_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ann_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed54b25dd04cf43480f5deaaff03b48a12834402
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ann_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='ANNHead',
+ in_channels=[1024, 2048],
+ in_index=[2, 3],
+ channels=512,
+ project_channels=256,
+ query_scales=(1, ),
+ key_pool_scales=(1, 3, 6, 8),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/apcnet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/apcnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8379f80ac9ae3df54309f9669927c4b2865c8e3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/apcnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='APCHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ pool_scales=(1, 2, 3, 6),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ccnet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ccnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..d456a2e96ff36ca84a02d028ed82087c14f7e7cd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ccnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='CCHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ recurrence=2,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/cgnet.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/cgnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a9ab7acbc28152177391b5168ae29c42e40cb62
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/cgnet.py
@@ -0,0 +1,35 @@
+# model settings
+norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='CGNet',
+ norm_cfg=norm_cfg,
+ in_channels=3,
+ num_channels=(32, 64, 128),
+ num_blocks=(3, 21),
+ dilations=(2, 4),
+ reductions=(8, 16)),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=256,
+ in_index=2,
+ channels=256,
+ num_convs=0,
+ concat_input=False,
+ dropout_ratio=0,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ loss_decode=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=[
+ 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
+ 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
+ 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
+ 10.396974, 10.055647
+ ])),
+ # model training and testing settings
+ train_cfg=dict(sampler=None),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/danet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/danet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..927a825dfdb2a9134cdab83c8cdeced91114b1de
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/danet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DAHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ pam_channels=64,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4ecfff1c5a782884c80eea1dfb12062e2ae07ed
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='ASPPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dilations=(1, 12, 24, 36),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b9b4b26859bc46db89d19fcde79741587ff0a5a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py
@@ -0,0 +1,50 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UNet',
+ in_channels=3,
+ base_channels=64,
+ num_stages=5,
+ strides=(1, 1, 1, 1, 1),
+ enc_num_convs=(2, 2, 2, 2, 2),
+ dec_num_convs=(2, 2, 2, 2),
+ downsamples=(True, True, True, True),
+ enc_dilations=(1, 1, 1, 1, 1),
+ dec_dilations=(1, 1, 1, 1),
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ upsample_cfg=dict(type='InterpConv'),
+ norm_eval=False),
+ decode_head=dict(
+ type='ASPPHead',
+ in_channels=64,
+ in_index=4,
+ channels=16,
+ dilations=(1, 12, 24, 36),
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=3,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5315345eb7691da28f7e9d077f2eb66217110d8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DepthwiseSeparableASPPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dilations=(1, 12, 24, 36),
+ c1_in_channels=256,
+ c1_channels=48,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/dmnet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/dmnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..501ad117bc5ccb311bc55f4b693452dc861ea743
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/dmnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DMHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ filter_sizes=(1, 3, 5, 7),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/dnl_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/dnl_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..8151467633f8a69532437cc5fc2d62cd95bb709d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/dnl_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DNLHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dropout_ratio=0.1,
+ reduction=2,
+ use_scale=True,
+ mode='embedded_gaussian',
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/emanet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/emanet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4e9f53eded8c3ed519dc0a9d12bc1c4cedbbbdd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/emanet_r50-d8.py
@@ -0,0 +1,47 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='EMAHead',
+ in_channels=2048,
+ in_index=3,
+ channels=256,
+ ema_channels=512,
+ num_bases=64,
+ num_stages=3,
+ momentum=0.1,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/encnet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/encnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..6109d803c8f261f24e90075d6b0f86509c021a82
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/encnet_r50-d8.py
@@ -0,0 +1,48 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='EncHead',
+ in_channels=[512, 1024, 2048],
+ in_index=(1, 2, 3),
+ channels=512,
+ num_codes=32,
+ use_se_loss=True,
+ add_lateral=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_se_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fast_scnn.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fast_scnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cde395ee07e3d2ca7b9d7f71810bd3105cd5bab
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fast_scnn.py
@@ -0,0 +1,57 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='FastSCNN',
+ downsample_dw_channels=(32, 48),
+ global_in_channels=64,
+ global_block_channels=(64, 96, 128),
+ global_block_strides=(2, 2, 1),
+ global_out_channels=128,
+ higher_in_channels=64,
+ lower_in_channels=128,
+ fusion_out_channels=128,
+ out_indices=(0, 1, 2),
+ norm_cfg=norm_cfg,
+ align_corners=False),
+ decode_head=dict(
+ type='DepthwiseSeparableFCNHead',
+ in_channels=128,
+ channels=128,
+ concat_input=False,
+ num_classes=19,
+ in_index=-1,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+ auxiliary_head=[
+ dict(
+ type='FCNHead',
+ in_channels=128,
+ channels=32,
+ num_convs=1,
+ num_classes=19,
+ in_index=-2,
+ norm_cfg=norm_cfg,
+ concat_input=False,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+ dict(
+ type='FCNHead',
+ in_channels=64,
+ channels=32,
+ num_convs=1,
+ num_classes=19,
+ in_index=-3,
+ norm_cfg=norm_cfg,
+ concat_input=False,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+ ],
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_hr18.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_hr18.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7258530b011f146790e5ef2eae162af874b1c3e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_hr18.py
@@ -0,0 +1,52 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://msra/hrnetv2_w18',
+ backbone=dict(
+ type='HRNet',
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144)))),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=[18, 36, 72, 144],
+ in_index=(0, 1, 2, 3),
+ channels=sum([18, 36, 72, 144]),
+ input_transform='resize_concat',
+ kernel_size=1,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..a924ab493b97e27063069423fb7e8db20d67fa26
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_r50-d8.py
@@ -0,0 +1,45 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ num_convs=2,
+ concat_input=True,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_unet_s5-d16.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_unet_s5-d16.py
new file mode 100644
index 0000000000000000000000000000000000000000..e457bcca4d9cfd768d4c5f9de2c01a0bf1462c2a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fcn_unet_s5-d16.py
@@ -0,0 +1,51 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UNet',
+ in_channels=3,
+ base_channels=64,
+ num_stages=5,
+ strides=(1, 1, 1, 1, 1),
+ enc_num_convs=(2, 2, 2, 2, 2),
+ dec_num_convs=(2, 2, 2, 2),
+ downsamples=(True, True, True, True),
+ enc_dilations=(1, 1, 1, 1, 1),
+ dec_dilations=(1, 1, 1, 1),
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ upsample_cfg=dict(type='InterpConv'),
+ norm_eval=False),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=64,
+ in_index=4,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=3,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fpn_r50.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fpn_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ebc85f9edb9126c94be1f673e1cc3ed16f103f2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fpn_r50.py
@@ -0,0 +1,36 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=4),
+ decode_head=dict(
+ type='FPNHead',
+ in_channels=[256, 256, 256, 256],
+ in_index=[0, 1, 2, 3],
+ feature_strides=[4, 8, 16, 32],
+ channels=128,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fpn_uniformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fpn_uniformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5f7d0f380a6be50409a9cbbe0ae9aa9b42d4990
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/fpn_uniformer.py
@@ -0,0 +1,35 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='UniFormer',
+ embed_dim=[64, 128, 320, 512],
+ layers=[3, 4, 8, 3],
+ head_dim=64,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1),
+ neck=dict(
+ type='FPN',
+ in_channels=[64, 128, 320, 512],
+ out_channels=256,
+ num_outs=4),
+ decode_head=dict(
+ type='FPNHead',
+ in_channels=[256, 256, 256, 256],
+ in_index=[0, 1, 2, 3],
+ feature_strides=[4, 8, 16, 32],
+ channels=128,
+ dropout_ratio=0.1,
+ num_classes=150,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole')
+)
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/gcnet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/gcnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..6232154dd6a24f0d122531c1a2674a7fe4c56c43
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/gcnet_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='GCHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ ratio=1 / 4.,
+ pooling_type='att',
+ fusion_types=('channel_add', ),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/lraspp_m-v3-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/lraspp_m-v3-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb96325c33a3f19a64d9258fe21714e9043721fb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/lraspp_m-v3-d8.py
@@ -0,0 +1,25 @@
+# model settings
+norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='MobileNetV3',
+ arch='large',
+ out_indices=(1, 3, 16),
+ norm_cfg=norm_cfg),
+ decode_head=dict(
+ type='LRASPPHead',
+ in_channels=(16, 24, 960),
+ in_index=(0, 1, 2),
+ channels=128,
+ input_transform='multiple_select',
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/nonlocal_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/nonlocal_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b64a9dc4643989231ccf2c8bc7f14592809b284
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/nonlocal_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='NLHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dropout_ratio=0.1,
+ reduction=2,
+ use_scale=True,
+ mode='embedded_gaussian',
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ocrnet_hr18.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ocrnet_hr18.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a390f355ac2534269e0f90f0920019dadac0c06
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ocrnet_hr18.py
@@ -0,0 +1,68 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='CascadeEncoderDecoder',
+ num_stages=2,
+ pretrained='open-mmlab://msra/hrnetv2_w18',
+ backbone=dict(
+ type='HRNet',
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144)))),
+ decode_head=[
+ dict(
+ type='FCNHead',
+ in_channels=[18, 36, 72, 144],
+ channels=sum([18, 36, 72, 144]),
+ in_index=(0, 1, 2, 3),
+ input_transform='resize_concat',
+ kernel_size=1,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ dict(
+ type='OCRHead',
+ in_channels=[18, 36, 72, 144],
+ in_index=(0, 1, 2, 3),
+ input_transform='resize_concat',
+ channels=512,
+ ocr_channels=256,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ ],
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ocrnet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ocrnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c675f5e0ff86407577793454f83f096bf9b8b8d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/ocrnet_r50-d8.py
@@ -0,0 +1,47 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='CascadeEncoderDecoder',
+ num_stages=2,
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=[
+ dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ dict(
+ type='OCRHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ ocr_channels=256,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+ ],
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pointrend_r50.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pointrend_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8e24576a6aa9270d5e7551ec409befc1d9588ee
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pointrend_r50.py
@@ -0,0 +1,56 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='CascadeEncoderDecoder',
+ num_stages=2,
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=4),
+ decode_head=[
+ dict(
+ type='FPNHead',
+ in_channels=[256, 256, 256, 256],
+ in_index=[0, 1, 2, 3],
+ feature_strides=[4, 8, 16, 32],
+ channels=128,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='PointHead',
+ in_channels=[256],
+ in_index=[0],
+ channels=256,
+ num_fcs=3,
+ coarse_pred_each_layer=True,
+ dropout_ratio=-1,
+ num_classes=19,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+ ],
+ # model training and testing settings
+ train_cfg=dict(
+ num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75),
+ test_cfg=dict(
+ mode='whole',
+ subdivision_steps=2,
+ subdivision_num_points=8196,
+ scale_factor=2))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/psanet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/psanet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d19cdd9820ae365ed9ec24f0da0f30c63e57b7b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/psanet_r50-d8.py
@@ -0,0 +1,49 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='PSAHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ mask_size=(97, 97),
+ psa_type='bi-direction',
+ compact=False,
+ shrink_factor=2,
+ normalization_factor=1.0,
+ psa_softmax=True,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pspnet_r50-d8.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pspnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc4dd12b5260b303f2452b8be0129b93874d5a6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pspnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='PSPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ pool_scales=(1, 2, 3, 6),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb48f46f5e90c2c4ba445a60e43fba9cbe2a6233
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py
@@ -0,0 +1,50 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UNet',
+ in_channels=3,
+ base_channels=64,
+ num_stages=5,
+ strides=(1, 1, 1, 1, 1),
+ enc_num_convs=(2, 2, 2, 2, 2),
+ dec_num_convs=(2, 2, 2, 2),
+ downsamples=(True, True, True, True),
+ enc_dilations=(1, 1, 1, 1, 1),
+ dec_dilations=(1, 1, 1, 1),
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ upsample_cfg=dict(type='InterpConv'),
+ norm_eval=False),
+ decode_head=dict(
+ type='PSPHead',
+ in_channels=64,
+ in_index=4,
+ channels=16,
+ pool_scales=(1, 2, 3, 6),
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=3,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/upernet_r50.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/upernet_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c91d0d8919a1b4813f55d76baabce92d789b9e2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/upernet_r50.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='UPerHead',
+ in_channels=[256, 512, 1024, 2048],
+ in_index=[0, 1, 2, 3],
+ pool_scales=(1, 2, 3, 6),
+ channels=512,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/upernet_uniformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/upernet_uniformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..78791d26c8c0bbe3af437e8f5492e858722b48de
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/models/upernet_uniformer.py
@@ -0,0 +1,43 @@
+# model settings
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UniFormer',
+ embed_dim=[64, 128, 320, 512],
+ layers=[3, 4, 8, 3],
+ head_dim=64,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1),
+ decode_head=dict(
+ type='UPerHead',
+ in_channels=[64, 128, 320, 512],
+ in_index=[0, 1, 2, 3],
+ pool_scales=(1, 2, 3, 6),
+ channels=512,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=320,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_160k.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_160k.py
new file mode 100644
index 0000000000000000000000000000000000000000..03ceae0bfba772a7609090fe6bb4e5c22507c683
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_160k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=160000)
+checkpoint_config = dict(by_epoch=False, interval=16000)
+evaluation = dict(interval=16000, metric='mIoU')
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_20k.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..beeb6175ef81158d6660958366ff3759b1be8dbe
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_20k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=20000)
+checkpoint_config = dict(by_epoch=False, interval=2000)
+evaluation = dict(interval=2000, metric='mIoU')
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_40k.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_40k.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f5c09a15114c5fb6d97a410808fa1938cd69eeb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_40k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=40000)
+checkpoint_config = dict(by_epoch=False, interval=4000)
+evaluation = dict(interval=4000, metric='mIoU')
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_80k.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_80k.py
new file mode 100644
index 0000000000000000000000000000000000000000..304006152ad4f9055de407a10afa2d11e9824ac6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/configs/_base_/schedules/schedule_80k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=80000)
+checkpoint_config = dict(by_epoch=False, interval=8000)
+evaluation = dict(interval=8000, metric='mIoU')
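Note: these schedule fragments are meant to be combined with a model fragment (and, in a full mmsegmentation setup, dataset and runtime fragments) through the config loader's _base_ inheritance. A hypothetical derived config, shown only to illustrate the convention — the dataset and runtime file names are assumptions and are not added by this diff:

# hypothetical_upernet_uniformer_ade20k_160k.py (illustrative only)
_base_ = [
    '../_base_/models/upernet_uniformer.py',
    '../_base_/datasets/ade20k.py',        # assumed dataset fragment
    '../_base_/default_runtime.py',        # assumed runtime fragment
    '../_base_/schedules/schedule_160k.py',
]
# Override only what differs from the base fragments.
model = dict(
    decode_head=dict(num_classes=150),
    auxiliary_head=dict(num_classes=150))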
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/inference.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d9545e7271910527d1fb7c668a4e557fe55059b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/inference.py
@@ -0,0 +1,137 @@
+
+import torch
+
+import custom_mmpkg.custom_mmcv as mmcv
+from custom_mmpkg.custom_mmcv.parallel import collate, scatter
+from custom_mmpkg.custom_mmcv.runner import load_checkpoint
+from custom_mmpkg.custom_mmseg.datasets.pipelines import Compose
+from custom_mmpkg.custom_mmseg.models import build_segmentor
+
+def init_segmentor(config, checkpoint=None, device='cuda:0'):
+ """Initialize a segmentor from config file.
+
+ Args:
+ config (str or :obj:`mmcv.Config`): Config file path or the config
+ object.
+ checkpoint (str, optional): Checkpoint path. If left as None, the model
+ will not load any weights.
+ device (str, optional): CPU/CUDA device option. Default 'cuda:0'.
+ Use 'cpu' for loading model on CPU.
+ Returns:
+ nn.Module: The constructed segmentor.
+ """
+ if isinstance(config, str):
+ config = mmcv.Config.fromfile(config)
+ elif not isinstance(config, mmcv.Config):
+ raise TypeError('config must be a filename or Config object, '
+ 'but got {}'.format(type(config)))
+ config.model.pretrained = None
+ config.model.train_cfg = None
+ model = build_segmentor(config.model, test_cfg=config.get('test_cfg'))
+ if checkpoint is not None:
+ checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+ model.CLASSES = checkpoint['meta']['CLASSES']
+ model.PALETTE = checkpoint['meta']['PALETTE']
+ model.cfg = config # save the config in the model for convenience
+ model.to(device)
+ model.eval()
+ return model
+
+
+class LoadImage:
+ """A simple pipeline to load image."""
+
+ def __call__(self, results):
+ """Call function to load images into results.
+
+ Args:
+ results (dict): A result dict containing the file name
+ of the image to be read.
+
+ Returns:
+ dict: ``results`` will be returned containing loaded image.
+ """
+
+ if isinstance(results['img'], str):
+ results['filename'] = results['img']
+ results['ori_filename'] = results['img']
+ else:
+ results['filename'] = None
+ results['ori_filename'] = None
+ img = mmcv.imread(results['img'])
+ results['img'] = img
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ return results
+
+
+def inference_segmentor(model, img):
+ """Inference image(s) with the segmentor.
+
+ Args:
+ model (nn.Module): The loaded segmentor.
+ imgs (str/ndarray or list[str/ndarray]): Either image files or loaded
+ images.
+
+ Returns:
+ (list[Tensor]): The segmentation result.
+ """
+ cfg = model.cfg
+ device = next(model.parameters()).device # model device
+ # build the data pipeline
+ test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
+ test_pipeline = Compose(test_pipeline)
+ # prepare data
+ data = dict(img=img)
+ data = test_pipeline(data)
+ data = collate([data], samples_per_gpu=1)
+ if next(model.parameters()).is_cuda:
+ # scatter to specified GPU
+ data = scatter(data, [device])[0]
+ else:
+ data['img_metas'] = [i.data[0] for i in data['img_metas']]
+
+ data['img'] = [x.to(device) for x in data['img']]
+
+ # forward the model
+ with torch.no_grad():
+ result = model(return_loss=False, rescale=True, **data)
+ return result
+
+
+def show_result_pyplot(model,
+ img,
+ result,
+ palette=None,
+ fig_size=(15, 10),
+ opacity=0.5,
+ title='',
+ block=True):
+ """Visualize the segmentation results on the image.
+
+ Args:
+ model (nn.Module): The loaded segmentor.
+ img (str or np.ndarray): Image filename or loaded image.
+ result (list): The segmentation result.
+ palette (list[list[int]] | None): The palette of the segmentation
+ map. If None is given, a random palette will be generated.
+ Default: None.
+ fig_size (tuple): Figure size of the pyplot figure.
+ opacity (float): Opacity of the painted segmentation map.
+ Default: 0.5.
+ Must be in the (0, 1] range.
+ title (str): The title of pyplot figure.
+ Default is ''.
+ block (bool): Whether to block the pyplot figure.
+ Default is True.
+ """
+ if hasattr(model, 'module'):
+ model = model.module
+ img = model.show_result(
+ img, result, palette=palette, show=False, opacity=opacity)
+ # plt.figure(figsize=fig_size)
+ # plt.imshow(mmcv.bgr2rgb(img))
+ # plt.title(title)
+ # plt.tight_layout()
+ # plt.show(block=block)
+ return mmcv.bgr2rgb(img)
\ No newline at end of file
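Note: a short usage sketch of the helpers defined in inference.py above. The module path follows this file's location under src/custom_controlnet_aux/; the config and checkpoint paths are placeholders, not files shipped by this diff:

import numpy as np
from custom_controlnet_aux.uniformer.inference import (
    init_segmentor, inference_segmentor, show_result_pyplot)

model = init_segmentor('path/to/upernet_uniformer_config.py',
                       checkpoint='path/to/uniformer_checkpoint.pth',
                       device='cpu')               # or 'cuda:0'
img = np.zeros((512, 512, 3), dtype=np.uint8)      # a BGR array or an image file path
result = inference_segmentor(model, img)           # segmentation result (one-element list)
overlay = show_result_pyplot(model, img, result)   # RGB overlay returned as an ndarray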
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/mmcv_custom/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/mmcv_custom/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f8e3caf2364a8876cd0e85cf87d5dd00630c716
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/mmcv_custom/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+from .checkpoint import load_checkpoint
+
+__all__ = ['load_checkpoint']
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/mmcv_custom/checkpoint.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/mmcv_custom/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e34efe83ad41243f32e5b844fb7cd2e06c6040a0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/mmcv_custom/checkpoint.py
@@ -0,0 +1,500 @@
+# Copyright (c) Open-MMLab. All rights reserved.
+import io
+import os
+import os.path as osp
+import pkgutil
+import time
+import warnings
+from collections import OrderedDict
+from importlib import import_module
+from tempfile import TemporaryDirectory
+
+import torch
+import torchvision
+from torch.optim import Optimizer
+from torch.utils import model_zoo
+from torch.nn import functional as F
+
+import custom_mmpkg.custom_mmcv as mmcv
+from custom_mmpkg.custom_mmcv.fileio import FileClient
+from custom_mmpkg.custom_mmcv.fileio import load as load_file
+from custom_mmpkg.custom_mmcv.parallel import is_module_wrapper
+from custom_mmpkg.custom_mmcv.utils import mkdir_or_exist
+from custom_mmpkg.custom_mmcv.runner import get_dist_info
+
+ENV_MMCV_HOME = 'MMCV_HOME'
+ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
+DEFAULT_CACHE_DIR = '~/.cache'
+
+
+def _get_mmcv_home():
+ mmcv_home = os.path.expanduser(
+ os.getenv(
+ ENV_MMCV_HOME,
+ os.path.join(
+ os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))
+
+ mkdir_or_exist(mmcv_home)
+ return mmcv_home
+
+
+def load_state_dict(module, state_dict, strict=False, logger=None):
+ """Load state_dict to a module.
+
+ This method is modified from :meth:`torch.nn.Module.load_state_dict`.
+ Default value for ``strict`` is set to ``False`` and the message for
+ param mismatch will be shown even if strict is False.
+
+ Args:
+ module (Module): Module that receives the state_dict.
+ state_dict (OrderedDict): Weights.
+ strict (bool): whether to strictly enforce that the keys
+ in :attr:`state_dict` match the keys returned by this module's
+ :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
+ logger (:obj:`logging.Logger`, optional): Logger to log the error
+ message. If not specified, print function will be used.
+ """
+ unexpected_keys = []
+ all_missing_keys = []
+ err_msg = []
+
+ metadata = getattr(state_dict, '_metadata', None)
+ state_dict = state_dict.copy()
+ if metadata is not None:
+ state_dict._metadata = metadata
+
+ # use _load_from_state_dict to enable checkpoint version control
+ def load(module, prefix=''):
+ # recursively check parallel module in case that the model has a
+ # complicated structure, e.g., nn.Module(nn.Module(DDP))
+ if is_module_wrapper(module):
+ module = module.module
+ local_metadata = {} if metadata is None else metadata.get(
+ prefix[:-1], {})
+ module._load_from_state_dict(state_dict, prefix, local_metadata, True,
+ all_missing_keys, unexpected_keys,
+ err_msg)
+ for name, child in module._modules.items():
+ if child is not None:
+ load(child, prefix + name + '.')
+
+ load(module)
+ load = None # break load->load reference cycle
+
+ # ignore "num_batches_tracked" of BN layers
+ missing_keys = [
+ key for key in all_missing_keys if 'num_batches_tracked' not in key
+ ]
+
+ if unexpected_keys:
+ err_msg.append('unexpected key in source '
+ f'state_dict: {", ".join(unexpected_keys)}\n')
+ if missing_keys:
+ err_msg.append(
+ f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
+
+ rank, _ = get_dist_info()
+ if len(err_msg) > 0 and rank == 0:
+ err_msg.insert(
+ 0, 'The model and loaded state dict do not match exactly\n')
+ err_msg = '\n'.join(err_msg)
+ if strict:
+ raise RuntimeError(err_msg)
+ elif logger is not None:
+ logger.warning(err_msg)
+ else:
+ print(err_msg)
+
+
+def load_url_dist(url, model_dir=None):
+ """In distributed setting, this function only download checkpoint at local
+ rank 0."""
+ rank, world_size = get_dist_info()
+ rank = int(os.environ.get('LOCAL_RANK', rank))
+ if rank == 0:
+ checkpoint = model_zoo.load_url(url, model_dir=model_dir)
+ if world_size > 1:
+ torch.distributed.barrier()
+ if rank > 0:
+ checkpoint = model_zoo.load_url(url, model_dir=model_dir)
+ return checkpoint
+
+
+def load_pavimodel_dist(model_path, map_location=None):
+ """In distributed setting, this function only download checkpoint at local
+ rank 0."""
+ try:
+ from pavi import modelcloud
+ except ImportError:
+ raise ImportError(
+ 'Please install pavi to load checkpoint from modelcloud.')
+ rank, world_size = get_dist_info()
+ rank = int(os.environ.get('LOCAL_RANK', rank))
+ if rank == 0:
+ model = modelcloud.get(model_path)
+ with TemporaryDirectory() as tmp_dir:
+ downloaded_file = osp.join(tmp_dir, model.name)
+ model.download(downloaded_file)
+ checkpoint = torch.load(downloaded_file, map_location=map_location)
+ if world_size > 1:
+ torch.distributed.barrier()
+ if rank > 0:
+ model = modelcloud.get(model_path)
+ with TemporaryDirectory() as tmp_dir:
+ downloaded_file = osp.join(tmp_dir, model.name)
+ model.download(downloaded_file)
+ checkpoint = torch.load(
+ downloaded_file, map_location=map_location)
+ return checkpoint
+
+
+def load_fileclient_dist(filename, backend, map_location):
+ """In distributed setting, this function only download checkpoint at local
+ rank 0."""
+ rank, world_size = get_dist_info()
+ rank = int(os.environ.get('LOCAL_RANK', rank))
+ allowed_backends = ['ceph']
+ if backend not in allowed_backends:
+ raise ValueError(f'Load from Backend {backend} is not supported.')
+ if rank == 0:
+ fileclient = FileClient(backend=backend)
+ buffer = io.BytesIO(fileclient.get(filename))
+ checkpoint = torch.load(buffer, map_location=map_location)
+ if world_size > 1:
+ torch.distributed.barrier()
+ if rank > 0:
+ fileclient = FileClient(backend=backend)
+ buffer = io.BytesIO(fileclient.get(filename))
+ checkpoint = torch.load(buffer, map_location=map_location)
+ return checkpoint
+
+
+def get_torchvision_models():
+ model_urls = dict()
+ for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
+ if ispkg:
+ continue
+ _zoo = import_module(f'torchvision.models.{name}')
+ if hasattr(_zoo, 'model_urls'):
+ _urls = getattr(_zoo, 'model_urls')
+ model_urls.update(_urls)
+ return model_urls
+
+
+def get_external_models():
+ mmcv_home = _get_mmcv_home()
+ default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
+ default_urls = load_file(default_json_path)
+ assert isinstance(default_urls, dict)
+ external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
+ if osp.exists(external_json_path):
+ external_urls = load_file(external_json_path)
+ assert isinstance(external_urls, dict)
+ default_urls.update(external_urls)
+
+ return default_urls
+
+
+def get_mmcls_models():
+ mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
+ mmcls_urls = load_file(mmcls_json_path)
+
+ return mmcls_urls
+
+
+def get_deprecated_model_names():
+ deprecate_json_path = osp.join(mmcv.__path__[0],
+ 'model_zoo/deprecated.json')
+ deprecate_urls = load_file(deprecate_json_path)
+ assert isinstance(deprecate_urls, dict)
+
+ return deprecate_urls
+
+
+def _process_mmcls_checkpoint(checkpoint):
+ state_dict = checkpoint['state_dict']
+ new_state_dict = OrderedDict()
+ for k, v in state_dict.items():
+ if k.startswith('backbone.'):
+ new_state_dict[k[9:]] = v
+ new_checkpoint = dict(state_dict=new_state_dict)
+
+ return new_checkpoint
+
+
+def _load_checkpoint(filename, map_location=None):
+ """Load checkpoint from somewhere (modelzoo, file, url).
+
+ Args:
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str | None): Same as :func:`torch.load`. Default: None.
+
+ Returns:
+ dict | OrderedDict: The loaded checkpoint. It can be either an
+ OrderedDict storing model weights or a dict containing other
+ information, which depends on the checkpoint.
+ """
+ if filename.startswith('modelzoo://'):
+ warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
+ 'use "torchvision://" instead')
+ model_urls = get_torchvision_models()
+ model_name = filename[11:]
+ checkpoint = load_url_dist(model_urls[model_name])
+ elif filename.startswith('torchvision://'):
+ model_urls = get_torchvision_models()
+ model_name = filename[14:]
+ checkpoint = load_url_dist(model_urls[model_name])
+ elif filename.startswith('open-mmlab://'):
+ model_urls = get_external_models()
+ model_name = filename[13:]
+ deprecated_urls = get_deprecated_model_names()
+ if model_name in deprecated_urls:
+ warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
+ f'of open-mmlab://{deprecated_urls[model_name]}')
+ model_name = deprecated_urls[model_name]
+ model_url = model_urls[model_name]
+ # check if is url
+ if model_url.startswith(('http://', 'https://')):
+ checkpoint = load_url_dist(model_url)
+ else:
+ filename = osp.join(_get_mmcv_home(), model_url)
+ if not osp.isfile(filename):
+ raise IOError(f'{filename} is not a checkpoint file')
+ checkpoint = torch.load(filename, map_location=map_location)
+ elif filename.startswith('mmcls://'):
+ model_urls = get_mmcls_models()
+ model_name = filename[8:]
+ checkpoint = load_url_dist(model_urls[model_name])
+ checkpoint = _process_mmcls_checkpoint(checkpoint)
+ elif filename.startswith(('http://', 'https://')):
+ checkpoint = load_url_dist(filename)
+ elif filename.startswith('pavi://'):
+ model_path = filename[7:]
+ checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
+ elif filename.startswith('s3://'):
+ checkpoint = load_fileclient_dist(
+ filename, backend='ceph', map_location=map_location)
+ else:
+ if not osp.isfile(filename):
+ raise IOError(f'{filename} is not a checkpoint file')
+ checkpoint = torch.load(filename, map_location=map_location)
+ return checkpoint
+
+
+def load_checkpoint(model,
+ filename,
+ map_location='cpu',
+ strict=False,
+ logger=None):
+ """Load checkpoint from a file or URI.
+
+ Args:
+ model (Module): Module to load checkpoint.
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str): Same as :func:`torch.load`.
+        strict (bool): Whether to require that the keys in the checkpoint
+            exactly match the keys of the model's state dict.
+ logger (:mod:`logging.Logger` or None): The logger for error message.
+
+ Returns:
+ dict or OrderedDict: The loaded checkpoint.
+ """
+ checkpoint = _load_checkpoint(filename, map_location)
+ # OrderedDict is a subclass of dict
+ if not isinstance(checkpoint, dict):
+ raise RuntimeError(
+ f'No state_dict found in checkpoint file {filename}')
+ # get state_dict from checkpoint
+ if 'state_dict' in checkpoint:
+ state_dict = checkpoint['state_dict']
+ elif 'model' in checkpoint:
+ state_dict = checkpoint['model']
+ else:
+ state_dict = checkpoint
+ # strip prefix of state_dict
+ if list(state_dict.keys())[0].startswith('module.'):
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
+
+ # for MoBY, load model of online branch
+ if sorted(list(state_dict.keys()))[0].startswith('encoder'):
+ state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}
+
+ # reshape absolute position embedding
+ if state_dict.get('absolute_pos_embed') is not None:
+ absolute_pos_embed = state_dict['absolute_pos_embed']
+ N1, L, C1 = absolute_pos_embed.size()
+ N2, C2, H, W = model.absolute_pos_embed.size()
+ if N1 != N2 or C1 != C2 or L != H*W:
+ logger.warning("Error in loading absolute_pos_embed, pass")
+ else:
+ state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
+
+ # interpolate position bias table if needed
+ relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
+ for table_key in relative_position_bias_table_keys:
+ table_pretrained = state_dict[table_key]
+ table_current = model.state_dict()[table_key]
+ L1, nH1 = table_pretrained.size()
+ L2, nH2 = table_current.size()
+ if nH1 != nH2:
+ logger.warning(f"Error in loading {table_key}, pass")
+ else:
+ if L1 != L2:
+ S1 = int(L1 ** 0.5)
+ S2 = int(L2 ** 0.5)
+ table_pretrained_resized = F.interpolate(
+ table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
+ size=(S2, S2), mode='bicubic')
+ state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0)
+
+ # load state_dict
+ load_state_dict(model, state_dict, strict, logger)
+ return checkpoint
+
+
+def weights_to_cpu(state_dict):
+ """Copy a model state_dict to cpu.
+
+ Args:
+ state_dict (OrderedDict): Model weights on GPU.
+
+ Returns:
+        OrderedDict: Model weights on CPU.
+ """
+ state_dict_cpu = OrderedDict()
+ for key, val in state_dict.items():
+ state_dict_cpu[key] = val.cpu()
+ return state_dict_cpu
+
+
+def _save_to_state_dict(module, destination, prefix, keep_vars):
+ """Saves module state to `destination` dictionary.
+
+ This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
+
+ Args:
+ module (nn.Module): The module to generate state_dict.
+ destination (dict): A dict where state will be stored.
+ prefix (str): The prefix for parameters and buffers used in this
+ module.
+ """
+ for name, param in module._parameters.items():
+ if param is not None:
+ destination[prefix + name] = param if keep_vars else param.detach()
+ for name, buf in module._buffers.items():
+ # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
+ if buf is not None:
+ destination[prefix + name] = buf if keep_vars else buf.detach()
+
+
+def get_state_dict(module, destination=None, prefix='', keep_vars=False):
+ """Returns a dictionary containing a whole state of the module.
+
+ Both parameters and persistent buffers (e.g. running averages) are
+ included. Keys are corresponding parameter and buffer names.
+
+ This method is modified from :meth:`torch.nn.Module.state_dict` to
+ recursively check parallel module in case that the model has a complicated
+ structure, e.g., nn.Module(nn.Module(DDP)).
+
+ Args:
+ module (nn.Module): The module to generate state_dict.
+ destination (OrderedDict): Returned dict for the state of the
+ module.
+ prefix (str): Prefix of the key.
+ keep_vars (bool): Whether to keep the variable property of the
+ parameters. Default: False.
+
+ Returns:
+ dict: A dictionary containing a whole state of the module.
+ """
+ # recursively check parallel module in case that the model has a
+ # complicated structure, e.g., nn.Module(nn.Module(DDP))
+ if is_module_wrapper(module):
+ module = module.module
+
+ # below is the same as torch.nn.Module.state_dict()
+ if destination is None:
+ destination = OrderedDict()
+ destination._metadata = OrderedDict()
+ destination._metadata[prefix[:-1]] = local_metadata = dict(
+ version=module._version)
+ _save_to_state_dict(module, destination, prefix, keep_vars)
+ for name, child in module._modules.items():
+ if child is not None:
+ get_state_dict(
+ child, destination, prefix + name + '.', keep_vars=keep_vars)
+ for hook in module._state_dict_hooks.values():
+ hook_result = hook(module, destination, prefix, local_metadata)
+ if hook_result is not None:
+ destination = hook_result
+ return destination
+
+
+def save_checkpoint(model, filename, optimizer=None, meta=None):
+ """Save checkpoint to file.
+
+ The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
+ ``optimizer``. By default ``meta`` will contain version and time info.
+
+ Args:
+ model (Module): Module whose params are to be saved.
+ filename (str): Checkpoint filename.
+ optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
+ meta (dict, optional): Metadata to be saved in checkpoint.
+ """
+ if meta is None:
+ meta = {}
+ elif not isinstance(meta, dict):
+ raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
+ meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
+
+ if is_module_wrapper(model):
+ model = model.module
+
+ if hasattr(model, 'CLASSES') and model.CLASSES is not None:
+ # save class name to the meta
+ meta.update(CLASSES=model.CLASSES)
+
+ checkpoint = {
+ 'meta': meta,
+ 'state_dict': weights_to_cpu(get_state_dict(model))
+ }
+ # save optimizer state dict in the checkpoint
+ if isinstance(optimizer, Optimizer):
+ checkpoint['optimizer'] = optimizer.state_dict()
+ elif isinstance(optimizer, dict):
+ checkpoint['optimizer'] = {}
+ for name, optim in optimizer.items():
+ checkpoint['optimizer'][name] = optim.state_dict()
+
+ if filename.startswith('pavi://'):
+ try:
+ from pavi import modelcloud
+ from pavi.exception import NodeNotFoundError
+ except ImportError:
+ raise ImportError(
+ 'Please install pavi to load checkpoint from modelcloud.')
+ model_path = filename[7:]
+ root = modelcloud.Folder()
+ model_dir, model_name = osp.split(model_path)
+ try:
+ model = modelcloud.get(model_dir)
+ except NodeNotFoundError:
+ model = root.create_training_model(model_dir)
+ with TemporaryDirectory() as tmp_dir:
+ checkpoint_file = osp.join(tmp_dir, model_name)
+ with open(checkpoint_file, 'wb') as f:
+ torch.save(checkpoint, f)
+ f.flush()
+ model.create_file(checkpoint_file, name=model_name)
+ else:
+ mmcv.mkdir_or_exist(osp.dirname(filename))
+ # immediately flush buffer
+ with open(filename, 'wb') as f:
+ torch.save(checkpoint, f)
+ f.flush()
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/uniformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/uniformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef469097ac67350507f11c3c0942abf2bc5705e4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/uniformer.py
@@ -0,0 +1,421 @@
+# --------------------------------------------------------
+# UniFormer
+# Copyright (c) 2022 SenseTime X-Lab
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Kunchang Li
+# --------------------------------------------------------
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from functools import partial
+from collections import OrderedDict
+from custom_timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from custom_mmpkg.custom_mmseg.utils import get_root_logger
+from custom_mmpkg.custom_mmseg.models.builder import BACKBONES
+
+from .mmcv_custom import load_checkpoint
+
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class CMlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
+ self.act = act_layer()
+ self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class CBlock(nn.Module):
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
+ self.norm1 = nn.BatchNorm2d(dim)
+ self.conv1 = nn.Conv2d(dim, dim, 1)
+ self.conv2 = nn.Conv2d(dim, dim, 1)
+ self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = nn.BatchNorm2d(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.pos_embed(x)
+ x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x)))))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class SABlock(nn.Module):
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.pos_embed(x)
+ B, N, H, W = x.shape
+ x = x.flatten(2).transpose(1, 2)
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ x = x.transpose(1, 2).reshape(B, N, H, W)
+ return x
+
+
+def window_partition(x, window_size):
+ """
+ Args:
+ x: (B, H, W, C)
+ window_size (int): window size
+ Returns:
+ windows: (num_windows*B, window_size, window_size, C)
+ """
+ B, H, W, C = x.shape
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+ return windows
+
+
+def window_reverse(windows, window_size, H, W):
+ """
+ Args:
+ windows: (num_windows*B, window_size, window_size, C)
+ window_size (int): Window size
+ H (int): Height of image
+ W (int): Width of image
+ Returns:
+ x: (B, H, W, C)
+ """
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+ return x
+
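+# Sanity-check sketch: when H and W are multiples of window_size, window_reverse
+# exactly inverts window_partition, e.g.
+#   x = torch.randn(2, 28, 28, 64)
+#   assert torch.equal(window_reverse(window_partition(x, 14), 14, 28, 28), x)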
+
+class SABlock_Windows(nn.Module):
+ def __init__(self, dim, num_heads, window_size=14, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+        self.window_size = window_size
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.pos_embed(x)
+ x = x.permute(0, 2, 3, 1)
+ B, H, W, C = x.shape
+ shortcut = x
+ x = self.norm1(x)
+
+ pad_l = pad_t = 0
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+ _, Hp, Wp, _ = x.shape
+
+ x_windows = window_partition(x, self.window_size) # nW*B, window_size, window_size, C
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
+
+ # W-MSA/SW-MSA
+ attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C
+
+ # merge windows
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+ x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
+
+        # remove padding added for window partitioning
+ if pad_r > 0 or pad_b > 0:
+ x = x[:, :H, :W, :].contiguous()
+
+ x = shortcut + self.drop_path(x)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ x = x.permute(0, 3, 1, 2).reshape(B, C, H, W)
+ return x
+
+
+class PatchEmbed(nn.Module):
+ """ Image to Patch Embedding
+ """
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+ self.norm = nn.LayerNorm(embed_dim)
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, x):
+ B, _, H, W = x.shape
+ x = self.proj(x)
+ B, _, H, W = x.shape
+ x = x.flatten(2).transpose(1, 2)
+ x = self.norm(x)
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
+ return x
+
+
+@BACKBONES.register_module()
+class UniFormer(nn.Module):
+ """ Vision Transformer
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
+ https://arxiv.org/abs/2010.11929
+ """
+ def __init__(self, layers=[3, 4, 8, 3], img_size=224, in_chans=3, num_classes=80, embed_dim=[64, 128, 320, 512],
+ head_dim=64, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
+ drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ pretrained_path=None, use_checkpoint=False, checkpoint_num=[0, 0, 0, 0],
+ windows=False, hybrid=False, window_size=14):
+ """
+ Args:
+            layers (list): number of blocks in each stage
+ img_size (int, tuple): input image size
+ in_chans (int): number of input channels
+ num_classes (int): number of classes for classification head
+            embed_dim (list): embedding dimension of each stage
+ head_dim (int): dimension of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ qk_scale (float): override default qk scale of head_dim ** -0.5 if set
+ representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
+ drop_rate (float): dropout rate
+ attn_drop_rate (float): attention dropout rate
+ drop_path_rate (float): stochastic depth rate
+ norm_layer (nn.Module): normalization layer
+ pretrained_path (str): path of pretrained model
+            use_checkpoint (bool): whether to use gradient checkpointing
+            checkpoint_num (list): number of leading blocks per stage to run with checkpointing
+            windows (bool): whether to use window MHRA for all stage-3 blocks
+            hybrid (bool): whether to use hybrid (window + global) MHRA in stage 3
+ window_size (int): size of window (>14)
+ """
+ super().__init__()
+ self.num_classes = num_classes
+ self.use_checkpoint = use_checkpoint
+ self.checkpoint_num = checkpoint_num
+ self.windows = windows
+ print(f'Use Checkpoint: {self.use_checkpoint}')
+ print(f'Checkpoint Number: {self.checkpoint_num}')
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+
+ self.patch_embed1 = PatchEmbed(
+ img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0])
+ self.patch_embed2 = PatchEmbed(
+ img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1])
+ self.patch_embed3 = PatchEmbed(
+ img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2])
+ self.patch_embed4 = PatchEmbed(
+ img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3])
+
+ self.pos_drop = nn.Dropout(p=drop_rate)
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(layers))] # stochastic depth decay rule
+ num_heads = [dim // head_dim for dim in embed_dim]
+ self.blocks1 = nn.ModuleList([
+ CBlock(
+ dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
+ for i in range(layers[0])])
+        self.norm1 = norm_layer(embed_dim[0])
+ self.blocks2 = nn.ModuleList([
+ CBlock(
+ dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]], norm_layer=norm_layer)
+ for i in range(layers[1])])
+ self.norm2 = norm_layer(embed_dim[1])
+ if self.windows:
+ print('Use local window for all blocks in stage3')
+ self.blocks3 = nn.ModuleList([
+ SABlock_Windows(
+ dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
+ for i in range(layers[2])])
+ elif hybrid:
+ print('Use hybrid window for blocks in stage3')
+ block3 = []
+ for i in range(layers[2]):
+ if (i + 1) % 4 == 0:
+ block3.append(SABlock(
+ dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
+ else:
+ block3.append(SABlock_Windows(
+ dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
+ self.blocks3 = nn.ModuleList(block3)
+ else:
+ print('Use global window for all blocks in stage3')
+ self.blocks3 = nn.ModuleList([
+ SABlock(
+ dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
+ for i in range(layers[2])])
+ self.norm3 = norm_layer(embed_dim[2])
+ self.blocks4 = nn.ModuleList([
+ SABlock(
+ dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]+layers[2]], norm_layer=norm_layer)
+ for i in range(layers[3])])
+ self.norm4 = norm_layer(embed_dim[3])
+
+ # Representation layer
+ if representation_size:
+ self.num_features = representation_size
+ self.pre_logits = nn.Sequential(OrderedDict([
+                ('fc', nn.Linear(embed_dim[-1], representation_size)),  # embed_dim is a per-stage list
+ ('act', nn.Tanh())
+ ]))
+ else:
+ self.pre_logits = nn.Identity()
+
+ self.apply(self._init_weights)
+ self.init_weights(pretrained=pretrained_path)
+
+ def init_weights(self, pretrained):
+ if isinstance(pretrained, str):
+ logger = get_root_logger()
+ load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
+            print(f'Load pretrained model from {pretrained}')
+
+    def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'pos_embed', 'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+        self.head = nn.Linear(self.embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ out = []
+ x = self.patch_embed1(x)
+ x = self.pos_drop(x)
+ for i, blk in enumerate(self.blocks1):
+ if self.use_checkpoint and i < self.checkpoint_num[0]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm1(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
+ x = self.patch_embed2(x)
+ for i, blk in enumerate(self.blocks2):
+ if self.use_checkpoint and i < self.checkpoint_num[1]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm2(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
+ x = self.patch_embed3(x)
+ for i, blk in enumerate(self.blocks3):
+ if self.use_checkpoint and i < self.checkpoint_num[2]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm3(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
+ x = self.patch_embed4(x)
+ for i, blk in enumerate(self.blocks4):
+ if self.use_checkpoint and i < self.checkpoint_num[3]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm4(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
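+        # The four collected feature maps are at strides 4, 8, 16 and 32 of the
+        # input resolution, which is what UPerNet-style decode heads expect.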
+ return tuple(out)
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ return x
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/upernet_global_small.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/upernet_global_small.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83a498f13383a00ade4f7d717731efb73ccea49
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/uniformer/upernet_global_small.py
@@ -0,0 +1,44 @@
+_base_ = [
+ 'configs/_base_/models/upernet_uniformer.py',
+ 'configs/_base_/datasets/ade20k.py',
+ 'configs/_base_/default_runtime.py',
+ 'configs/_base_/schedules/schedule_160k.py'
+]
+
+custom_imports = dict(
+ imports=['custom_controlnet_aux.uniformer.uniformer'],
+ allow_failed_imports=False
+)
+
+model = dict(
+ backbone=dict(
+ type='UniFormer',
+ embed_dim=[64, 128, 320, 512],
+ layers=[3, 4, 8, 3],
+ head_dim=64,
+ drop_path_rate=0.25,
+ windows=False,
+ hybrid=False
+ ),
+ decode_head=dict(
+ in_channels=[64, 128, 320, 512],
+ num_classes=150
+ ),
+ auxiliary_head=dict(
+ in_channels=320,
+ num_classes=150
+ ))
+
+# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
+optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
+ paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
+ 'relative_position_bias_table': dict(decay_mult=0.),
+ 'norm': dict(decay_mult=0.)}))
+
+lr_config = dict(_delete_=True, policy='poly',
+ warmup='linear',
+ warmup_iters=1500,
+ warmup_ratio=1e-6,
+ power=1.0, min_lr=0.0, by_epoch=False)
+
+data=dict(samples_per_gpu=2)
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ee019b6912e21fde525850b1eac59e41a719e9a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/__init__.py
@@ -0,0 +1,195 @@
+import os
+import warnings
+
+import cv2
+import numpy as np
+import torch
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import resize_image_with_pad, common_input_validate, custom_hf_download, UNIMATCH_MODEL_NAME
+from .utils.flow_viz import save_vis_flow_tofile, flow_to_image
+from .unimatch.unimatch import UniMatch
+import torch.nn.functional as F
+from argparse import Namespace
+
+def inference_flow(model,
+ image1, #np array of HWC
+ image2,
+ padding_factor=8,
+ inference_size=None,
+ attn_type='swin',
+ attn_splits_list=None,
+ corr_radius_list=None,
+ prop_radius_list=None,
+ num_reg_refine=1,
+ pred_bidir_flow=False,
+ pred_bwd_flow=False,
+ fwd_bwd_consistency_check=False,
+ device="cpu",
+ **kwargs
+ ):
+ fixed_inference_size = inference_size
+ transpose_img = False
+ image1 = torch.from_numpy(image1).permute(2, 0, 1).float().unsqueeze(0).to(device)
+ image2 = torch.from_numpy(image2).permute(2, 0, 1).float().unsqueeze(0).to(device)
+
+ # the model is trained with size: width > height
+ if image1.size(-2) > image1.size(-1):
+ image1 = torch.transpose(image1, -2, -1)
+ image2 = torch.transpose(image2, -2, -1)
+ transpose_img = True
+
+ nearest_size = [int(np.ceil(image1.size(-2) / padding_factor)) * padding_factor,
+ int(np.ceil(image1.size(-1) / padding_factor)) * padding_factor]
+ # resize to nearest size or specified size
+ inference_size = nearest_size if fixed_inference_size is None else fixed_inference_size
+ assert isinstance(inference_size, list) or isinstance(inference_size, tuple)
+ ori_size = image1.shape[-2:]
+
+ # resize before inference
+ if inference_size[0] != ori_size[0] or inference_size[1] != ori_size[1]:
+ image1 = F.interpolate(image1, size=inference_size, mode='bilinear',
+ align_corners=True)
+ image2 = F.interpolate(image2, size=inference_size, mode='bilinear',
+ align_corners=True)
+ if pred_bwd_flow:
+ image1, image2 = image2, image1
+
+ results_dict = model(image1, image2,
+ attn_type=attn_type,
+ attn_splits_list=attn_splits_list,
+ corr_radius_list=corr_radius_list,
+ prop_radius_list=prop_radius_list,
+ num_reg_refine=num_reg_refine,
+ task='flow',
+ pred_bidir_flow=pred_bidir_flow,
+ )
+ flow_pr = results_dict['flow_preds'][-1] # [B, 2, H, W]
+
+ # resize back
+ if inference_size[0] != ori_size[0] or inference_size[1] != ori_size[1]:
+ flow_pr = F.interpolate(flow_pr, size=ori_size, mode='bilinear',
+ align_corners=True)
+ flow_pr[:, 0] = flow_pr[:, 0] * ori_size[-1] / inference_size[-1]
+ flow_pr[:, 1] = flow_pr[:, 1] * ori_size[-2] / inference_size[-2]
+
+ if transpose_img:
+ flow_pr = torch.transpose(flow_pr, -2, -1)
+
+ flow = flow_pr[0].permute(1, 2, 0).cpu().numpy() # [H, W, 2]
+
+ vis_image = flow_to_image(flow)
+
+ # also predict backward flow
+ if pred_bidir_flow:
+        assert flow_pr.size(0) == 2  # flow_pr: [2, 2, H, W] (forward and backward)
+ flow_bwd = flow_pr[1].permute(1, 2, 0).cpu().numpy() # [H, W, 2]
+ vis_image = flow_to_image(flow_bwd)
+ flow = flow_bwd
+ return flow, vis_image
+
+MODEL_CONFIGS = {
+ "gmflow-scale1": Namespace(
+ num_scales=1,
+ upsample_factor=8,
+
+ attn_type="swin",
+ feature_channels=128,
+ num_head=1,
+ ffn_dim_expansion=4,
+ num_transformer_layers=6,
+
+ attn_splits_list=[2],
+ corr_radius_list=[-1],
+ prop_radius_list=[-1],
+
+ reg_refine=False,
+ num_reg_refine=1
+ ),
+ "gmflow-scale2": Namespace(
+ num_scales=2,
+ upsample_factor=4,
+ padding_factor=32,
+
+ attn_type="swin",
+ feature_channels=128,
+ num_head=1,
+ ffn_dim_expansion=4,
+ num_transformer_layers=6,
+
+ attn_splits_list=[2, 8],
+ corr_radius_list=[-1, 4],
+ prop_radius_list=[-1, 1],
+
+ reg_refine=False,
+ num_reg_refine=1
+ ),
+ "gmflow-scale2-regrefine6": Namespace(
+ num_scales=2,
+ upsample_factor=4,
+ padding_factor=32,
+
+ attn_type="swin",
+ feature_channels=128,
+ num_head=1,
+ ffn_dim_expansion=4,
+ num_transformer_layers=6,
+
+ attn_splits_list=[2, 8],
+ corr_radius_list=[-1, 4],
+ prop_radius_list=[-1, 1],
+
+ reg_refine=True,
+ num_reg_refine=6
+ )
+}
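+
+# Note: from_pretrained below picks the config whose key appears in the checkpoint
+# filename (keys are checked in reverse declaration order, so the most specific
+# name matches first); e.g. the default "gmflow-scale2-regrefine6-mixdata.pth"
+# maps to MODEL_CONFIGS["gmflow-scale2-regrefine6"].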
+
+class UnimatchDetector:
+ def __init__(self, unimatch, config_args):
+ self.unimatch = unimatch
+ self.config_args = config_args
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=UNIMATCH_MODEL_NAME, filename="gmflow-scale2-regrefine6-mixdata.pth"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+ config_args = None
+ for key in list(MODEL_CONFIGS.keys())[::-1]:
+ if key in filename:
+ config_args = MODEL_CONFIGS[key]
+ break
+ assert config_args, f"Couldn't find hardcoded Unimatch config for {filename}"
+
+ model = UniMatch(feature_channels=config_args.feature_channels,
+ num_scales=config_args.num_scales,
+ upsample_factor=config_args.upsample_factor,
+ num_head=config_args.num_head,
+ ffn_dim_expansion=config_args.ffn_dim_expansion,
+ num_transformer_layers=config_args.num_transformer_layers,
+ reg_refine=config_args.reg_refine,
+ task='flow')
+
+ sd = torch.load(model_path, map_location="cpu")
+ model.load_state_dict(sd['model'])
+ return cls(model, config_args)
+
+ def to(self, device):
+ self.unimatch.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, image1, image2, detect_resolution=512, output_type="pil", upscale_method="INTER_CUBIC", pred_bwd_flow=False, pred_bidir_flow=False, **kwargs):
+ assert image1.shape == image2.shape, f"[Unimatch] image1 and image2 must have the same size, got {image1.shape} and {image2.shape}"
+
+ image1, output_type = common_input_validate(image1, output_type, **kwargs)
+ #image1, remove_pad = resize_image_with_pad(image1, detect_resolution, upscale_method)
+ image2, output_type = common_input_validate(image2, output_type, **kwargs)
+ #image2, remove_pad = resize_image_with_pad(image2, detect_resolution, upscale_method)
+ with torch.no_grad():
+ flow, vis_image = inference_flow(self.unimatch, image1, image2, device=self.device, pred_bwd_flow=pred_bwd_flow, pred_bidir_flow=pred_bidir_flow, **vars(self.config_args))
+
+ if output_type == "pil":
+ vis_image = Image.fromarray(vis_image)
+
+ return flow, vis_image
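+
+
+# Minimal usage sketch (frame paths and device are illustrative; the two inputs
+# must be same-sized HWC RGB numpy arrays):
+#   detector = UnimatchDetector.from_pretrained().to("cuda")
+#   flow, vis = detector(np.array(Image.open("frame1.png").convert("RGB")),
+#                        np.array(Image.open("frame2.png").convert("RGB")))
+#   vis.save("flow_vis.png")  # vis is a PIL image when output_type == "pil"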
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/attention.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..a10f75890d14ac41f0b6a391adcc6ed32536f375
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/attention.py
@@ -0,0 +1,253 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .utils import split_feature, merge_splits, split_feature_1d, merge_splits_1d
+
+
+def single_head_full_attention(q, k, v):
+ # q, k, v: [B, L, C]
+ assert q.dim() == k.dim() == v.dim() == 3
+
+ scores = torch.matmul(q, k.permute(0, 2, 1)) / (q.size(2) ** .5) # [B, L, L]
+ attn = torch.softmax(scores, dim=2) # [B, L, L]
+ out = torch.matmul(attn, v) # [B, L, C]
+
+ return out
+
+
+def single_head_full_attention_1d(q, k, v,
+ h=None,
+ w=None,
+ ):
+ # q, k, v: [B, L, C]
+
+ assert h is not None and w is not None
+ assert q.size(1) == h * w
+
+ b, _, c = q.size()
+
+ q = q.view(b, h, w, c) # [B, H, W, C]
+ k = k.view(b, h, w, c)
+ v = v.view(b, h, w, c)
+
+ scale_factor = c ** 0.5
+
+ scores = torch.matmul(q, k.permute(0, 1, 3, 2)) / scale_factor # [B, H, W, W]
+
+ attn = torch.softmax(scores, dim=-1)
+
+ out = torch.matmul(attn, v).view(b, -1, c) # [B, H*W, C]
+
+ return out
+
+
+def single_head_split_window_attention(q, k, v,
+ num_splits=1,
+ with_shift=False,
+ h=None,
+ w=None,
+ attn_mask=None,
+ ):
+ # ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
+ # q, k, v: [B, L, C]
+ assert q.dim() == k.dim() == v.dim() == 3
+
+ assert h is not None and w is not None
+ assert q.size(1) == h * w
+
+ b, _, c = q.size()
+
+ b_new = b * num_splits * num_splits
+
+ window_size_h = h // num_splits
+ window_size_w = w // num_splits
+
+ q = q.view(b, h, w, c) # [B, H, W, C]
+ k = k.view(b, h, w, c)
+ v = v.view(b, h, w, c)
+
+ scale_factor = c ** 0.5
+
+ if with_shift:
+ assert attn_mask is not None # compute once
+ shift_size_h = window_size_h // 2
+ shift_size_w = window_size_w // 2
+
+ q = torch.roll(q, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
+ k = torch.roll(k, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
+ v = torch.roll(v, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
+
+ q = split_feature(q, num_splits=num_splits, channel_last=True) # [B*K*K, H/K, W/K, C]
+ k = split_feature(k, num_splits=num_splits, channel_last=True)
+ v = split_feature(v, num_splits=num_splits, channel_last=True)
+
+ scores = torch.matmul(q.view(b_new, -1, c), k.view(b_new, -1, c).permute(0, 2, 1)
+ ) / scale_factor # [B*K*K, H/K*W/K, H/K*W/K]
+
+ if with_shift:
+ scores += attn_mask.repeat(b, 1, 1)
+
+ attn = torch.softmax(scores, dim=-1)
+
+ out = torch.matmul(attn, v.view(b_new, -1, c)) # [B*K*K, H/K*W/K, C]
+
+ out = merge_splits(out.view(b_new, h // num_splits, w // num_splits, c),
+ num_splits=num_splits, channel_last=True) # [B, H, W, C]
+
+ # shift back
+ if with_shift:
+ out = torch.roll(out, shifts=(shift_size_h, shift_size_w), dims=(1, 2))
+
+ out = out.view(b, -1, c)
+
+ return out
+
+
+def single_head_split_window_attention_1d(q, k, v,
+ relative_position_bias=None,
+ num_splits=1,
+ with_shift=False,
+ h=None,
+ w=None,
+ attn_mask=None,
+ ):
+ # q, k, v: [B, L, C]
+
+ assert h is not None and w is not None
+ assert q.size(1) == h * w
+
+ b, _, c = q.size()
+
+ b_new = b * num_splits * h
+
+ window_size_w = w // num_splits
+
+ q = q.view(b * h, w, c) # [B*H, W, C]
+ k = k.view(b * h, w, c)
+ v = v.view(b * h, w, c)
+
+ scale_factor = c ** 0.5
+
+ if with_shift:
+ assert attn_mask is not None # compute once
+ shift_size_w = window_size_w // 2
+
+ q = torch.roll(q, shifts=-shift_size_w, dims=1)
+ k = torch.roll(k, shifts=-shift_size_w, dims=1)
+ v = torch.roll(v, shifts=-shift_size_w, dims=1)
+
+ q = split_feature_1d(q, num_splits=num_splits) # [B*H*K, W/K, C]
+ k = split_feature_1d(k, num_splits=num_splits)
+ v = split_feature_1d(v, num_splits=num_splits)
+
+ scores = torch.matmul(q.view(b_new, -1, c), k.view(b_new, -1, c).permute(0, 2, 1)
+ ) / scale_factor # [B*H*K, W/K, W/K]
+
+ if with_shift:
+ # attn_mask: [K, W/K, W/K]
+ scores += attn_mask.repeat(b * h, 1, 1) # [B*H*K, W/K, W/K]
+
+ attn = torch.softmax(scores, dim=-1)
+
+ out = torch.matmul(attn, v.view(b_new, -1, c)) # [B*H*K, W/K, C]
+
+ out = merge_splits_1d(out, h, num_splits=num_splits) # [B, H, W, C]
+
+ # shift back
+ if with_shift:
+ out = torch.roll(out, shifts=shift_size_w, dims=2)
+
+ out = out.view(b, -1, c)
+
+ return out
+
+
+class SelfAttnPropagation(nn.Module):
+ """
+ flow propagation with self-attention on feature
+ query: feature0, key: feature0, value: flow
+ """
+
+ def __init__(self, in_channels,
+ **kwargs,
+ ):
+ super(SelfAttnPropagation, self).__init__()
+
+ self.q_proj = nn.Linear(in_channels, in_channels)
+ self.k_proj = nn.Linear(in_channels, in_channels)
+
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+
+ def forward(self, feature0, flow,
+ local_window_attn=False,
+ local_window_radius=1,
+ **kwargs,
+ ):
+ # q, k: feature [B, C, H, W], v: flow [B, 2, H, W]
+ if local_window_attn:
+ return self.forward_local_window_attn(feature0, flow,
+ local_window_radius=local_window_radius)
+
+ b, c, h, w = feature0.size()
+
+ query = feature0.view(b, c, h * w).permute(0, 2, 1) # [B, H*W, C]
+
+        # NOTE: strictly speaking, the key should be projected from the raw query
+        # (``key = self.k_proj(raw_query)``) instead of from the already projected
+        # query. Because both projections are linear, the two matrices can be
+        # merged, so this does not affect results; it is kept as-is to stay
+        # compatible with the released pretrained weights.
+ query = self.q_proj(query) # [B, H*W, C]
+ key = self.k_proj(query) # [B, H*W, C]
+
+ value = flow.view(b, flow.size(1), h * w).permute(0, 2, 1) # [B, H*W, 2]
+
+ scores = torch.matmul(query, key.permute(0, 2, 1)) / (c ** 0.5) # [B, H*W, H*W]
+ prob = torch.softmax(scores, dim=-1)
+
+ out = torch.matmul(prob, value) # [B, H*W, 2]
+ out = out.view(b, h, w, value.size(-1)).permute(0, 3, 1, 2) # [B, 2, H, W]
+
+ return out
+
+ def forward_local_window_attn(self, feature0, flow,
+ local_window_radius=1,
+ ):
+ assert flow.size(1) == 2 or flow.size(1) == 1 # flow or disparity or depth
+ assert local_window_radius > 0
+
+ b, c, h, w = feature0.size()
+
+ value_channel = flow.size(1)
+
+ feature0_reshape = self.q_proj(feature0.view(b, c, -1).permute(0, 2, 1)
+ ).reshape(b * h * w, 1, c) # [B*H*W, 1, C]
+
+ kernel_size = 2 * local_window_radius + 1
+
+ feature0_proj = self.k_proj(feature0.view(b, c, -1).permute(0, 2, 1)).permute(0, 2, 1).reshape(b, c, h, w)
+
+ feature0_window = F.unfold(feature0_proj, kernel_size=kernel_size,
+ padding=local_window_radius) # [B, C*(2R+1)^2), H*W]
+
+ feature0_window = feature0_window.view(b, c, kernel_size ** 2, h, w).permute(
+ 0, 3, 4, 1, 2).reshape(b * h * w, c, kernel_size ** 2) # [B*H*W, C, (2R+1)^2]
+
+ flow_window = F.unfold(flow, kernel_size=kernel_size,
+ padding=local_window_radius) # [B, 2*(2R+1)^2), H*W]
+
+ flow_window = flow_window.view(b, value_channel, kernel_size ** 2, h, w).permute(
+ 0, 3, 4, 2, 1).reshape(b * h * w, kernel_size ** 2, value_channel) # [B*H*W, (2R+1)^2, 2]
+
+ scores = torch.matmul(feature0_reshape, feature0_window) / (c ** 0.5) # [B*H*W, 1, (2R+1)^2]
+
+ prob = torch.softmax(scores, dim=-1)
+
+ out = torch.matmul(prob, flow_window).view(b, h, w, value_channel
+ ).permute(0, 3, 1, 2).contiguous() # [B, 2, H, W]
+
+ return out
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/backbone.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5c92b7d8698a41d11b29f084b3ab4953dd2a7bd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/backbone.py
@@ -0,0 +1,117 @@
+import torch.nn as nn
+
+from .trident_conv import MultiScaleTridentConv
+
+
+class ResidualBlock(nn.Module):
+ def __init__(self, in_planes, planes, norm_layer=nn.InstanceNorm2d, stride=1, dilation=1,
+ ):
+ super(ResidualBlock, self).__init__()
+
+ self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
+ dilation=dilation, padding=dilation, stride=stride, bias=False)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+ dilation=dilation, padding=dilation, bias=False)
+ self.relu = nn.ReLU(inplace=True)
+
+ self.norm1 = norm_layer(planes)
+ self.norm2 = norm_layer(planes)
+ if not stride == 1 or in_planes != planes:
+ self.norm3 = norm_layer(planes)
+
+ if stride == 1 and in_planes == planes:
+ self.downsample = None
+ else:
+ self.downsample = nn.Sequential(
+ nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
+
+ def forward(self, x):
+ y = x
+ y = self.relu(self.norm1(self.conv1(y)))
+ y = self.relu(self.norm2(self.conv2(y)))
+
+ if self.downsample is not None:
+ x = self.downsample(x)
+
+ return self.relu(x + y)
+
+
+class CNNEncoder(nn.Module):
+ def __init__(self, output_dim=128,
+ norm_layer=nn.InstanceNorm2d,
+ num_output_scales=1,
+ **kwargs,
+ ):
+ super(CNNEncoder, self).__init__()
+ self.num_branch = num_output_scales
+
+ feature_dims = [64, 96, 128]
+
+ self.conv1 = nn.Conv2d(3, feature_dims[0], kernel_size=7, stride=2, padding=3, bias=False) # 1/2
+ self.norm1 = norm_layer(feature_dims[0])
+ self.relu1 = nn.ReLU(inplace=True)
+
+ self.in_planes = feature_dims[0]
+ self.layer1 = self._make_layer(feature_dims[0], stride=1, norm_layer=norm_layer) # 1/2
+ self.layer2 = self._make_layer(feature_dims[1], stride=2, norm_layer=norm_layer) # 1/4
+
+ # highest resolution 1/4 or 1/8
+ stride = 2 if num_output_scales == 1 else 1
+ self.layer3 = self._make_layer(feature_dims[2], stride=stride,
+ norm_layer=norm_layer,
+ ) # 1/4 or 1/8
+
+ self.conv2 = nn.Conv2d(feature_dims[2], output_dim, 1, 1, 0)
+
+ if self.num_branch > 1:
+ if self.num_branch == 4:
+ strides = (1, 2, 4, 8)
+ elif self.num_branch == 3:
+ strides = (1, 2, 4)
+ elif self.num_branch == 2:
+ strides = (1, 2)
+ else:
+ raise ValueError
+
+ self.trident_conv = MultiScaleTridentConv(output_dim, output_dim,
+ kernel_size=3,
+ strides=strides,
+ paddings=1,
+ num_branch=self.num_branch,
+ )
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+ elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
+ if m.weight is not None:
+ nn.init.constant_(m.weight, 1)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+ def _make_layer(self, dim, stride=1, dilation=1, norm_layer=nn.InstanceNorm2d):
+ layer1 = ResidualBlock(self.in_planes, dim, norm_layer=norm_layer, stride=stride, dilation=dilation)
+ layer2 = ResidualBlock(dim, dim, norm_layer=norm_layer, stride=1, dilation=dilation)
+
+ layers = (layer1, layer2)
+
+ self.in_planes = dim
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.norm1(x)
+ x = self.relu1(x)
+
+ x = self.layer1(x) # 1/2
+ x = self.layer2(x) # 1/4
+ x = self.layer3(x) # 1/8 or 1/4
+
+ x = self.conv2(x)
+
+ if self.num_branch > 1:
+ out = self.trident_conv([x] * self.num_branch) # high to low res
+ else:
+ out = [x]
+
+ return out
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/geometry.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..42231426d998d8556e53eaee249efe3e946a5bb0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/geometry.py
@@ -0,0 +1,195 @@
+import torch
+import torch.nn.functional as F
+
+
+def coords_grid(b, h, w, homogeneous=False, device=None):
+ y, x = torch.meshgrid(torch.arange(h), torch.arange(w)) # [H, W]
+
+ stacks = [x, y]
+
+ if homogeneous:
+ ones = torch.ones_like(x) # [H, W]
+ stacks.append(ones)
+
+ grid = torch.stack(stacks, dim=0).float() # [2, H, W] or [3, H, W]
+
+ grid = grid[None].repeat(b, 1, 1, 1) # [B, 2, H, W] or [B, 3, H, W]
+
+ if device is not None:
+ grid = grid.to(device)
+
+ return grid
+
+
+def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
+ assert device is not None
+
+ x, y = torch.meshgrid([torch.linspace(w_min, w_max, len_w, device=device),
+ torch.linspace(h_min, h_max, len_h, device=device)],
+ )
+ grid = torch.stack((x, y), -1).transpose(0, 1).float() # [H, W, 2]
+
+ return grid
+
+
+def normalize_coords(coords, h, w):
+ # coords: [B, H, W, 2]
+ c = torch.Tensor([(w - 1) / 2., (h - 1) / 2.]).float().to(coords.device)
+ return (coords - c) / c # [-1, 1]
+
+
+def bilinear_sample(img, sample_coords, mode='bilinear', padding_mode='zeros', return_mask=False):
+ # img: [B, C, H, W]
+ # sample_coords: [B, 2, H, W] in image scale
+ if sample_coords.size(1) != 2: # [B, H, W, 2]
+ sample_coords = sample_coords.permute(0, 3, 1, 2)
+
+ b, _, h, w = sample_coords.shape
+
+ # Normalize to [-1, 1]
+ x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1
+ y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1
+
+ grid = torch.stack([x_grid, y_grid], dim=-1) # [B, H, W, 2]
+
+ img = F.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+
+ if return_mask:
+ mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1) # [B, H, W]
+
+ return img, mask
+
+ return img
+
+
+def flow_warp(feature, flow, mask=False, padding_mode='zeros'):
+ b, c, h, w = feature.size()
+ assert flow.size(1) == 2
+
+ grid = coords_grid(b, h, w).to(flow.device) + flow # [B, 2, H, W]
+
+ return bilinear_sample(feature, grid, padding_mode=padding_mode,
+ return_mask=mask)
+
+
+def forward_backward_consistency_check(fwd_flow, bwd_flow,
+ alpha=0.01,
+ beta=0.5
+ ):
+ # fwd_flow, bwd_flow: [B, 2, H, W]
+ # alpha and beta values are following UnFlow (https://arxiv.org/abs/1711.07837)
+ assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
+ assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2
+ flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1) # [B, H, W]
+
+ warped_bwd_flow = flow_warp(bwd_flow, fwd_flow) # [B, 2, H, W]
+ warped_fwd_flow = flow_warp(fwd_flow, bwd_flow) # [B, 2, H, W]
+
+ diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1) # [B, H, W]
+ diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1)
+
+ threshold = alpha * flow_mag + beta
+
+ fwd_occ = (diff_fwd > threshold).float() # [B, H, W]
+ bwd_occ = (diff_bwd > threshold).float()
+
+ return fwd_occ, bwd_occ
+
+
+def back_project(depth, intrinsics):
+ # Back project 2D pixel coords to 3D points
+ # depth: [B, H, W]
+ # intrinsics: [B, 3, 3]
+ b, h, w = depth.shape
+ grid = coords_grid(b, h, w, homogeneous=True, device=depth.device) # [B, 3, H, W]
+
+ intrinsics_inv = torch.inverse(intrinsics) # [B, 3, 3]
+
+ points = intrinsics_inv.bmm(grid.view(b, 3, -1)).view(b, 3, h, w) * depth.unsqueeze(1) # [B, 3, H, W]
+
+ return points
+
+
+def camera_transform(points_ref, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None):
+ # Transform 3D points from reference camera to target camera
+ # points_ref: [B, 3, H, W]
+ # extrinsics_ref: [B, 4, 4]
+ # extrinsics_tgt: [B, 4, 4]
+ # extrinsics_rel: [B, 4, 4], relative pose transform
+ b, _, h, w = points_ref.shape
+
+ if extrinsics_rel is None:
+ extrinsics_rel = torch.bmm(extrinsics_tgt, torch.inverse(extrinsics_ref)) # [B, 4, 4]
+
+ points_tgt = torch.bmm(extrinsics_rel[:, :3, :3],
+ points_ref.view(b, 3, -1)) + extrinsics_rel[:, :3, -1:] # [B, 3, H*W]
+
+ points_tgt = points_tgt.view(b, 3, h, w) # [B, 3, H, W]
+
+ return points_tgt
+
+
+def reproject(points_tgt, intrinsics, return_mask=False):
+ # reproject to target view
+ # points_tgt: [B, 3, H, W]
+ # intrinsics: [B, 3, 3]
+
+ b, _, h, w = points_tgt.shape
+
+ proj_points = torch.bmm(intrinsics, points_tgt.view(b, 3, -1)).view(b, 3, h, w) # [B, 3, H, W]
+
+ X = proj_points[:, 0]
+ Y = proj_points[:, 1]
+ Z = proj_points[:, 2].clamp(min=1e-3)
+
+ pixel_coords = torch.stack([X / Z, Y / Z], dim=1).view(b, 2, h, w) # [B, 2, H, W] in image scale
+
+ if return_mask:
+ # valid mask in pixel space
+ mask = (pixel_coords[:, 0] >= 0) & (pixel_coords[:, 0] <= (w - 1)) & (
+ pixel_coords[:, 1] >= 0) & (pixel_coords[:, 1] <= (h - 1)) # [B, H, W]
+
+ return pixel_coords, mask
+
+ return pixel_coords
+
+
+def reproject_coords(depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None,
+ return_mask=False):
+ # Compute reprojection sample coords
+ points_ref = back_project(depth_ref, intrinsics) # [B, 3, H, W]
+ points_tgt = camera_transform(points_ref, extrinsics_ref, extrinsics_tgt, extrinsics_rel=extrinsics_rel)
+
+ if return_mask:
+ reproj_coords, mask = reproject(points_tgt, intrinsics,
+ return_mask=return_mask) # [B, 2, H, W] in image scale
+
+ return reproj_coords, mask
+
+ reproj_coords = reproject(points_tgt, intrinsics,
+ return_mask=return_mask) # [B, 2, H, W] in image scale
+
+ return reproj_coords
+
+
+def compute_flow_with_depth_pose(depth_ref, intrinsics,
+ extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None,
+ return_mask=False):
+ b, h, w = depth_ref.shape
+ coords_init = coords_grid(b, h, w, device=depth_ref.device) # [B, 2, H, W]
+
+ if return_mask:
+ reproj_coords, mask = reproject_coords(depth_ref, intrinsics, extrinsics_ref, extrinsics_tgt,
+ extrinsics_rel=extrinsics_rel,
+ return_mask=return_mask) # [B, 2, H, W]
+ rigid_flow = reproj_coords - coords_init
+
+ return rigid_flow, mask
+
+ reproj_coords = reproject_coords(depth_ref, intrinsics, extrinsics_ref, extrinsics_tgt,
+ extrinsics_rel=extrinsics_rel,
+ return_mask=return_mask) # [B, 2, H, W]
+
+ rigid_flow = reproj_coords - coords_init
+
+ return rigid_flow
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/matching.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/matching.py
new file mode 100644
index 0000000000000000000000000000000000000000..647102547901d7c7a611371ca57061268ba0ad58
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/matching.py
@@ -0,0 +1,279 @@
+import torch
+import torch.nn.functional as F
+
+from .geometry import coords_grid, generate_window_grid, normalize_coords
+
+
+def global_correlation_softmax(feature0, feature1,
+ pred_bidir_flow=False,
+ ):
+ # global correlation
+ b, c, h, w = feature0.shape
+ feature0 = feature0.view(b, c, -1).permute(0, 2, 1) # [B, H*W, C]
+ feature1 = feature1.view(b, c, -1) # [B, C, H*W]
+
+ correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (c ** 0.5) # [B, H, W, H, W]
+
+ # flow from softmax
+ init_grid = coords_grid(b, h, w).to(correlation.device) # [B, 2, H, W]
+ grid = init_grid.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]
+
+ correlation = correlation.view(b, h * w, h * w) # [B, H*W, H*W]
+
+ if pred_bidir_flow:
+ correlation = torch.cat((correlation, correlation.permute(0, 2, 1)), dim=0) # [2*B, H*W, H*W]
+ init_grid = init_grid.repeat(2, 1, 1, 1) # [2*B, 2, H, W]
+ grid = grid.repeat(2, 1, 1) # [2*B, H*W, 2]
+ b = b * 2
+
+ prob = F.softmax(correlation, dim=-1) # [B, H*W, H*W]
+
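+    # Soft-argmax: correspondence below is the probability-weighted average of all
+    # candidate target coordinates, keeping the matching step differentiable.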
+ correspondence = torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2) # [B, 2, H, W]
+
+ # when predicting bidirectional flow, flow is the concatenation of forward flow and backward flow
+ flow = correspondence - init_grid
+
+ return flow, prob
+
+
+def local_correlation_softmax(feature0, feature1, local_radius,
+ padding_mode='zeros',
+ ):
+ b, c, h, w = feature0.size()
+ coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
+ coords = coords_init.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]
+
+ local_h = 2 * local_radius + 1
+ local_w = 2 * local_radius + 1
+
+ window_grid = generate_window_grid(-local_radius, local_radius,
+ -local_radius, local_radius,
+ local_h, local_w, device=feature0.device) # [2R+1, 2R+1, 2]
+ window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1)^2, 2]
+ sample_coords = coords.unsqueeze(-2) + window_grid # [B, H*W, (2R+1)^2, 2]
+
+ sample_coords_softmax = sample_coords
+
+ # exclude coords that are out of image space
+ valid_x = (sample_coords[:, :, :, 0] >= 0) & (sample_coords[:, :, :, 0] < w) # [B, H*W, (2R+1)^2]
+ valid_y = (sample_coords[:, :, :, 1] >= 0) & (sample_coords[:, :, :, 1] < h) # [B, H*W, (2R+1)^2]
+
+ valid = valid_x & valid_y # [B, H*W, (2R+1)^2], used to mask out invalid values when softmax
+
+ # normalize coordinates to [-1, 1]
+ sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
+ window_feature = F.grid_sample(feature1, sample_coords_norm,
+ padding_mode=padding_mode, align_corners=True
+ ).permute(0, 2, 1, 3) # [B, H*W, C, (2R+1)^2]
+ feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c) # [B, H*W, 1, C]
+
+ corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c ** 0.5) # [B, H*W, (2R+1)^2]
+
+ # mask invalid locations
+ corr[~valid] = -1e9
+
+ prob = F.softmax(corr, -1) # [B, H*W, (2R+1)^2]
+
+ correspondence = torch.matmul(prob.unsqueeze(-2), sample_coords_softmax).squeeze(-2).view(
+ b, h, w, 2).permute(0, 3, 1, 2) # [B, 2, H, W]
+
+ flow = correspondence - coords_init
+ match_prob = prob
+
+ return flow, match_prob
+
+
+def local_correlation_with_flow(feature0, feature1,
+ flow,
+ local_radius,
+ padding_mode='zeros',
+ dilation=1,
+ ):
+ b, c, h, w = feature0.size()
+ coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
+ coords = coords_init.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]
+
+ local_h = 2 * local_radius + 1
+ local_w = 2 * local_radius + 1
+
+ window_grid = generate_window_grid(-local_radius, local_radius,
+ -local_radius, local_radius,
+ local_h, local_w, device=feature0.device) # [2R+1, 2R+1, 2]
+ window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1)^2, 2]
+ sample_coords = coords.unsqueeze(-2) + window_grid * dilation # [B, H*W, (2R+1)^2, 2]
+
+ # flow can be zero when using features after transformer
+ if not isinstance(flow, float):
+ sample_coords = sample_coords + flow.view(
+ b, 2, -1).permute(0, 2, 1).unsqueeze(-2) # [B, H*W, (2R+1)^2, 2]
+ else:
+ assert flow == 0.
+
+ # normalize coordinates to [-1, 1]
+ sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
+ window_feature = F.grid_sample(feature1, sample_coords_norm,
+ padding_mode=padding_mode, align_corners=True
+ ).permute(0, 2, 1, 3) # [B, H*W, C, (2R+1)^2]
+ feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c) # [B, H*W, 1, C]
+
+ corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c ** 0.5) # [B, H*W, (2R+1)^2]
+
+ corr = corr.view(b, h, w, -1).permute(0, 3, 1, 2).contiguous() # [B, (2R+1)^2, H, W]
+
+ return corr
+
+
+def global_correlation_softmax_stereo(feature0, feature1,
+ ):
+ # global correlation on horizontal direction
+ b, c, h, w = feature0.shape
+
+ x_grid = torch.linspace(0, w - 1, w, device=feature0.device) # [W]
+
+ feature0 = feature0.permute(0, 2, 3, 1) # [B, H, W, C]
+ feature1 = feature1.permute(0, 2, 1, 3) # [B, H, C, W]
+
+ correlation = torch.matmul(feature0, feature1) / (c ** 0.5) # [B, H, W, W]
+
+ # mask subsequent positions to make disparity positive
+ mask = torch.triu(torch.ones((w, w)), diagonal=1).type_as(feature0) # [W, W]
+ valid_mask = (mask == 0).unsqueeze(0).unsqueeze(0).repeat(b, h, 1, 1) # [B, H, W, W]
+
+ correlation[~valid_mask] = -1e9
+
+ prob = F.softmax(correlation, dim=-1) # [B, H, W, W]
+
+ correspondence = (x_grid.view(1, 1, 1, w) * prob).sum(-1) # [B, H, W]
+
+ # NOTE: unlike flow, disparity is typically positive
+ disparity = x_grid.view(1, 1, w).repeat(b, h, 1) - correspondence # [B, H, W]
+
+ return disparity.unsqueeze(1), prob # feature resolution
+
+
+def local_correlation_softmax_stereo(feature0, feature1, local_radius,
+ ):
+ b, c, h, w = feature0.size()
+ coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
+ coords = coords_init.view(b, 2, -1).permute(0, 2, 1).contiguous() # [B, H*W, 2]
+
+ local_h = 1
+ local_w = 2 * local_radius + 1
+
+ window_grid = generate_window_grid(0, 0,
+ -local_radius, local_radius,
+ local_h, local_w, device=feature0.device) # [1, 2R+1, 2]
+ window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1), 2]
+ sample_coords = coords.unsqueeze(-2) + window_grid # [B, H*W, (2R+1), 2]
+
+ sample_coords_softmax = sample_coords
+
+ # exclude coords that are out of image space
+ valid_x = (sample_coords[:, :, :, 0] >= 0) & (sample_coords[:, :, :, 0] < w) # [B, H*W, (2R+1)]
+ valid_y = (sample_coords[:, :, :, 1] >= 0) & (sample_coords[:, :, :, 1] < h) # [B, H*W, (2R+1)]
+
+ valid = valid_x & valid_y # [B, H*W, (2R+1)], used to mask out invalid locations before the softmax
+
+ # normalize coordinates to [-1, 1]
+ sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
+ window_feature = F.grid_sample(feature1, sample_coords_norm,
+ padding_mode='zeros', align_corners=True
+ ).permute(0, 2, 1, 3) # [B, H*W, C, (2R+1)]
+ feature0_view = feature0.permute(0, 2, 3, 1).contiguous().view(b, h * w, 1, c) # [B, H*W, 1, C]
+
+ corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c ** 0.5) # [B, H*W, (2R+1)]
+
+ # mask invalid locations
+ corr[~valid] = -1e9
+
+ prob = F.softmax(corr, -1) # [B, H*W, (2R+1)]
+
+ correspondence = torch.matmul(prob.unsqueeze(-2),
+ sample_coords_softmax).squeeze(-2).view(
+ b, h, w, 2).permute(0, 3, 1, 2).contiguous() # [B, 2, H, W]
+
+ flow = correspondence - coords_init # flow at feature resolution
+ match_prob = prob
+
+ flow_x = -flow[:, :1] # [B, 1, H, W]
+
+ return flow_x, match_prob
+
+
+def correlation_softmax_depth(feature0, feature1,
+ intrinsics,
+ pose,
+ depth_candidates,
+ depth_from_argmax=False,
+ pred_bidir_depth=False,
+ ):
+ b, c, h, w = feature0.size()
+ assert depth_candidates.dim() == 4 # [B, D, H, W]
+ scale_factor = c ** 0.5
+
+ if pred_bidir_depth:
+ feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)
+ intrinsics = intrinsics.repeat(2, 1, 1)
+ pose = torch.cat((pose, torch.inverse(pose)), dim=0)
+ depth_candidates = depth_candidates.repeat(2, 1, 1, 1)
+
+ # depth candidates are actually inverse depth
+ warped_feature1 = warp_with_pose_depth_candidates(feature1, intrinsics, pose,
+ 1. / depth_candidates,
+ ) # [B, C, D, H, W]
+
+ correlation = (feature0.unsqueeze(2) * warped_feature1).sum(1) / scale_factor # [B, D, H, W]
+
+ match_prob = F.softmax(correlation, dim=1) # [B, D, H, W]
+
+ # for cross-task transfer (flow -> depth), extract depth with argmax at test time
+ if depth_from_argmax:
+ index = torch.argmax(match_prob, dim=1, keepdim=True)
+ depth = torch.gather(depth_candidates, dim=1, index=index)
+ else:
+ depth = (match_prob * depth_candidates).sum(dim=1, keepdim=True) # [B, 1, H, W]
+
+ return depth, match_prob
+
+
+def warp_with_pose_depth_candidates(feature1, intrinsics, pose, depth,
+ clamp_min_depth=1e-3,
+ ):
+ """
+ feature1: [B, C, H, W]
+ intrinsics: [B, 3, 3]
+ pose: [B, 4, 4]
+ depth: [B, D, H, W]
+ """
+
+ assert intrinsics.size(1) == intrinsics.size(2) == 3
+ assert pose.size(1) == pose.size(2) == 4
+ assert depth.dim() == 4
+
+ b, d, h, w = depth.size()
+ c = feature1.size(1)
+
+ with torch.no_grad():
+ # pixel coordinates
+ grid = coords_grid(b, h, w, homogeneous=True, device=depth.device) # [B, 3, H, W]
+ # back project to 3D and transform viewpoint
+ points = torch.inverse(intrinsics).bmm(grid.view(b, 3, -1)) # [B, 3, H*W]
+ points = torch.bmm(pose[:, :3, :3], points).unsqueeze(2).repeat(
+ 1, 1, d, 1) * depth.view(b, 1, d, h * w) # [B, 3, D, H*W]
+ points = points + pose[:, :3, -1:].unsqueeze(-1) # [B, 3, D, H*W]
+ # reproject to 2D image plane
+ points = torch.bmm(intrinsics, points.view(b, 3, -1)).view(b, 3, d, h * w) # [B, 3, D, H*W]
+ pixel_coords = points[:, :2] / points[:, -1:].clamp(min=clamp_min_depth) # [B, 2, D, H*W]
+
+ # normalize to [-1, 1]
+ x_grid = 2 * pixel_coords[:, 0] / (w - 1) - 1
+ y_grid = 2 * pixel_coords[:, 1] / (h - 1) - 1
+
+ grid = torch.stack([x_grid, y_grid], dim=-1) # [B, D, H*W, 2]
+
+ # sample features
+ warped_feature = F.grid_sample(feature1, grid.view(b, d * h, w, 2), mode='bilinear',
+ padding_mode='zeros',
+ align_corners=True).view(b, c, d, h, w) # [B, C, D, H, W]
+
+ return warped_feature
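# --- Usage sketch (illustrative) ------------------------------------------------
# A minimal shape check for the local matching path defined above. The import path
# is inferred from the file location in this repo and may differ in an installed
# package; tensor sizes are illustrative only.
import torch
from custom_controlnet_aux.unimatch.unimatch.matching import local_correlation_softmax

b, c, h, w = 2, 128, 32, 48
feature0 = torch.randn(b, c, h, w)
feature1 = torch.randn(b, c, h, w)

# local matching inside a (2R+1) x (2R+1) window, here R = 4
flow, match_prob = local_correlation_softmax(feature0, feature1, local_radius=4)

assert flow.shape == (b, 2, h, w)          # dense flow at feature resolution
assert match_prob.shape == (b, h * w, 81)  # softmax over the 9 x 9 window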
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/position.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/position.py
new file mode 100644
index 0000000000000000000000000000000000000000..42435d0fef24737d3cae7463ca411a635979cf33
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/position.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# https://github.com/facebookresearch/detr/blob/main/models/position_encoding.py
+
+import torch
+import torch.nn as nn
+import math
+
+
+class PositionEmbeddingSine(nn.Module):
+ """
+ This is a more standard version of the position embedding, very similar to the one
+ used by the Attention is all you need paper, generalized to work on images.
+ """
+
+ def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None):
+ super().__init__()
+ self.num_pos_feats = num_pos_feats
+ self.temperature = temperature
+ self.normalize = normalize
+ if scale is not None and normalize is False:
+ raise ValueError("normalize should be True if scale is passed")
+ if scale is None:
+ scale = 2 * math.pi
+ self.scale = scale
+
+ def forward(self, x):
+ # x = tensor_list.tensors # [B, C, H, W]
+ # mask = tensor_list.mask # [B, H, W], input with padding, valid as 0
+ b, c, h, w = x.size()
+ mask = torch.ones((b, h, w), device=x.device) # [B, H, W]
+ y_embed = mask.cumsum(1, dtype=torch.float32)
+ x_embed = mask.cumsum(2, dtype=torch.float32)
+ if self.normalize:
+ eps = 1e-6
+ y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+ x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+ dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+ dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+ pos_x = x_embed[:, :, :, None] / dim_t
+ pos_y = y_embed[:, :, :, None] / dim_t
+ pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+ return pos
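# --- Usage sketch (illustrative) ------------------------------------------------
# PositionEmbeddingSine returns a [B, 2*num_pos_feats, H, W] map that is added to
# the features before the transformer. Import path inferred from this repo layout;
# shapes are illustrative.
import torch
from custom_controlnet_aux.unimatch.unimatch.position import PositionEmbeddingSine

x = torch.randn(1, 128, 24, 32)                    # [B, C, H, W]
pos_enc = PositionEmbeddingSine(num_pos_feats=64)  # 64 sin/cos pairs per axis
pos = pos_enc(x)
assert pos.shape == (1, 128, 24, 32)               # C == 2 * num_pos_feats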
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/reg_refine.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/reg_refine.py
new file mode 100644
index 0000000000000000000000000000000000000000..310807b9582063d733fe05a6f00d0b6690d8545c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/reg_refine.py
@@ -0,0 +1,119 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class FlowHead(nn.Module):
+ def __init__(self, input_dim=128, hidden_dim=256,
+ out_dim=2,
+ ):
+ super(FlowHead, self).__init__()
+
+ self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
+ self.conv2 = nn.Conv2d(hidden_dim, out_dim, 3, padding=1)
+ self.relu = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ out = self.conv2(self.relu(self.conv1(x)))
+
+ return out
+
+
+class SepConvGRU(nn.Module):
+ def __init__(self, hidden_dim=128, input_dim=192 + 128,
+ kernel_size=5,
+ ):
+ padding = (kernel_size - 1) // 2
+
+ super(SepConvGRU, self).__init__()
+ self.convz1 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (1, kernel_size), padding=(0, padding))
+ self.convr1 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (1, kernel_size), padding=(0, padding))
+ self.convq1 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (1, kernel_size), padding=(0, padding))
+
+ self.convz2 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (kernel_size, 1), padding=(padding, 0))
+ self.convr2 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (kernel_size, 1), padding=(padding, 0))
+ self.convq2 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (kernel_size, 1), padding=(padding, 0))
+
+ def forward(self, h, x):
+ # horizontal
+ hx = torch.cat([h, x], dim=1)
+ z = torch.sigmoid(self.convz1(hx))
+ r = torch.sigmoid(self.convr1(hx))
+ q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1)))
+ h = (1 - z) * h + z * q
+
+ # vertical
+ hx = torch.cat([h, x], dim=1)
+ z = torch.sigmoid(self.convz2(hx))
+ r = torch.sigmoid(self.convr2(hx))
+ q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1)))
+ h = (1 - z) * h + z * q
+
+ return h
+
+
+class BasicMotionEncoder(nn.Module):
+ def __init__(self, corr_channels=324,
+ flow_channels=2,
+ ):
+ super(BasicMotionEncoder, self).__init__()
+
+ self.convc1 = nn.Conv2d(corr_channels, 256, 1, padding=0)
+ self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
+ self.convf1 = nn.Conv2d(flow_channels, 128, 7, padding=3)
+ self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
+ self.conv = nn.Conv2d(64 + 192, 128 - flow_channels, 3, padding=1)
+
+ def forward(self, flow, corr):
+ cor = F.relu(self.convc1(corr))
+ cor = F.relu(self.convc2(cor))
+ flo = F.relu(self.convf1(flow))
+ flo = F.relu(self.convf2(flo))
+
+ cor_flo = torch.cat([cor, flo], dim=1)
+ out = F.relu(self.conv(cor_flo))
+ return torch.cat([out, flow], dim=1)
+
+
+class BasicUpdateBlock(nn.Module):
+ def __init__(self, corr_channels=324,
+ hidden_dim=128,
+ context_dim=128,
+ downsample_factor=8,
+ flow_dim=2,
+ bilinear_up=False,
+ ):
+ super(BasicUpdateBlock, self).__init__()
+
+ self.encoder = BasicMotionEncoder(corr_channels=corr_channels,
+ flow_channels=flow_dim,
+ )
+
+ self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=context_dim + hidden_dim)
+
+ self.flow_head = FlowHead(hidden_dim, hidden_dim=256,
+ out_dim=flow_dim,
+ )
+
+ if bilinear_up:
+ self.mask = None
+ else:
+ self.mask = nn.Sequential(
+ nn.Conv2d(hidden_dim, 256, 3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(256, downsample_factor ** 2 * 9, 1, padding=0))
+
+ def forward(self, net, inp, corr, flow):
+ motion_features = self.encoder(flow, corr)
+
+ inp = torch.cat([inp, motion_features], dim=1)
+
+ net = self.gru(net, inp)
+ delta_flow = self.flow_head(net)
+
+ if self.mask is not None:
+ mask = self.mask(net)
+ else:
+ mask = None
+
+ return net, mask, delta_flow
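# --- Usage sketch (illustrative) ------------------------------------------------
# One refinement step with BasicUpdateBlock as used by the reg_refine path in
# unimatch.py: a GRU update over (context, motion features) that predicts a
# residual flow and convex-upsampling weights. Import path inferred from this
# repo layout; sizes are illustrative.
import torch
from custom_controlnet_aux.unimatch.unimatch.reg_refine import BasicUpdateBlock

b, h, w = 1, 24, 32
update_block = BasicUpdateBlock(corr_channels=81, hidden_dim=128,
                                context_dim=128, downsample_factor=8, flow_dim=2)

net = torch.tanh(torch.randn(b, 128, h, w))  # hidden state
inp = torch.relu(torch.randn(b, 128, h, w))  # context features
corr = torch.randn(b, 81, h, w)              # 9 x 9 local correlation volume
flow = torch.zeros(b, 2, h, w)               # current flow estimate

net, up_mask, delta_flow = update_block(net, inp, corr, flow)
assert delta_flow.shape == (b, 2, h, w)
assert up_mask.shape == (b, 8 * 8 * 9, h, w)  # convex-upsampling weights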
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/transformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a93660cacae9e36a8886f1e17a564ddb4b2644f6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/transformer.py
@@ -0,0 +1,294 @@
+import torch
+import torch.nn as nn
+
+from .attention import (single_head_full_attention, single_head_split_window_attention,
+ single_head_full_attention_1d, single_head_split_window_attention_1d)
+from .utils import generate_shift_window_attn_mask, generate_shift_window_attn_mask_1d
+
+
+class TransformerLayer(nn.Module):
+ def __init__(self,
+ d_model=128,
+ nhead=1,
+ no_ffn=False,
+ ffn_dim_expansion=4,
+ ):
+ super(TransformerLayer, self).__init__()
+
+ self.dim = d_model
+ self.nhead = nhead
+ self.no_ffn = no_ffn
+
+ # multi-head attention
+ self.q_proj = nn.Linear(d_model, d_model, bias=False)
+ self.k_proj = nn.Linear(d_model, d_model, bias=False)
+ self.v_proj = nn.Linear(d_model, d_model, bias=False)
+
+ self.merge = nn.Linear(d_model, d_model, bias=False)
+
+ self.norm1 = nn.LayerNorm(d_model)
+
+ # no ffn after self-attn, with ffn after cross-attn
+ if not self.no_ffn:
+ in_channels = d_model * 2
+ self.mlp = nn.Sequential(
+ nn.Linear(in_channels, in_channels * ffn_dim_expansion, bias=False),
+ nn.GELU(),
+ nn.Linear(in_channels * ffn_dim_expansion, d_model, bias=False),
+ )
+
+ self.norm2 = nn.LayerNorm(d_model)
+
+ def forward(self, source, target,
+ height=None,
+ width=None,
+ shifted_window_attn_mask=None,
+ shifted_window_attn_mask_1d=None,
+ attn_type='swin',
+ with_shift=False,
+ attn_num_splits=None,
+ ):
+ # source, target: [B, L, C]
+ query, key, value = source, target, target
+
+ # for stereo: 2d attn in self-attn, 1d attn in cross-attn
+ is_self_attn = (query - key).abs().max() < 1e-6
+
+ # single-head attention
+ query = self.q_proj(query) # [B, L, C]
+ key = self.k_proj(key) # [B, L, C]
+ value = self.v_proj(value) # [B, L, C]
+
+ if attn_type == 'swin' and attn_num_splits > 1: # self, cross-attn: both swin 2d
+ if self.nhead > 1:
+ # we observe that multi-head attention slows down the speed and increases
+ # the memory consumption without obvious performance gains, so the implementation is removed
+ raise NotImplementedError
+ else:
+ message = single_head_split_window_attention(query, key, value,
+ num_splits=attn_num_splits,
+ with_shift=with_shift,
+ h=height,
+ w=width,
+ attn_mask=shifted_window_attn_mask,
+ )
+
+ elif attn_type == 'self_swin2d_cross_1d': # self-attn: swin 2d, cross-attn: full 1d
+ if self.nhead > 1:
+ raise NotImplementedError
+ else:
+ if is_self_attn:
+ if attn_num_splits > 1:
+ message = single_head_split_window_attention(query, key, value,
+ num_splits=attn_num_splits,
+ with_shift=with_shift,
+ h=height,
+ w=width,
+ attn_mask=shifted_window_attn_mask,
+ )
+ else:
+ # full 2d attn
+ message = single_head_full_attention(query, key, value) # [N, L, C]
+
+ else:
+ # cross attn 1d
+ message = single_head_full_attention_1d(query, key, value,
+ h=height,
+ w=width,
+ )
+
+ elif attn_type == 'self_swin2d_cross_swin1d': # self-attn: swin 2d, cross-attn: swin 1d
+ if self.nhead > 1:
+ raise NotImplementedError
+ else:
+ if is_self_attn:
+ if attn_num_splits > 1:
+ # self attn shift window
+ message = single_head_split_window_attention(query, key, value,
+ num_splits=attn_num_splits,
+ with_shift=with_shift,
+ h=height,
+ w=width,
+ attn_mask=shifted_window_attn_mask,
+ )
+ else:
+ # full 2d attn
+ message = single_head_full_attention(query, key, value) # [N, L, C]
+ else:
+ if attn_num_splits > 1:
+ assert shifted_window_attn_mask_1d is not None
+ # cross attn 1d shift
+ message = single_head_split_window_attention_1d(query, key, value,
+ num_splits=attn_num_splits,
+ with_shift=with_shift,
+ h=height,
+ w=width,
+ attn_mask=shifted_window_attn_mask_1d,
+ )
+ else:
+ message = single_head_full_attention_1d(query, key, value,
+ h=height,
+ w=width,
+ )
+
+ else:
+ message = single_head_full_attention(query, key, value) # [B, L, C]
+
+ message = self.merge(message) # [B, L, C]
+ message = self.norm1(message)
+
+ if not self.no_ffn:
+ message = self.mlp(torch.cat([source, message], dim=-1))
+ message = self.norm2(message)
+
+ return source + message
+
+
+class TransformerBlock(nn.Module):
+ """self attention + cross attention + FFN"""
+
+ def __init__(self,
+ d_model=128,
+ nhead=1,
+ ffn_dim_expansion=4,
+ ):
+ super(TransformerBlock, self).__init__()
+
+ self.self_attn = TransformerLayer(d_model=d_model,
+ nhead=nhead,
+ no_ffn=True,
+ ffn_dim_expansion=ffn_dim_expansion,
+ )
+
+ self.cross_attn_ffn = TransformerLayer(d_model=d_model,
+ nhead=nhead,
+ ffn_dim_expansion=ffn_dim_expansion,
+ )
+
+ def forward(self, source, target,
+ height=None,
+ width=None,
+ shifted_window_attn_mask=None,
+ shifted_window_attn_mask_1d=None,
+ attn_type='swin',
+ with_shift=False,
+ attn_num_splits=None,
+ ):
+ # source, target: [B, L, C]
+
+ # self attention
+ source = self.self_attn(source, source,
+ height=height,
+ width=width,
+ shifted_window_attn_mask=shifted_window_attn_mask,
+ attn_type=attn_type,
+ with_shift=with_shift,
+ attn_num_splits=attn_num_splits,
+ )
+
+ # cross attention and ffn
+ source = self.cross_attn_ffn(source, target,
+ height=height,
+ width=width,
+ shifted_window_attn_mask=shifted_window_attn_mask,
+ shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
+ attn_type=attn_type,
+ with_shift=with_shift,
+ attn_num_splits=attn_num_splits,
+ )
+
+ return source
+
+
+class FeatureTransformer(nn.Module):
+ def __init__(self,
+ num_layers=6,
+ d_model=128,
+ nhead=1,
+ ffn_dim_expansion=4,
+ ):
+ super(FeatureTransformer, self).__init__()
+
+ self.d_model = d_model
+ self.nhead = nhead
+
+ self.layers = nn.ModuleList([
+ TransformerBlock(d_model=d_model,
+ nhead=nhead,
+ ffn_dim_expansion=ffn_dim_expansion,
+ )
+ for i in range(num_layers)])
+
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+
+ def forward(self, feature0, feature1,
+ attn_type='swin',
+ attn_num_splits=None,
+ **kwargs,
+ ):
+
+ b, c, h, w = feature0.shape
+ assert self.d_model == c
+
+ feature0 = feature0.flatten(-2).permute(0, 2, 1) # [B, H*W, C]
+ feature1 = feature1.flatten(-2).permute(0, 2, 1) # [B, H*W, C]
+
+ # 2d attention
+ if 'swin' in attn_type and attn_num_splits > 1:
+ # global and refine use different number of splits
+ window_size_h = h // attn_num_splits
+ window_size_w = w // attn_num_splits
+
+ # compute attn mask once
+ shifted_window_attn_mask = generate_shift_window_attn_mask(
+ input_resolution=(h, w),
+ window_size_h=window_size_h,
+ window_size_w=window_size_w,
+ shift_size_h=window_size_h // 2,
+ shift_size_w=window_size_w // 2,
+ device=feature0.device,
+ ) # [K*K, H/K*W/K, H/K*W/K]
+ else:
+ shifted_window_attn_mask = None
+
+ # 1d attention
+ if 'swin1d' in attn_type and attn_num_splits > 1:
+ window_size_w = w // attn_num_splits
+
+ # compute attn mask once
+ shifted_window_attn_mask_1d = generate_shift_window_attn_mask_1d(
+ input_w=w,
+ window_size_w=window_size_w,
+ shift_size_w=window_size_w // 2,
+ device=feature0.device,
+ ) # [K, W/K, W/K]
+ else:
+ shifted_window_attn_mask_1d = None
+
+ # concat feature0 and feature1 in batch dimension to compute in parallel
+ concat0 = torch.cat((feature0, feature1), dim=0) # [2B, H*W, C]
+ concat1 = torch.cat((feature1, feature0), dim=0) # [2B, H*W, C]
+
+ for i, layer in enumerate(self.layers):
+ concat0 = layer(concat0, concat1,
+ height=h,
+ width=w,
+ attn_type=attn_type,
+ with_shift='swin' in attn_type and attn_num_splits > 1 and i % 2 == 1,
+ attn_num_splits=attn_num_splits,
+ shifted_window_attn_mask=shifted_window_attn_mask,
+ shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
+ )
+
+ # update feature1
+ concat1 = torch.cat(concat0.chunk(chunks=2, dim=0)[::-1], dim=0)
+
+ feature0, feature1 = concat0.chunk(chunks=2, dim=0) # [B, H*W, C]
+
+ # reshape back
+ feature0 = feature0.view(b, h, w, c).permute(0, 3, 1, 2).contiguous() # [B, C, H, W]
+ feature1 = feature1.view(b, h, w, c).permute(0, 3, 1, 2).contiguous() # [B, C, H, W]
+
+ return feature0, feature1
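# --- Usage sketch (illustrative) ------------------------------------------------
# FeatureTransformer enhances both views jointly (self-attn + cross-attn per block,
# computed in parallel by stacking the two views along the batch dimension). Import
# path inferred from this repo layout; H and W must be divisible by attn_num_splits.
import torch
from custom_controlnet_aux.unimatch.unimatch.transformer import FeatureTransformer

transformer = FeatureTransformer(num_layers=6, d_model=128, nhead=1)
feature0 = torch.randn(1, 128, 32, 32)
feature1 = torch.randn(1, 128, 32, 32)

# 2x2 swin-style window attention, shifted on odd layers
out0, out1 = transformer(feature0, feature1, attn_type='swin', attn_num_splits=2)
assert out0.shape == feature0.shape and out1.shape == feature1.shape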
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/trident_conv.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/trident_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..445663c2d1065e10899f728ad2628e313f218024
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/trident_conv.py
@@ -0,0 +1,90 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# https://github.com/facebookresearch/detectron2/blob/main/projects/TridentNet/tridentnet/trident_conv.py
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.modules.utils import _pair
+
+
+class MultiScaleTridentConv(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ strides=1,
+ paddings=0,
+ dilations=1,
+ dilation=1,
+ groups=1,
+ num_branch=1,
+ test_branch_idx=-1,
+ bias=False,
+ norm=None,
+ activation=None,
+ ):
+ super(MultiScaleTridentConv, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = _pair(kernel_size)
+ self.num_branch = num_branch
+ self.stride = _pair(stride)
+ self.groups = groups
+ self.with_bias = bias
+ self.dilation = dilation
+ if isinstance(paddings, int):
+ paddings = [paddings] * self.num_branch
+ if isinstance(dilations, int):
+ dilations = [dilations] * self.num_branch
+ if isinstance(strides, int):
+ strides = [strides] * self.num_branch
+ self.paddings = [_pair(padding) for padding in paddings]
+ self.dilations = [_pair(dilation) for dilation in dilations]
+ self.strides = [_pair(stride) for stride in strides]
+ self.test_branch_idx = test_branch_idx
+ self.norm = norm
+ self.activation = activation
+
+ assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1
+
+ self.weight = nn.Parameter(
+ torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+ )
+ if bias:
+ self.bias = nn.Parameter(torch.Tensor(out_channels))
+ else:
+ self.bias = None
+
+ nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+ if self.bias is not None:
+ nn.init.constant_(self.bias, 0)
+
+ def forward(self, inputs):
+ num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1
+ assert len(inputs) == num_branch
+
+ if self.training or self.test_branch_idx == -1:
+ outputs = [
+ F.conv2d(input, self.weight, self.bias, stride, padding, self.dilation, self.groups)
+ for input, stride, padding in zip(inputs, self.strides, self.paddings)
+ ]
+ else:
+ outputs = [
+ F.conv2d(
+ inputs[0],
+ self.weight,
+ self.bias,
+ self.strides[self.test_branch_idx] if self.test_branch_idx == -1 else self.strides[-1],
+ self.paddings[self.test_branch_idx] if self.test_branch_idx == -1 else self.paddings[-1],
+ self.dilation,
+ self.groups,
+ )
+ ]
+
+ if self.norm is not None:
+ outputs = [self.norm(x) for x in outputs]
+ if self.activation is not None:
+ outputs = [self.activation(x) for x in outputs]
+ return outputs
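# --- Usage sketch (illustrative) ------------------------------------------------
# MultiScaleTridentConv applies one shared convolution weight to several inputs
# with branch-specific strides/paddings, so multiple resolutions come from the
# same parameters. Import path inferred from this repo layout; sizes illustrative.
import torch
from custom_controlnet_aux.unimatch.unimatch.trident_conv import MultiScaleTridentConv

conv = MultiScaleTridentConv(in_channels=64, out_channels=96, kernel_size=3,
                             strides=[1, 2], paddings=1, num_branch=2)
inputs = [torch.randn(1, 64, 32, 32), torch.randn(1, 64, 32, 32)]
out_a, out_b = conv(inputs)
assert out_a.shape == (1, 96, 32, 32)  # stride-1 branch
assert out_b.shape == (1, 96, 16, 16)  # stride-2 branch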
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/unimatch.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/unimatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..96db16e3c886033a0c59f0d01bc34ea42ce0e42b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/unimatch.py
@@ -0,0 +1,367 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .backbone import CNNEncoder
+from .transformer import FeatureTransformer
+from .matching import (global_correlation_softmax, local_correlation_softmax, local_correlation_with_flow,
+ global_correlation_softmax_stereo, local_correlation_softmax_stereo,
+ correlation_softmax_depth)
+from .attention import SelfAttnPropagation
+from .geometry import flow_warp, compute_flow_with_depth_pose
+from .reg_refine import BasicUpdateBlock
+from .utils import normalize_img, feature_add_position, upsample_flow_with_mask
+
+
+class UniMatch(nn.Module):
+ def __init__(self,
+ num_scales=1,
+ feature_channels=128,
+ upsample_factor=8,
+ num_head=1,
+ ffn_dim_expansion=4,
+ num_transformer_layers=6,
+ reg_refine=False, # optional local regression refinement
+ task='flow',
+ ):
+ super(UniMatch, self).__init__()
+
+ self.feature_channels = feature_channels
+ self.num_scales = num_scales
+ self.upsample_factor = upsample_factor
+ self.reg_refine = reg_refine
+
+ # CNN
+ self.backbone = CNNEncoder(output_dim=feature_channels, num_output_scales=num_scales)
+
+ # Transformer
+ self.transformer = FeatureTransformer(num_layers=num_transformer_layers,
+ d_model=feature_channels,
+ nhead=num_head,
+ ffn_dim_expansion=ffn_dim_expansion,
+ )
+
+ # propagation with self-attn
+ self.feature_flow_attn = SelfAttnPropagation(in_channels=feature_channels)
+
+ if not self.reg_refine or task == 'depth':
+ # convex upsampling similar to RAFT
+ # concat feature0 and low res flow as input
+ self.upsampler = nn.Sequential(nn.Conv2d(2 + feature_channels, 256, 3, 1, 1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(256, upsample_factor ** 2 * 9, 1, 1, 0))
+ # thus far, all the learnable parameters are task-agnostic
+
+ if reg_refine:
+ # optional task-specific local regression refinement
+ self.refine_proj = nn.Conv2d(128, 256, 1)
+ self.refine = BasicUpdateBlock(corr_channels=(2 * 4 + 1) ** 2,
+ downsample_factor=upsample_factor,
+ flow_dim=2 if task == 'flow' else 1,
+ bilinear_up=task == 'depth',
+ )
+
+ def extract_feature(self, img0, img1):
+ concat = torch.cat((img0, img1), dim=0) # [2B, C, H, W]
+ features = self.backbone(concat) # list of [2B, C, H, W], resolution from high to low
+
+ # reverse: resolution from low to high
+ features = features[::-1]
+
+ feature0, feature1 = [], []
+
+ for i in range(len(features)):
+ feature = features[i]
+ chunks = torch.chunk(feature, 2, 0) # tuple
+ feature0.append(chunks[0])
+ feature1.append(chunks[1])
+
+ return feature0, feature1
+
+ def upsample_flow(self, flow, feature, bilinear=False, upsample_factor=8,
+ is_depth=False):
+ if bilinear:
+ multiplier = 1 if is_depth else upsample_factor
+ up_flow = F.interpolate(flow, scale_factor=upsample_factor,
+ mode='bilinear', align_corners=True) * multiplier
+ else:
+ concat = torch.cat((flow, feature), dim=1)
+ mask = self.upsampler(concat)
+ up_flow = upsample_flow_with_mask(flow, mask, upsample_factor=self.upsample_factor,
+ is_depth=is_depth)
+
+ return up_flow
+
+ def forward(self, img0, img1,
+ attn_type=None,
+ attn_splits_list=None,
+ corr_radius_list=None,
+ prop_radius_list=None,
+ num_reg_refine=1,
+ pred_bidir_flow=False,
+ task='flow',
+ intrinsics=None,
+ pose=None, # relative pose transform
+ min_depth=1. / 0.5, # inverse depth range
+ max_depth=1. / 10,
+ num_depth_candidates=64,
+ depth_from_argmax=False,
+ pred_bidir_depth=False,
+ **kwargs,
+ ):
+
+ if pred_bidir_flow:
+ assert task == 'flow'
+
+ if task == 'depth':
+ assert self.num_scales == 1 # multi-scale depth model is not supported yet
+
+ results_dict = {}
+ flow_preds = []
+
+ if task == 'flow':
+ # stereo and depth tasks have normalized img in dataloader
+ img0, img1 = normalize_img(img0, img1) # [B, 3, H, W]
+
+ # list of features, resolution low to high
+ feature0_list, feature1_list = self.extract_feature(img0, img1) # list of features
+
+ flow = None
+
+ if task != 'depth':
+ assert len(attn_splits_list) == len(corr_radius_list) == len(prop_radius_list) == self.num_scales
+ else:
+ assert len(attn_splits_list) == len(prop_radius_list) == self.num_scales == 1
+
+ for scale_idx in range(self.num_scales):
+ feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx]
+
+ if pred_bidir_flow and scale_idx > 0:
+ # predicting bidirectional flow with refinement
+ feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)
+
+ feature0_ori, feature1_ori = feature0, feature1
+
+ upsample_factor = self.upsample_factor * (2 ** (self.num_scales - 1 - scale_idx))
+
+ if task == 'depth':
+ # scale intrinsics
+ intrinsics_curr = intrinsics.clone()
+ intrinsics_curr[:, :2] = intrinsics_curr[:, :2] / upsample_factor
+
+ if scale_idx > 0:
+ assert task != 'depth' # not supported for multi-scale depth model
+ flow = F.interpolate(flow, scale_factor=2, mode='bilinear', align_corners=True) * 2
+
+ if flow is not None:
+ assert task != 'depth'
+ flow = flow.detach()
+
+ if task == 'stereo':
+ # construct flow vector for disparity
+ # flow here is actually disparity
+ zeros = torch.zeros_like(flow) # [B, 1, H, W]
+ # NOTE: reverse disp, disparity is positive
+ displace = torch.cat((-flow, zeros), dim=1) # [B, 2, H, W]
+ feature1 = flow_warp(feature1, displace) # [B, C, H, W]
+ elif task == 'flow':
+ feature1 = flow_warp(feature1, flow) # [B, C, H, W]
+ else:
+ raise NotImplementedError
+
+ attn_splits = attn_splits_list[scale_idx]
+ if task != 'depth':
+ corr_radius = corr_radius_list[scale_idx]
+ prop_radius = prop_radius_list[scale_idx]
+
+ # add position to features
+ feature0, feature1 = feature_add_position(feature0, feature1, attn_splits, self.feature_channels)
+
+ # Transformer
+ feature0, feature1 = self.transformer(feature0, feature1,
+ attn_type=attn_type,
+ attn_num_splits=attn_splits,
+ )
+
+ # correlation and softmax
+ if task == 'depth':
+ # first generate depth candidates
+ b, _, h, w = feature0.size()
+ depth_candidates = torch.linspace(min_depth, max_depth, num_depth_candidates).type_as(feature0)
+ depth_candidates = depth_candidates.view(1, num_depth_candidates, 1, 1).repeat(b, 1, h,
+ w) # [B, D, H, W]
+
+ flow_pred = correlation_softmax_depth(feature0, feature1,
+ intrinsics_curr,
+ pose,
+ depth_candidates=depth_candidates,
+ depth_from_argmax=depth_from_argmax,
+ pred_bidir_depth=pred_bidir_depth,
+ )[0]
+
+ else:
+ if corr_radius == -1: # global matching
+ if task == 'flow':
+ flow_pred = global_correlation_softmax(feature0, feature1, pred_bidir_flow)[0]
+ elif task == 'stereo':
+ flow_pred = global_correlation_softmax_stereo(feature0, feature1)[0]
+ else:
+ raise NotImplementedError
+ else: # local matching
+ if task == 'flow':
+ flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[0]
+ elif task == 'stereo':
+ flow_pred = local_correlation_softmax_stereo(feature0, feature1, corr_radius)[0]
+ else:
+ raise NotImplementedError
+
+ # flow or residual flow
+ flow = flow + flow_pred if flow is not None else flow_pred
+
+ if task == 'stereo':
+ flow = flow.clamp(min=0) # positive disparity
+
+ # upsample to the original resolution for supervision at training time only
+ if self.training:
+ flow_bilinear = self.upsample_flow(flow, None, bilinear=True, upsample_factor=upsample_factor,
+ is_depth=task == 'depth')
+ flow_preds.append(flow_bilinear)
+
+ # flow propagation with self-attn
+ if (pred_bidir_flow or pred_bidir_depth) and scale_idx == 0:
+ feature0 = torch.cat((feature0, feature1), dim=0) # [2*B, C, H, W] for propagation
+
+ flow = self.feature_flow_attn(feature0, flow.detach(),
+ local_window_attn=prop_radius > 0,
+ local_window_radius=prop_radius,
+ )
+
+ # bilinear upsampling for all scales except the last one
+ if self.training and scale_idx < self.num_scales - 1:
+ flow_up = self.upsample_flow(flow, feature0, bilinear=True,
+ upsample_factor=upsample_factor,
+ is_depth=task == 'depth')
+ flow_preds.append(flow_up)
+
+ if scale_idx == self.num_scales - 1:
+ if not self.reg_refine:
+ # upsample to the original image resolution
+
+ if task == 'stereo':
+ flow_pad = torch.cat((-flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W]
+ flow_up_pad = self.upsample_flow(flow_pad, feature0)
+ flow_up = -flow_up_pad[:, :1] # [B, 1, H, W]
+ elif task == 'depth':
+ depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W]
+ depth_up_pad = self.upsample_flow(depth_pad, feature0,
+ is_depth=True).clamp(min=min_depth, max=max_depth)
+ flow_up = depth_up_pad[:, :1] # [B, 1, H, W]
+ else:
+ flow_up = self.upsample_flow(flow, feature0)
+
+ flow_preds.append(flow_up)
+ else:
+ # task-specific local regression refinement
+ # supervise current flow
+ if self.training:
+ flow_up = self.upsample_flow(flow, feature0, bilinear=True,
+ upsample_factor=upsample_factor,
+ is_depth=task == 'depth')
+ flow_preds.append(flow_up)
+
+ assert num_reg_refine > 0
+ for refine_iter_idx in range(num_reg_refine):
+ flow = flow.detach()
+
+ if task == 'stereo':
+ zeros = torch.zeros_like(flow) # [B, 1, H, W]
+ # NOTE: reverse disp, disparity is positive
+ displace = torch.cat((-flow, zeros), dim=1) # [B, 2, H, W]
+ correlation = local_correlation_with_flow(
+ feature0_ori,
+ feature1_ori,
+ flow=displace,
+ local_radius=4,
+ ) # [B, (2R+1)^2, H, W]
+ elif task == 'depth':
+ if pred_bidir_depth and refine_iter_idx == 0:
+ intrinsics_curr = intrinsics_curr.repeat(2, 1, 1)
+ pose = torch.cat((pose, torch.inverse(pose)), dim=0)
+
+ feature0_ori, feature1_ori = torch.cat((feature0_ori, feature1_ori),
+ dim=0), torch.cat((feature1_ori,
+ feature0_ori), dim=0)
+
+ flow_from_depth = compute_flow_with_depth_pose(1. / flow.squeeze(1),
+ intrinsics_curr,
+ extrinsics_rel=pose,
+ )
+
+ correlation = local_correlation_with_flow(
+ feature0_ori,
+ feature1_ori,
+ flow=flow_from_depth,
+ local_radius=4,
+ ) # [B, (2R+1)^2, H, W]
+
+ else:
+ correlation = local_correlation_with_flow(
+ feature0_ori,
+ feature1_ori,
+ flow=flow,
+ local_radius=4,
+ ) # [B, (2R+1)^2, H, W]
+
+ proj = self.refine_proj(feature0)
+
+ net, inp = torch.chunk(proj, chunks=2, dim=1)
+
+ net = torch.tanh(net)
+ inp = torch.relu(inp)
+
+ net, up_mask, residual_flow = self.refine(net, inp, correlation, flow.clone(),
+ )
+
+ if task == 'depth':
+ flow = (flow - residual_flow).clamp(min=min_depth, max=max_depth)
+ else:
+ flow = flow + residual_flow
+
+ if task == 'stereo':
+ flow = flow.clamp(min=0) # positive
+
+ if self.training or refine_iter_idx == num_reg_refine - 1:
+ if task == 'depth':
+ if refine_iter_idx < num_reg_refine - 1:
+ # bilinear upsampling
+ flow_up = self.upsample_flow(flow, feature0, bilinear=True,
+ upsample_factor=upsample_factor,
+ is_depth=True)
+ else:
+ # last one convex upsampling
+ # NOTE: clamp depth because of the zero padding used by unfold in the convex upsampling
+ # pad depth to 2 channels to match the flow layout
+ depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W]
+ depth_up_pad = self.upsample_flow(depth_pad, feature0,
+ is_depth=True).clamp(min=min_depth,
+ max=max_depth)
+ flow_up = depth_up_pad[:, :1] # [B, 1, H, W]
+
+ else:
+ flow_up = upsample_flow_with_mask(flow, up_mask, upsample_factor=self.upsample_factor,
+ is_depth=task == 'depth')
+
+ flow_preds.append(flow_up)
+
+ if task == 'stereo':
+ for i in range(len(flow_preds)):
+ flow_preds[i] = flow_preds[i].squeeze(1) # [B, H, W]
+
+ # convert inverse depth to depth
+ if task == 'depth':
+ for i in range(len(flow_preds)):
+ flow_preds[i] = 1. / flow_preds[i].squeeze(1) # [B, H, W]
+
+ results_dict.update({'flow_preds': flow_preds})
+
+ return results_dict
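# --- Usage sketch (illustrative) ------------------------------------------------
# End-to-end optical flow inference with the single-scale (GMFlow-style) defaults:
# global matching (corr_radius=-1), global self-attn propagation (prop_radius=-1),
# and 2x2 attention splits. Import path inferred from this repo layout; the
# 256x320 input size is illustrative (H and W must be divisible by 16 here).
import torch
from custom_controlnet_aux.unimatch.unimatch.unimatch import UniMatch

model = UniMatch(num_scales=1, feature_channels=128, upsample_factor=8,
                 reg_refine=False, task='flow').eval()

img0 = torch.rand(1, 3, 256, 320) * 255.  # raw RGB in [0, 255]
img1 = torch.rand(1, 3, 256, 320) * 255.

with torch.no_grad():
    results = model(img0, img1,
                    attn_type='swin',
                    attn_splits_list=[2],
                    corr_radius_list=[-1],   # -1 -> global correlation
                    prop_radius_list=[-1],   # -1 -> global propagation
                    task='flow')

flow = results['flow_preds'][-1]  # [1, 2, 256, 320], upsampled to input resolution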
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c3dbea403b69802cb7bf74cf4b4457bdd0b867d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/unimatch/utils.py
@@ -0,0 +1,216 @@
+import torch
+import torch.nn.functional as F
+from .position import PositionEmbeddingSine
+
+
+def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
+ assert device is not None
+
+ x, y = torch.meshgrid([torch.linspace(w_min, w_max, len_w, device=device),
+ torch.linspace(h_min, h_max, len_h, device=device)],
+ )
+ grid = torch.stack((x, y), -1).transpose(0, 1).float() # [H, W, 2]
+
+ return grid
+
+
+def normalize_coords(coords, h, w):
+ # coords: [B, H, W, 2]
+ c = torch.Tensor([(w - 1) / 2., (h - 1) / 2.]).float().to(coords.device)
+ return (coords - c) / c # [-1, 1]
+
+
+def normalize_img(img0, img1):
+ # loaded images are in [0, 255]
+ # normalize by ImageNet mean and std
+ mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(img1.device)
+ std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(img1.device)
+ img0 = (img0 / 255. - mean) / std
+ img1 = (img1 / 255. - mean) / std
+
+ return img0, img1
+
+
+def split_feature(feature,
+ num_splits=2,
+ channel_last=False,
+ ):
+ if channel_last: # [B, H, W, C]
+ b, h, w, c = feature.size()
+ assert h % num_splits == 0 and w % num_splits == 0
+
+ b_new = b * num_splits * num_splits
+ h_new = h // num_splits
+ w_new = w // num_splits
+
+ feature = feature.view(b, num_splits, h // num_splits, num_splits, w // num_splits, c
+ ).permute(0, 1, 3, 2, 4, 5).reshape(b_new, h_new, w_new, c) # [B*K*K, H/K, W/K, C]
+ else: # [B, C, H, W]
+ b, c, h, w = feature.size()
+ assert h % num_splits == 0 and w % num_splits == 0
+
+ b_new = b * num_splits * num_splits
+ h_new = h // num_splits
+ w_new = w // num_splits
+
+ feature = feature.view(b, c, num_splits, h // num_splits, num_splits, w // num_splits
+ ).permute(0, 2, 4, 1, 3, 5).reshape(b_new, c, h_new, w_new) # [B*K*K, C, H/K, W/K]
+
+ return feature
+
+
+def merge_splits(splits,
+ num_splits=2,
+ channel_last=False,
+ ):
+ if channel_last: # [B*K*K, H/K, W/K, C]
+ b, h, w, c = splits.size()
+ new_b = b // num_splits // num_splits
+
+ splits = splits.view(new_b, num_splits, num_splits, h, w, c)
+ merge = splits.permute(0, 1, 3, 2, 4, 5).contiguous().view(
+ new_b, num_splits * h, num_splits * w, c) # [B, H, W, C]
+ else: # [B*K*K, C, H/K, W/K]
+ b, c, h, w = splits.size()
+ new_b = b // num_splits // num_splits
+
+ splits = splits.view(new_b, num_splits, num_splits, c, h, w)
+ merge = splits.permute(0, 3, 1, 4, 2, 5).contiguous().view(
+ new_b, c, num_splits * h, num_splits * w) # [B, C, H, W]
+
+ return merge
+
+
+def generate_shift_window_attn_mask(input_resolution, window_size_h, window_size_w,
+ shift_size_h, shift_size_w, device=torch.device('cuda')):
+ # ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
+ # calculate attention mask for SW-MSA
+ h, w = input_resolution
+ img_mask = torch.zeros((1, h, w, 1)).to(device) # 1 H W 1
+ h_slices = (slice(0, -window_size_h),
+ slice(-window_size_h, -shift_size_h),
+ slice(-shift_size_h, None))
+ w_slices = (slice(0, -window_size_w),
+ slice(-window_size_w, -shift_size_w),
+ slice(-shift_size_w, None))
+ cnt = 0
+ for h in h_slices:
+ for w in w_slices:
+ img_mask[:, h, w, :] = cnt
+ cnt += 1
+
+ mask_windows = split_feature(img_mask, num_splits=input_resolution[-1] // window_size_w, channel_last=True)
+
+ mask_windows = mask_windows.view(-1, window_size_h * window_size_w)
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+
+ return attn_mask
+
+
+def feature_add_position(feature0, feature1, attn_splits, feature_channels):
+ pos_enc = PositionEmbeddingSine(num_pos_feats=feature_channels // 2)
+
+ if attn_splits > 1: # add position in splited window
+ feature0_splits = split_feature(feature0, num_splits=attn_splits)
+ feature1_splits = split_feature(feature1, num_splits=attn_splits)
+
+ position = pos_enc(feature0_splits)
+
+ feature0_splits = feature0_splits + position
+ feature1_splits = feature1_splits + position
+
+ feature0 = merge_splits(feature0_splits, num_splits=attn_splits)
+ feature1 = merge_splits(feature1_splits, num_splits=attn_splits)
+ else:
+ position = pos_enc(feature0)
+
+ feature0 = feature0 + position
+ feature1 = feature1 + position
+
+ return feature0, feature1
+
+
+def upsample_flow_with_mask(flow, up_mask, upsample_factor,
+ is_depth=False):
+ # convex upsampling following raft
+
+ mask = up_mask
+ b, flow_channel, h, w = flow.shape
+ mask = mask.view(b, 1, 9, upsample_factor, upsample_factor, h, w) # [B, 1, 9, K, K, H, W]
+ mask = torch.softmax(mask, dim=2)
+
+ multiplier = 1 if is_depth else upsample_factor
+ up_flow = F.unfold(multiplier * flow, [3, 3], padding=1)
+ up_flow = up_flow.view(b, flow_channel, 9, 1, 1, h, w) # [B, 2, 9, 1, 1, H, W]
+
+ up_flow = torch.sum(mask * up_flow, dim=2) # [B, 2, K, K, H, W]
+ up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) # [B, 2, K, H, K, W]
+ up_flow = up_flow.reshape(b, flow_channel, upsample_factor * h,
+ upsample_factor * w) # [B, 2, K*H, K*W]
+
+ return up_flow
+
+
+def split_feature_1d(feature,
+ num_splits=2,
+ ):
+ # feature: [B, W, C]
+ b, w, c = feature.size()
+ assert w % num_splits == 0
+
+ b_new = b * num_splits
+ w_new = w // num_splits
+
+ feature = feature.view(b, num_splits, w // num_splits, c
+ ).view(b_new, w_new, c) # [B*K, W/K, C]
+
+ return feature
+
+
+def merge_splits_1d(splits,
+ h,
+ num_splits=2,
+ ):
+ b, w, c = splits.size()
+ new_b = b // num_splits // h
+
+ splits = splits.view(new_b, h, num_splits, w, c)
+ merge = splits.view(
+ new_b, h, num_splits * w, c) # [B, H, W, C]
+
+ return merge
+
+
+def window_partition_1d(x, window_size_w):
+ """
+ Args:
+ x: (B, W, C)
+ window_size_w (int): window size
+
+ Returns:
+ windows: (num_windows*B, window_size, C)
+ """
+ B, W, C = x.shape
+ x = x.view(B, W // window_size_w, window_size_w, C).view(-1, window_size_w, C)
+ return x
+
+
+def generate_shift_window_attn_mask_1d(input_w, window_size_w,
+ shift_size_w, device=torch.device('cuda')):
+ # calculate attention mask for SW-MSA
+ img_mask = torch.zeros((1, input_w, 1)).to(device) # 1 W 1
+ w_slices = (slice(0, -window_size_w),
+ slice(-window_size_w, -shift_size_w),
+ slice(-shift_size_w, None))
+ cnt = 0
+ for w in w_slices:
+ img_mask[:, w, :] = cnt
+ cnt += 1
+
+ mask_windows = window_partition_1d(img_mask, window_size_w) # nW, window_size, 1
+ mask_windows = mask_windows.view(-1, window_size_w)
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) # nW, window_size, window_size
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+
+ return attn_mask
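# --- Usage sketch (illustrative) ------------------------------------------------
# split_feature / merge_splits are exact inverses: they tile a feature map into
# K x K windows (stacked along the batch dimension) and stitch it back. Import
# path inferred from this repo layout; sizes are illustrative.
import torch
from custom_controlnet_aux.unimatch.unimatch.utils import split_feature, merge_splits

x = torch.randn(2, 128, 32, 32)              # [B, C, H, W]
splits = split_feature(x, num_splits=2)      # [B*K*K, C, H/K, W/K]
assert splits.shape == (8, 128, 16, 16)

merged = merge_splits(splits, num_splits=2)  # back to [B, C, H, W]
assert torch.equal(merged, x)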
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/dist_utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/dist_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdb08c7e6185f15f375fe56d7971593441f9ac82
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/dist_utils.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# https://github.com/open-mmlab/mmcv/blob/7540cf73ac7e5d1e14d0ffbd9b6759e83929ecfc/mmcv/runner/dist_utils.py
+
+import os
+import subprocess
+
+import torch
+import torch.multiprocessing as mp
+from torch import distributed as dist
+
+
+def init_dist(launcher, backend='nccl', **kwargs):
+ if mp.get_start_method(allow_none=True) is None:
+ mp.set_start_method('spawn')
+ if launcher == 'pytorch':
+ _init_dist_pytorch(backend, **kwargs)
+ elif launcher == 'mpi':
+ _init_dist_mpi(backend, **kwargs)
+ elif launcher == 'slurm':
+ _init_dist_slurm(backend, **kwargs)
+ else:
+ raise ValueError(f'Invalid launcher type: {launcher}')
+
+
+def _init_dist_pytorch(backend, **kwargs):
+ # TODO: use local_rank instead of rank % num_gpus
+ rank = int(os.environ['RANK'])
+ num_gpus = torch.cuda.device_count()
+ torch.cuda.set_device(rank % num_gpus)
+ dist.init_process_group(backend=backend, **kwargs)
+
+
+def _init_dist_mpi(backend, **kwargs):
+ # TODO: use local_rank instead of rank % num_gpus
+ rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+ num_gpus = torch.cuda.device_count()
+ torch.cuda.set_device(rank % num_gpus)
+ dist.init_process_group(backend=backend, **kwargs)
+
+
+def _init_dist_slurm(backend, port=None):
+ """Initialize slurm distributed training environment.
+ If argument ``port`` is not specified, the master port is taken from the
+ ``MASTER_PORT`` environment variable. If ``MASTER_PORT`` is not set either,
+ the default port ``29500`` is used.
+ Args:
+ backend (str): Backend of torch.distributed.
+ port (int, optional): Master port. Defaults to None.
+ """
+ proc_id = int(os.environ['SLURM_PROCID'])
+ ntasks = int(os.environ['SLURM_NTASKS'])
+ node_list = os.environ['SLURM_NODELIST']
+ num_gpus = torch.cuda.device_count()
+ torch.cuda.set_device(proc_id % num_gpus)
+ addr = subprocess.getoutput(
+ f'scontrol show hostname {node_list} | head -n1')
+ # specify master port
+ if port is not None:
+ os.environ['MASTER_PORT'] = str(port)
+ elif 'MASTER_PORT' in os.environ:
+ pass # use MASTER_PORT in the environment variable
+ else:
+ # 29500 is torch.distributed default port
+ os.environ['MASTER_PORT'] = '29500'
+ # use MASTER_ADDR in the environment variable if it already exists
+ if 'MASTER_ADDR' not in os.environ:
+ os.environ['MASTER_ADDR'] = addr
+ os.environ['WORLD_SIZE'] = str(ntasks)
+ os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
+ os.environ['RANK'] = str(proc_id)
+ dist.init_process_group(backend=backend)
+
+
+def get_dist_info():
+ # if (TORCH_VERSION != 'parrots'
+ # and digit_version(TORCH_VERSION) < digit_version('1.0')):
+ # initialized = dist._initialized
+ # else:
+ if dist.is_available():
+ initialized = dist.is_initialized()
+ else:
+ initialized = False
+ if initialized:
+ rank = dist.get_rank()
+ world_size = dist.get_world_size()
+ else:
+ rank = 0
+ world_size = 1
+ return rank, world_size
+
+
+# from DETR repo
+def setup_for_distributed(is_master):
+ """
+ This function disables printing when not in the master process
+ """
+ import builtins as __builtin__
+ builtin_print = __builtin__.print
+
+ def print(*args, **kwargs):
+ force = kwargs.pop('force', False)
+ if is_master or force:
+ builtin_print(*args, **kwargs)
+
+ __builtin__.print = print
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/file_io.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/file_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..b02acbde990ea98242d73fb12506613cbb1d3595
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/file_io.py
@@ -0,0 +1,224 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import re
+from PIL import Image
+import sys
+import cv2
+import json
+import os
+
+
+def read_img(filename):
+ # convert to RGB for scene flow finalpass data
+ img = np.array(Image.open(filename).convert('RGB')).astype(np.float32)
+ return img
+
+
+def read_disp(filename, subset=False, vkitti2=False, sintel=False,
+ tartanair=False, instereo2k=False, crestereo=False,
+ fallingthings=False,
+ argoverse=False,
+ raw_disp_png=False,
+ ):
+ # Scene Flow dataset
+ if filename.endswith('pfm'):
+ # For finalpass and cleanpass, gt disparity is positive, subset is negative
+ disp = np.ascontiguousarray(_read_pfm(filename)[0])
+ if subset:
+ disp = -disp
+ # VKITTI2 dataset
+ elif vkitti2:
+ disp = _read_vkitti2_disp(filename)
+ # Sintel
+ elif sintel:
+ disp = _read_sintel_disparity(filename)
+ elif tartanair:
+ disp = _read_tartanair_disp(filename)
+ elif instereo2k:
+ disp = _read_instereo2k_disp(filename)
+ elif crestereo:
+ disp = _read_crestereo_disp(filename)
+ elif fallingthings:
+ disp = _read_fallingthings_disp(filename)
+ elif argoverse:
+ disp = _read_argoverse_disp(filename)
+ elif raw_disp_png:
+ disp = np.array(Image.open(filename)).astype(np.float32)
+ # KITTI
+ elif filename.endswith('png'):
+ disp = _read_kitti_disp(filename)
+ elif filename.endswith('npy'):
+ disp = np.load(filename)
+ else:
+ raise Exception('Invalid disparity file format!')
+ return disp # [H, W]
+
+
+def _read_pfm(file):
+ file = open(file, 'rb')
+
+ color = None
+ width = None
+ height = None
+ scale = None
+ endian = None
+
+ header = file.readline().rstrip()
+ if header.decode("ascii") == 'PF':
+ color = True
+ elif header.decode("ascii") == 'Pf':
+ color = False
+ else:
+ raise Exception('Not a PFM file.')
+
+ dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode("ascii"))
+ if dim_match:
+ width, height = list(map(int, dim_match.groups()))
+ else:
+ raise Exception('Malformed PFM header.')
+
+ scale = float(file.readline().decode("ascii").rstrip())
+ if scale < 0: # little-endian
+ endian = '<'
+ scale = -scale
+ else:
+ endian = '>' # big-endian
+
+ data = np.fromfile(file, endian + 'f')
+ shape = (height, width, 3) if color else (height, width)
+
+ data = np.reshape(data, shape)
+ data = np.flipud(data)
+ return data, scale
+
+
+def write_pfm(file, image, scale=1):
+ file = open(file, 'wb')
+
+ color = None
+
+ if image.dtype.name != 'float32':
+ raise Exception('Image dtype must be float32.')
+
+ image = np.flipud(image)
+
+ if len(image.shape) == 3 and image.shape[2] == 3: # color image
+ color = True
+ elif len(image.shape) == 2 or len(
+ image.shape) == 3 and image.shape[2] == 1: # greyscale
+ color = False
+ else:
+ raise Exception(
+ 'Image must have H x W x 3, H x W x 1 or H x W dimensions.')
+
+ file.write(b'PF\n' if color else b'Pf\n')
+ file.write(b'%d %d\n' % (image.shape[1], image.shape[0]))
+
+ endian = image.dtype.byteorder
+
+ if endian == '<' or endian == '=' and sys.byteorder == 'little':
+ scale = -scale
+
+ file.write(b'%f\n' % scale)
+
+ image.tofile(file)
+
+
+def _read_kitti_disp(filename):
+ depth = np.array(Image.open(filename))
+ depth = depth.astype(np.float32) / 256.
+ return depth
+
+
+def _read_vkitti2_disp(filename):
+ # read depth
+ depth = cv2.imread(filename, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) # in cm
+ depth = (depth / 100).astype(np.float32) # convert cm to m
+
+ valid = (depth > 0) & (depth < 655) # depth clipped to 655.35m for sky
+
+ # convert to disparity
+ focal_length = 725.0087 # in pixels
+ baseline = 0.532725 # meter
+
+ disp = baseline * focal_length / depth
+
+ disp[~valid] = 0.000001 # invalid as very small value
+
+ return disp
+
+
+def _read_sintel_disparity(filename):
+ """ Return disparity read from filename. """
+ f_in = np.array(Image.open(filename))
+
+ d_r = f_in[:, :, 0].astype('float32')
+ d_g = f_in[:, :, 1].astype('float32')
+ d_b = f_in[:, :, 2].astype('float32')
+
+ depth = d_r * 4 + d_g / (2 ** 6) + d_b / (2 ** 14)
+ return depth
+
+
+def _read_tartanair_disp(filename):
+ # infinitely distant objects such as the sky have a large depth value (e.g. 10000)
+ depth = np.load(filename)
+
+ # change to disparity image
+ disparity = 80.0 / depth
+
+ return disparity
+
+
+def _read_instereo2k_disp(filename):
+ disp = np.array(Image.open(filename))
+ disp = disp.astype(np.float32) / 100.
+ return disp
+
+
+def _read_crestereo_disp(filename):
+ disp = np.array(Image.open(filename))
+ return disp.astype(np.float32) / 32.
+
+
+def _read_fallingthings_disp(filename):
+ depth = np.array(Image.open(filename))
+ camera_file = os.path.join(os.path.dirname(filename), '_camera_settings.json')
+ with open(camera_file, 'r') as f:
+ intrinsics = json.load(f)
+ fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx']
+ disp = (fx * 6.0 * 100) / depth.astype(np.float32)
+
+ return disp
+
+
+def _read_argoverse_disp(filename):
+ disparity_map = cv2.imread(filename, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)
+ return np.float32(disparity_map) / 256.
+
+
+def extract_video(video_name):
+ cap = cv2.VideoCapture(video_name)
+ assert cap.isOpened(), f'Failed to load video file {video_name}'
+ # get video info
+ size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+ int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
+ fps = cap.get(cv2.CAP_PROP_FPS)
+
+ print('video size (hxw): %dx%d' % (size[1], size[0]))
+ print('fps: %d' % fps)
+
+ imgs = []
+ while cap.isOpened():
+ # get frames
+ flag, img = cap.read()
+ if not flag:
+ break
+ # to rgb format
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ imgs.append(img)
+
+ return imgs, fps
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/flow_viz.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/flow_viz.py
new file mode 100644
index 0000000000000000000000000000000000000000..be27e55098768b61cf1c193ee1360251f8120488
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/flow_viz.py
@@ -0,0 +1,290 @@
+# MIT License
+#
+# Copyright (c) 2018 Tom Runia
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to conditions.
+#
+# Author: Tom Runia
+# Date Created: 2018-08-03
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from PIL import Image
+
+
+def make_colorwheel():
+ '''
+ Generates a color wheel for optical flow visualization as presented in:
+ Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
+ URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
+ According to the C++ source code of Daniel Scharstein
+ According to the Matlab source code of Deqing Sun
+ '''
+
+ RY = 15
+ YG = 6
+ GC = 4
+ CB = 11
+ BM = 13
+ MR = 6
+
+ ncols = RY + YG + GC + CB + BM + MR
+ colorwheel = np.zeros((ncols, 3))
+ col = 0
+
+ # RY
+ colorwheel[0:RY, 0] = 255
+ colorwheel[0:RY, 1] = np.floor(255 * np.arange(0, RY) / RY)
+ col = col + RY
+ # YG
+ colorwheel[col:col + YG, 0] = 255 - np.floor(255 * np.arange(0, YG) / YG)
+ colorwheel[col:col + YG, 1] = 255
+ col = col + YG
+ # GC
+ colorwheel[col:col + GC, 1] = 255
+ colorwheel[col:col + GC, 2] = np.floor(255 * np.arange(0, GC) / GC)
+ col = col + GC
+ # CB
+ colorwheel[col:col + CB, 1] = 255 - np.floor(255 * np.arange(CB) / CB)
+ colorwheel[col:col + CB, 2] = 255
+ col = col + CB
+ # BM
+ colorwheel[col:col + BM, 2] = 255
+ colorwheel[col:col + BM, 0] = np.floor(255 * np.arange(0, BM) / BM)
+ col = col + BM
+ # MR
+ colorwheel[col:col + MR, 2] = 255 - np.floor(255 * np.arange(MR) / MR)
+ colorwheel[col:col + MR, 0] = 255
+ return colorwheel
+
+
+def flow_compute_color(u, v, convert_to_bgr=False):
+ '''
+ Applies the flow color wheel to (possibly clipped) flow components u and v.
+ According to the C++ source code of Daniel Scharstein
+ According to the Matlab source code of Deqing Sun
+ :param u: np.ndarray, input horizontal flow
+ :param v: np.ndarray, input vertical flow
+ :param convert_to_bgr: bool, whether to change ordering and output BGR instead of RGB
+ :return:
+ '''
+
+ flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
+
+ colorwheel = make_colorwheel() # shape [55x3]
+ ncols = colorwheel.shape[0]
+
+ rad = np.sqrt(np.square(u) + np.square(v))
+ a = np.arctan2(-v, -u) / np.pi
+
+ fk = (a + 1) / 2 * (ncols - 1) + 1
+ k0 = np.floor(fk).astype(np.int32)
+ k1 = k0 + 1
+ k1[k1 == ncols] = 1
+ f = fk - k0
+
+ for i in range(colorwheel.shape[1]):
+ tmp = colorwheel[:, i]
+ col0 = tmp[k0] / 255.0
+ col1 = tmp[k1] / 255.0
+ col = (1 - f) * col0 + f * col1
+
+ idx = (rad <= 1)
+ col[idx] = 1 - rad[idx] * (1 - col[idx])
+ col[~idx] = col[~idx] * 0.75 # out of range?
+
+ # Note the 2-i => BGR instead of RGB
+ ch_idx = 2 - i if convert_to_bgr else i
+ flow_image[:, :, ch_idx] = np.floor(255 * col)
+
+ return flow_image
+
+
+def flow_to_color(flow_uv, clip_flow=None, convert_to_bgr=False):
+ '''
+ Expects a two dimensional flow image of shape [H,W,2]
+ According to the C++ source code of Daniel Scharstein
+ According to the Matlab source code of Deqing Sun
+ :param flow_uv: np.ndarray of shape [H,W,2]
+ :param clip_flow: float, maximum clipping value for flow
+ :return:
+ '''
+
+ assert flow_uv.ndim == 3, 'input flow must have three dimensions'
+ assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
+
+ if clip_flow is not None:
+ flow_uv = np.clip(flow_uv, 0, clip_flow)
+
+ u = flow_uv[:, :, 0]
+ v = flow_uv[:, :, 1]
+
+ rad = np.sqrt(np.square(u) + np.square(v))
+ rad_max = np.max(rad)
+
+ epsilon = 1e-5
+ u = u / (rad_max + epsilon)
+ v = v / (rad_max + epsilon)
+
+ return flow_compute_color(u, v, convert_to_bgr)
+
+
+UNKNOWN_FLOW_THRESH = 1e7
+SMALLFLOW = 0.0
+LARGEFLOW = 1e8
+
+
+def make_color_wheel():
+ """
+ Generate a color wheel according to the Middlebury color code
+ :return: Color wheel
+ """
+ RY = 15
+ YG = 6
+ GC = 4
+ CB = 11
+ BM = 13
+ MR = 6
+
+ ncols = RY + YG + GC + CB + BM + MR
+
+ colorwheel = np.zeros([ncols, 3])
+
+ col = 0
+
+ # RY
+ colorwheel[0:RY, 0] = 255
+ colorwheel[0:RY, 1] = np.transpose(np.floor(255 * np.arange(0, RY) / RY))
+ col += RY
+
+ # YG
+ colorwheel[col:col + YG, 0] = 255 - np.transpose(np.floor(255 * np.arange(0, YG) / YG))
+ colorwheel[col:col + YG, 1] = 255
+ col += YG
+
+ # GC
+ colorwheel[col:col + GC, 1] = 255
+ colorwheel[col:col + GC, 2] = np.transpose(np.floor(255 * np.arange(0, GC) / GC))
+ col += GC
+
+ # CB
+ colorwheel[col:col + CB, 1] = 255 - np.transpose(np.floor(255 * np.arange(0, CB) / CB))
+ colorwheel[col:col + CB, 2] = 255
+ col += CB
+
+ # BM
+ colorwheel[col:col + BM, 2] = 255
+ colorwheel[col:col + BM, 0] = np.transpose(np.floor(255 * np.arange(0, BM) / BM))
+ col += BM
+
+ # MR
+ colorwheel[col:col + MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR))
+ colorwheel[col:col + MR, 0] = 255
+
+ return colorwheel
+
+
+def compute_color(u, v):
+ """
+ compute optical flow color map
+ :param u: optical flow horizontal map
+ :param v: optical flow vertical map
+ :return: optical flow in color code
+ """
+ [h, w] = u.shape
+ img = np.zeros([h, w, 3])
+ nanIdx = np.isnan(u) | np.isnan(v)
+ u[nanIdx] = 0
+ v[nanIdx] = 0
+
+ colorwheel = make_color_wheel()
+ ncols = np.size(colorwheel, 0)
+
+ rad = np.sqrt(u ** 2 + v ** 2)
+
+ a = np.arctan2(-v, -u) / np.pi
+
+ fk = (a + 1) / 2 * (ncols - 1) + 1
+
+ k0 = np.floor(fk).astype(int)
+
+ k1 = k0 + 1
+ k1[k1 == ncols + 1] = 1
+ f = fk - k0
+
+ for i in range(0, np.size(colorwheel, 1)):
+ tmp = colorwheel[:, i]
+ col0 = tmp[k0 - 1] / 255
+ col1 = tmp[k1 - 1] / 255
+ col = (1 - f) * col0 + f * col1
+
+ idx = rad <= 1
+ col[idx] = 1 - rad[idx] * (1 - col[idx])
+ notidx = np.logical_not(idx)
+
+ col[notidx] *= 0.75
+ img[:, :, i] = np.uint8(np.floor(255 * col * (1 - nanIdx)))
+
+ return img
+
+
+# from https://github.com/gengshan-y/VCN
+def flow_to_image(flow):
+ """
+ Convert flow into a Middlebury color-coded image
+ :param flow: optical flow map
+ :return: optical flow image in Middlebury color code
+ """
+ u = flow[:, :, 0]
+ v = flow[:, :, 1]
+
+ maxu = -999.
+ maxv = -999.
+ minu = 999.
+ minv = 999.
+
+ idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH)
+ u[idxUnknow] = 0
+ v[idxUnknow] = 0
+
+ maxu = max(maxu, np.max(u))
+ minu = min(minu, np.min(u))
+
+ maxv = max(maxv, np.max(v))
+ minv = min(minv, np.min(v))
+
+ rad = np.sqrt(u ** 2 + v ** 2)
+ maxrad = max(-1, np.max(rad))
+
+ u = u / (maxrad + np.finfo(float).eps)
+ v = v / (maxrad + np.finfo(float).eps)
+
+ img = compute_color(u, v)
+
+ idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2)
+ img[idx] = 0
+
+ return np.uint8(img)
+
+
+def save_vis_flow_tofile(flow, output_path):
+ vis_flow = flow_to_image(flow)
+ Image.fromarray(vis_flow).save(output_path)
+
+
+def flow_tensor_to_image(flow):
+ """Used for tensorboard visualization"""
+ flow = flow.permute(1, 2, 0) # [H, W, 2]
+ flow = flow.detach().cpu().numpy()
+ flow = flow_to_image(flow) # [H, W, 3]
+ flow = np.transpose(flow, (2, 0, 1)) # [3, H, W]
+
+ return flow
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/frame_utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/frame_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3c2082dc2cb438317c7445b53f2ff730cae1dfc
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/frame_utils.py
@@ -0,0 +1,158 @@
+import numpy as np
+from PIL import Image
+from os.path import *
+import re
+import cv2
+
+TAG_CHAR = np.array([202021.25], np.float32)
+
+
+def readFlow(fn):
+ """ Read .flo file in Middlebury format"""
+ # Code adapted from:
+ # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy
+
+ # WARNING: this will work on little-endian architectures (eg Intel x86) only!
+ # print 'fn = %s'%(fn)
+ with open(fn, 'rb') as f:
+ magic = np.fromfile(f, np.float32, count=1)
+ if 202021.25 != magic:
+ print('Magic number incorrect. Invalid .flo file')
+ return None
+ else:
+ w = np.fromfile(f, np.int32, count=1)
+ h = np.fromfile(f, np.int32, count=1)
+ # print 'Reading %d x %d flo file\n' % (w, h)
+ data = np.fromfile(f, np.float32, count=2 * int(w) * int(h))
+ # Reshape testdata into 3D array (columns, rows, bands)
+ # The reshape here is for visualization, the original code is (w,h,2)
+ return np.resize(data, (int(h), int(w), 2))
+
+
+def readPFM(file):
+ file = open(file, 'rb')
+
+ color = None
+ width = None
+ height = None
+ scale = None
+ endian = None
+
+ header = file.readline().rstrip()
+ if header == b'PF':
+ color = True
+ elif header == b'Pf':
+ color = False
+ else:
+ raise Exception('Not a PFM file.')
+
+ dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline())
+ if dim_match:
+ width, height = map(int, dim_match.groups())
+ else:
+ raise Exception('Malformed PFM header.')
+
+ scale = float(file.readline().rstrip())
+ if scale < 0: # little-endian
+ endian = '<'
+ scale = -scale
+ else:
+ endian = '>' # big-endian
+
+ data = np.fromfile(file, endian + 'f')
+ shape = (height, width, 3) if color else (height, width)
+
+ data = np.reshape(data, shape)
+ data = np.flipud(data)
+ return data
+
+
+def writeFlow(filename, uv, v=None):
+ """ Write optical flow to file.
+
+ If v is None, uv is assumed to contain both u and v channels,
+ stacked in depth.
+ Original code by Deqing Sun, adapted from Daniel Scharstein.
+ """
+ nBands = 2
+
+ if v is None:
+ assert (uv.ndim == 3)
+ assert (uv.shape[2] == 2)
+ u = uv[:, :, 0]
+ v = uv[:, :, 1]
+ else:
+ u = uv
+
+ assert (u.shape == v.shape)
+ height, width = u.shape
+ f = open(filename, 'wb')
+ # write the header
+ f.write(TAG_CHAR)
+ np.array(width).astype(np.int32).tofile(f)
+ np.array(height).astype(np.int32).tofile(f)
+ # arrange into matrix form
+ tmp = np.zeros((height, width * nBands))
+ tmp[:, np.arange(width) * 2] = u
+ tmp[:, np.arange(width) * 2 + 1] = v
+ tmp.astype(np.float32).tofile(f)
+ f.close()
+
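+# Round-trip sketch (illustrative only; the file name is an assumption):
+#   flow = np.zeros((436, 1024, 2), dtype=np.float32)
+#   writeFlow("example.flo", flow)
+#   restored = readFlow("example.flo")            # shape (436, 1024, 2)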
+
+def readFlowKITTI(filename):
+ flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR)
+ flow = flow[:, :, ::-1].astype(np.float32)
+ flow, valid = flow[:, :, :2], flow[:, :, 2]
+ flow = (flow - 2 ** 15) / 64.0
+ return flow, valid
+
+
+def readDispKITTI(filename):
+ disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0
+ valid = disp > 0.0
+ flow = np.stack([-disp, np.zeros_like(disp)], -1)
+ return flow, valid
+
+
+def writeFlowKITTI(filename, uv):
+ uv = 64.0 * uv + 2 ** 15
+ valid = np.ones([uv.shape[0], uv.shape[1], 1])
+ uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16)
+ cv2.imwrite(filename, uv[..., ::-1])
+
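+# Note on the KITTI encoding used above (informal): flow is stored as uint16 with
+# u16 = 64 * flow + 2**15, so readFlowKITTI inverts it as (u16 - 2**15) / 64. This
+# gives ~1/64 px precision and a representable range of roughly +/-512 px; the
+# third channel is a validity mask.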
+
+def read_gen(file_name, pil=False):
+ ext = splitext(file_name)[-1]
+ if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg':
+ return Image.open(file_name)
+ elif ext == '.bin' or ext == '.raw':
+ return np.load(file_name)
+ elif ext == '.flo':
+ return readFlow(file_name).astype(np.float32)
+ elif ext == '.pfm':
+ flow = readPFM(file_name).astype(np.float32)
+ if len(flow.shape) == 2:
+ return flow
+ else:
+ return flow[:, :, :-1]
+ return []
+
+
+def read_vkitti2_flow(filename):
+ # In R, flow along x-axis normalized by image width and quantized to [0;2^16 – 1]
+ # In G, flow along y-axis normalized by image height and quantized to [0;2^16 – 1]
+ # B = 0 for invalid flow (e.g., sky pixels)
+ bgr = cv2.imread(filename, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)
+ h, w, _c = bgr.shape
+ assert bgr.dtype == np.uint16 and _c == 3
+ # b == invalid flow flag == 0 for sky or other invalid flow
+ invalid = bgr[:, :, 0] == 0
+ # g,r == flow_y,x normalized by height,width and scaled to [0;2**16 – 1]
+ out_flow = 2.0 / (2 ** 16 - 1.0) * bgr[:, :, 2:0:-1].astype('f4') - 1 # [H, W, 2]
+ out_flow[..., 0] *= (w - 1)
+ out_flow[..., 1] *= (h - 1)
+
+ out_flow[invalid] = 0.000001 # set invalid flow to a very small value to add supervision on the sky
+ valid = (np.logical_or(invalid, ~invalid)).astype(np.float32)
+
+ return out_flow, valid
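+# Decoding sketch for read_vkitti2_flow (informal): each 16-bit channel maps
+# linearly from [0, 2^16 - 1] to [-1, 1]; multiplying by (w - 1) or (h - 1) then
+# recovers the displacement in pixels.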
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/logger.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..527d2e928d2d65bc72a91f7bc4cb247f2a81067f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/logger.py
@@ -0,0 +1,104 @@
+import torch
+
+from .flow_viz import flow_tensor_to_image
+from .visualization import viz_depth_tensor
+
+
+class Logger:
+ def __init__(self, lr_scheduler,
+ summary_writer,
+ summary_freq=100,
+ start_step=0,
+ img_mean=None,
+ img_std=None,
+ ):
+ self.lr_scheduler = lr_scheduler
+ self.total_steps = start_step
+ self.running_loss = {}
+ self.summary_writer = summary_writer
+ self.summary_freq = summary_freq
+
+ self.img_mean = img_mean
+ self.img_std = img_std
+
+ def print_training_status(self, mode='train', is_depth=False):
+ if is_depth:
+ print('step: %06d \t loss: %.3f' % (self.total_steps, self.running_loss['total_loss'] / self.summary_freq))
+ else:
+ print('step: %06d \t epe: %.3f' % (self.total_steps, self.running_loss['epe'] / self.summary_freq))
+
+ for k in self.running_loss:
+ self.summary_writer.add_scalar(mode + '/' + k,
+ self.running_loss[k] / self.summary_freq, self.total_steps)
+ self.running_loss[k] = 0.0
+
+ def lr_summary(self):
+ lr = self.lr_scheduler.get_last_lr()[0]
+ self.summary_writer.add_scalar('lr', lr, self.total_steps)
+
+ def add_image_summary(self, img1, img2, flow_preds=None, flow_gt=None, mode='train',
+ is_depth=False,
+ ):
+ if self.total_steps % self.summary_freq == 0:
+ if is_depth:
+ img1 = self.unnormalize_image(img1.detach().cpu()) # [3, H, W], range [0, 1]
+ img2 = self.unnormalize_image(img2.detach().cpu())
+
+ concat = torch.cat((img1, img2), dim=-1) # [3, H, W*2]
+
+ self.summary_writer.add_image(mode + '/img', concat, self.total_steps)
+ else:
+ img_concat = torch.cat((img1[0].detach().cpu(), img2[0].detach().cpu()), dim=-1)
+ img_concat = img_concat.type(torch.uint8) # convert to uint8 to visualize in tensorboard
+
+ flow_pred = flow_tensor_to_image(flow_preds[-1][0])
+ forward_flow_gt = flow_tensor_to_image(flow_gt[0])
+ flow_concat = torch.cat((torch.from_numpy(flow_pred),
+ torch.from_numpy(forward_flow_gt)), dim=-1)
+
+ concat = torch.cat((img_concat, flow_concat), dim=-2)
+
+ self.summary_writer.add_image(mode + '/img_pred_gt', concat, self.total_steps)
+
+ def add_depth_summary(self, depth_pred, depth_gt, mode='train'):
+ # assert depth_pred.dim() == 2 # [H, W]
+ if self.total_steps % self.summary_freq == 0 or 'val' in mode:
+ pred_viz = viz_depth_tensor(depth_pred.detach().cpu()) # [3, H, W]
+ gt_viz = viz_depth_tensor(depth_gt.detach().cpu())
+
+ concat = torch.cat((pred_viz, gt_viz), dim=-1) # [3, H, W*2]
+
+ self.summary_writer.add_image(mode + '/depth_pred_gt', concat, self.total_steps)
+
+ def unnormalize_image(self, img):
+ # img: [3, H, W], used for visualizing image
+ mean = torch.tensor(self.img_mean).view(3, 1, 1).type_as(img)
+ std = torch.tensor(self.img_std).view(3, 1, 1).type_as(img)
+
+ out = img * std + mean
+
+ return out
+
+ def push(self, metrics, mode='train', is_depth=False, ):
+ self.total_steps += 1
+
+ self.lr_summary()
+
+ for key in metrics:
+ if key not in self.running_loss:
+ self.running_loss[key] = 0.0
+
+ self.running_loss[key] += metrics[key]
+
+ if self.total_steps % self.summary_freq == 0:
+ self.print_training_status(mode, is_depth=is_depth)
+ self.running_loss = {}
+
+ def write_dict(self, results):
+ for key in results:
+ tag = key.split('_')[0]
+ tag = tag + '/' + key
+ self.summary_writer.add_scalar(tag, results[key], self.total_steps)
+
+ def close(self):
+ self.summary_writer.close()
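+# Usage sketch (illustrative; the scheduler and writer are assumptions):
+#   from torch.utils.tensorboard import SummaryWriter
+#   logger = Logger(lr_scheduler, SummaryWriter("runs/exp"), summary_freq=100)
+#   logger.push({"epe": 1.23})                    # accumulates and logs every summary_freq steps
+#   logger.close()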
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/misc.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..601f52d5ee40f31a0114c23fb38ad4983d8a3372
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/misc.py
@@ -0,0 +1,36 @@
+import os
+import sys
+import json
+
+
+def read_text_lines(filepath):
+ with open(filepath, 'r') as f:
+ lines = f.readlines()
+ lines = [l.rstrip() for l in lines]
+ return lines
+
+
+def check_path(path):
+ if not os.path.exists(path):
+ os.makedirs(path, exist_ok=True) # explicitly set exist_ok when multi-processing
+
+
+def save_command(save_path, filename='command_train.txt'):
+ check_path(save_path)
+ command = sys.argv
+ save_file = os.path.join(save_path, filename)
+ # Save all training commands when resuming training
+ with open(save_file, 'a') as f:
+ f.write(' '.join(command))
+ f.write('\n\n')
+
+
+def save_args(args, filename='args.json'):
+ args_dict = vars(args)
+ check_path(args.checkpoint_dir)
+ save_path = os.path.join(args.checkpoint_dir, filename)
+
+ # save all training args when resuming training
+ with open(save_path, 'a') as f:
+ json.dump(args_dict, f, indent=4, sort_keys=False)
+ f.write('\n\n')
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..187af4003dc34a7548c07779112b105cb182ff2f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/utils.py
@@ -0,0 +1,157 @@
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+
+class InputPadder:
+ """ Pads images such that dimensions are divisible by 8 """
+
+ def __init__(self, dims, mode='sintel', padding_factor=8):
+ self.ht, self.wd = dims[-2:]
+ pad_ht = (((self.ht // padding_factor) + 1) * padding_factor - self.ht) % padding_factor
+ pad_wd = (((self.wd // padding_factor) + 1) * padding_factor - self.wd) % padding_factor
+ if mode == 'sintel':
+ self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2]
+ else:
+ self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht]
+
+ def pad(self, *inputs):
+ return [F.pad(x, self._pad, mode='replicate') for x in inputs]
+
+ def unpad(self, x):
+ ht, wd = x.shape[-2:]
+ c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
+ return x[..., c[0]:c[1], c[2]:c[3]]
+
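+# Usage sketch for InputPadder (illustrative; tensor sizes are assumptions):
+#   padder = InputPadder(img1.shape)              # e.g. img1: [B, 3, 436, 1024]
+#   img1_p, img2_p = padder.pad(img1, img2)       # H and W padded to multiples of 8
+#   flow = padder.unpad(flow_padded)              # crop back to the original size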
+
+def bilinear_sampler(img, coords, mode='bilinear', mask=False, padding_mode='zeros'):
+ """ Wrapper for grid_sample, uses pixel coordinates """
+ if coords.size(-1) != 2: # [B, 2, H, W] -> [B, H, W, 2]
+ coords = coords.permute(0, 2, 3, 1)
+
+ H, W = img.shape[-2:]
+ # H = height if height is not None else img.shape[-2]
+ # W = width if width is not None else img.shape[-1]
+
+ xgrid, ygrid = coords.split([1, 1], dim=-1)
+
+ # Handle H or W equal to 1 by explicitly redefining the height and width used for normalization
+ if H == 1:
+ assert ygrid.abs().max() < 1e-8
+ H = 10
+ if W == 1:
+ assert xgrid.abs().max() < 1e-8
+ W = 10
+
+ xgrid = 2 * xgrid / (W - 1) - 1
+ ygrid = 2 * ygrid / (H - 1) - 1
+
+ grid = torch.cat([xgrid, ygrid], dim=-1)
+ img = F.grid_sample(img, grid, mode=mode,
+ padding_mode=padding_mode,
+ align_corners=True)
+
+ if mask:
+ mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
+ return img, mask.squeeze(-1).float()
+
+ return img
+
+
+def coords_grid(batch, ht, wd, normalize=False):
+ if normalize: # [-1, 1]
+ coords = torch.meshgrid(2 * torch.arange(ht) / (ht - 1) - 1,
+ 2 * torch.arange(wd) / (wd - 1) - 1)
+ else:
+ coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
+ coords = torch.stack(coords[::-1], dim=0).float()
+ return coords[None].repeat(batch, 1, 1, 1) # [B, 2, H, W]
+
+
+def coords_grid_np(h, w): # used for accumulating high speed sintel flow testdata
+ coords = np.meshgrid(np.arange(h, dtype=np.float32),
+ np.arange(w, dtype=np.float32), indexing='ij')
+ coords = np.stack(coords[::-1], axis=-1) # [H, W, 2]
+
+ return coords
+
+
+def compute_out_of_boundary_mask(flow, downsample_factor=None):
+ # flow: [B, 2, H, W]
+ assert flow.dim() == 4 and flow.size(1) == 2
+ b, _, h, w = flow.shape
+ init_coords = coords_grid(b, h, w).to(flow.device)
+ corres = init_coords + flow # [B, 2, H, W]
+
+ if downsample_factor is not None:
+ assert w % downsample_factor == 0 and h % downsample_factor == 0
+ # the maximum displacement that can be predicted is limited by the downsampled feature resolution, then upsampled
+ max_w = (w // downsample_factor - 1) * downsample_factor
+ max_h = (h // downsample_factor - 1) * downsample_factor
+ # print('max_w: %d, max_h: %d' % (max_w, max_h))
+ else:
+ max_w = w - 1
+ max_h = h - 1
+
+ valid_mask = (corres[:, 0] >= 0) & (corres[:, 0] <= max_w) & (corres[:, 1] >= 0) & (corres[:, 1] <= max_h)
+
+ # in case very large flow
+ flow_mask = (flow[:, 0].abs() <= max_w) & (flow[:, 1].abs() <= max_h)
+
+ valid_mask = valid_mask & flow_mask
+
+ return valid_mask # [B, H, W]
+
+
+def normalize_coords(grid):
+ """Normalize coordinates of image scale to [-1, 1]
+ Args:
+ grid: [B, 2, H, W]
+ """
+ assert grid.size(1) == 2
+ h, w = grid.size()[2:]
+ grid[:, 0, :, :] = 2 * (grid[:, 0, :, :].clone() / (w - 1)) - 1 # x: [-1, 1]
+ grid[:, 1, :, :] = 2 * (grid[:, 1, :, :].clone() / (h - 1)) - 1 # y: [-1, 1]
+ # grid = grid.permute((0, 2, 3, 1)) # [B, H, W, 2]
+ return grid
+
+
+def flow_warp(feature, flow, mask=False, padding_mode='zeros'):
+ b, c, h, w = feature.size()
+ assert flow.size(1) == 2
+
+ grid = coords_grid(b, h, w).to(flow.device) + flow # [B, 2, H, W]
+
+ return bilinear_sampler(feature, grid, mask=mask, padding_mode=padding_mode)
+
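+# Sanity-check sketch for flow_warp (not part of the original file): warping a
+# feature map with an all-zero flow returns (approximately) the same feature map.
+#   feat = torch.randn(1, 64, 32, 32)
+#   warped = flow_warp(feat, torch.zeros(1, 2, 32, 32))
+#   assert torch.allclose(warped, feat, atol=1e-5)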
+
+def upflow8(flow, mode='bilinear'):
+ new_size = (8 * flow.shape[2], 8 * flow.shape[3])
+ return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)
+
+
+def bilinear_upflow(flow, scale_factor=8):
+ assert flow.size(1) == 2
+ flow = F.interpolate(flow, scale_factor=scale_factor,
+ mode='bilinear', align_corners=True) * scale_factor
+
+ return flow
+
+
+def upsample_flow(flow, img):
+ if flow.size(-1) != img.size(-1):
+ scale_factor = img.size(-1) / flow.size(-1)
+ flow = F.interpolate(flow, size=img.size()[-2:],
+ mode='bilinear', align_corners=True) * scale_factor
+ return flow
+
+
+def count_parameters(model):
+ num = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ return num
+
+
+def set_bn_eval(m):
+ classname = m.__class__.__name__
+ if classname.find('BatchNorm') != -1:
+ m.eval()
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/visualization.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/visualization.py
new file mode 100644
index 0000000000000000000000000000000000000000..157c5f2641a80d55fa39bd6ae8331a63a49d41c3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/unimatch/utils/visualization.py
@@ -0,0 +1,107 @@
+import torch
+import torch.utils.data
+import numpy as np
+import torchvision.utils as vutils
+import cv2
+from matplotlib.cm import get_cmap
+import matplotlib as mpl
+import matplotlib.cm as cm
+
+
+def vis_disparity(disp):
+ disp_vis = (disp - disp.min()) / (disp.max() - disp.min()) * 255.0
+ disp_vis = disp_vis.astype("uint8")
+ disp_vis = cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO)
+
+ return disp_vis
+
+
+def gen_error_colormap():
+ cols = np.array(
+ [[0 / 3.0, 0.1875 / 3.0, 49, 54, 149],
+ [0.1875 / 3.0, 0.375 / 3.0, 69, 117, 180],
+ [0.375 / 3.0, 0.75 / 3.0, 116, 173, 209],
+ [0.75 / 3.0, 1.5 / 3.0, 171, 217, 233],
+ [1.5 / 3.0, 3 / 3.0, 224, 243, 248],
+ [3 / 3.0, 6 / 3.0, 254, 224, 144],
+ [6 / 3.0, 12 / 3.0, 253, 174, 97],
+ [12 / 3.0, 24 / 3.0, 244, 109, 67],
+ [24 / 3.0, 48 / 3.0, 215, 48, 39],
+ [48 / 3.0, np.inf, 165, 0, 38]], dtype=np.float32)
+ cols[:, 2: 5] /= 255.
+ return cols
+
+
+def disp_error_img(D_est_tensor, D_gt_tensor, abs_thres=3., rel_thres=0.05, dilate_radius=1):
+ D_gt_np = D_gt_tensor.detach().cpu().numpy()
+ D_est_np = D_est_tensor.detach().cpu().numpy()
+ B, H, W = D_gt_np.shape
+ # valid mask
+ mask = D_gt_np > 0
+ # error in percentage. When error <= 1, the pixel is valid since <= 3px & 5%
+ error = np.abs(D_gt_np - D_est_np)
+ error[np.logical_not(mask)] = 0
+ error[mask] = np.minimum(error[mask] / abs_thres, (error[mask] / D_gt_np[mask]) / rel_thres)
+ # get colormap
+ cols = gen_error_colormap()
+ # create error image
+ error_image = np.zeros([B, H, W, 3], dtype=np.float32)
+ for i in range(cols.shape[0]):
+ error_image[np.logical_and(error >= cols[i][0], error < cols[i][1])] = cols[i, 2:]
+ # TODO: imdilate
+ # error_image = cv2.imdilate(D_err, strel('disk', dilate_radius));
+ error_image[np.logical_not(mask)] = 0.
+ # show color tag in the top-left corner of the image
+ for i in range(cols.shape[0]):
+ distance = 20
+ error_image[:, :10, i * distance:(i + 1) * distance, :] = cols[i, 2:]
+
+ return torch.from_numpy(np.ascontiguousarray(error_image.transpose([0, 3, 1, 2])))
+
+
+def save_images(logger, mode_tag, images_dict, global_step):
+ images_dict = tensor2numpy(images_dict)
+ for tag, values in images_dict.items():
+ if not isinstance(values, list) and not isinstance(values, tuple):
+ values = [values]
+ for idx, value in enumerate(values):
+ if len(value.shape) == 3:
+ value = value[:, np.newaxis, :, :]
+ value = value[:1]
+ value = torch.from_numpy(value)
+
+ image_name = '{}/{}'.format(mode_tag, tag)
+ if len(values) > 1:
+ image_name = image_name + "_" + str(idx)
+ logger.add_image(image_name, vutils.make_grid(value, padding=0, nrow=1, normalize=True, scale_each=True),
+ global_step)
+
+
+def tensor2numpy(var_dict):
+ for key, vars in var_dict.items():
+ if isinstance(vars, np.ndarray):
+ var_dict[key] = vars
+ elif isinstance(vars, torch.Tensor):
+ var_dict[key] = vars.data.cpu().numpy()
+ else:
+ raise NotImplementedError("invalid input type for tensor2numpy")
+
+ return var_dict
+
+
+def viz_depth_tensor(disp, return_numpy=False, colormap='plasma'):
+ # visualize inverse depth
+ assert isinstance(disp, torch.Tensor)
+
+ disp = disp.numpy()
+ vmax = np.percentile(disp, 95)
+ normalizer = mpl.colors.Normalize(vmin=disp.min(), vmax=vmax)
+ mapper = cm.ScalarMappable(norm=normalizer, cmap=colormap)
+ colormapped_im = (mapper.to_rgba(disp)[:, :, :3] * 255).astype(np.uint8) # [H, W, 3]
+
+ if return_numpy:
+ return colormapped_im
+
+ viz = torch.from_numpy(colormapped_im).permute(2, 0, 1) # [3, H, W]
+
+ return viz
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/util.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..49a40d1736bf0495db1a531b762cd56dee40f1c6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/util.py
@@ -0,0 +1,352 @@
+import os
+import random
+import tempfile
+import warnings
+from contextlib import suppress
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+from huggingface_hub import constants, hf_hub_download
+from torch.hub import get_dir, download_url_to_file
+from ast import literal_eval
+
+
+TORCHHUB_PATH = Path(__file__).parent / 'depth_anything' / 'torchhub'
+HF_MODEL_NAME = "lllyasviel/Annotators"
+DWPOSE_MODEL_NAME = "yzd-v/DWPose"
+BDS_MODEL_NAME = "bdsqlsz/qinglong_controlnet-lllite"
+DENSEPOSE_MODEL_NAME = "LayerNorm/DensePose-TorchScript-with-hint-image"
+MESH_GRAPHORMER_MODEL_NAME = "hr16/ControlNet-HandRefiner-pruned"
+SAM_MODEL_NAME = "dhkim2810/MobileSAM"
+UNIMATCH_MODEL_NAME = "hr16/Unimatch"
+DEPTH_ANYTHING_MODEL_NAME = "LiheYoung/Depth-Anything" #HF Space
+DIFFUSION_EDGE_MODEL_NAME = "hr16/Diffusion-Edge"
+METRIC3D_MODEL_NAME = "JUGGHM/Metric3D"
+
+DEPTH_ANYTHING_V2_MODEL_NAME_DICT = {
+ "depth_anything_v2_vits.pth": "depth-anything/Depth-Anything-V2-Small",
+ "depth_anything_v2_vitb.pth": "depth-anything/Depth-Anything-V2-Base",
+ "depth_anything_v2_vitl.pth": "depth-anything/Depth-Anything-V2-Large",
+ "depth_anything_v2_vitg.pth": "depth-anything/Depth-Anything-V2-Giant",
+ "depth_anything_v2_metric_vkitti_vitl.pth": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large",
+ "depth_anything_v2_metric_hypersim_vitl.pth": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large"
+}
+
+temp_dir = tempfile.gettempdir()
+annotator_ckpts_path = os.path.join(Path(__file__).parents[2], 'ckpts')
+USE_SYMLINKS = False
+
+try:
+ annotator_ckpts_path = os.environ['AUX_ANNOTATOR_CKPTS_PATH']
+except:
+ warnings.warn("Custom pressesor model path not set successfully.")
+ pass
+
+try:
+ USE_SYMLINKS = literal_eval(os.environ['AUX_USE_SYMLINKS'])
+except:
+ warnings.warn("USE_SYMLINKS not set successfully. Using default value: False to download models.")
+ pass
+
+try:
+ temp_dir = os.environ['AUX_TEMP_DIR']
+ if len(temp_dir) >= 60:
+ warnings.warn(f"custom temp dir is too long. Using default")
+ temp_dir = tempfile.gettempdir()
+except:
+ warnings.warn(f"custom temp dir not set successfully")
+ pass
+
+here = Path(__file__).parent.resolve()
+
+def HWC3(x):
+ assert x.dtype == np.uint8
+ if x.ndim == 2:
+ x = x[:, :, None]
+ assert x.ndim == 3
+ H, W, C = x.shape
+ assert C == 1 or C == 3 or C == 4
+ if C == 3:
+ return x
+ if C == 1:
+ return np.concatenate([x, x, x], axis=2)
+ if C == 4:
+ color = x[:, :, 0:3].astype(np.float32)
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
+ y = color * alpha + 255.0 * (1.0 - alpha)
+ y = y.clip(0, 255).astype(np.uint8)
+ return y
+
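+# HWC3 usage sketch (illustrative): grayscale and RGBA uint8 inputs are converted
+# to 3-channel RGB, while RGB passes through unchanged.
+#   gray = np.zeros((64, 64), dtype=np.uint8)
+#   assert HWC3(gray).shape == (64, 64, 3)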
+
+def make_noise_disk(H, W, C, F, rng=None):
+ if rng:
+ noise = rng.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
+ else:
+ noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
+ noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
+ noise = noise[F: F + H, F: F + W]
+ noise -= np.min(noise)
+ noise /= np.max(noise)
+ if C == 1:
+ noise = noise[:, :, None]
+ return noise
+
+
+def nms(x, t, s):
+ x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
+
+ f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
+ f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
+ f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
+ f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
+
+ y = np.zeros_like(x)
+
+ for f in [f1, f2, f3, f4]:
+ np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
+
+ z = np.zeros_like(y, dtype=np.uint8)
+ z[y > t] = 255
+ return z
+
+def min_max_norm(x):
+ x -= np.min(x)
+ x /= np.maximum(np.max(x), 1e-5)
+ return x
+
+
+def safe_step(x, step=2):
+ y = x.astype(np.float32) * float(step + 1)
+ y = y.astype(np.int32).astype(np.float32) / float(step)
+ return y
+
+
+def img2mask(img, H, W, low=10, high=90):
+ assert img.ndim == 3 or img.ndim == 2
+ assert img.dtype == np.uint8
+
+ if img.ndim == 3:
+ y = img[:, :, random.randrange(0, img.shape[2])]
+ else:
+ y = img
+
+ y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
+
+ if random.uniform(0, 1) < 0.5:
+ y = 255 - y
+
+ return y < np.percentile(y, random.randrange(low, high))
+
+def safer_memory(x):
+ # Fix many MAC/AMD problems
+ return np.ascontiguousarray(x.copy()).copy()
+
+UPSCALE_METHODS = ["INTER_NEAREST", "INTER_LINEAR", "INTER_AREA", "INTER_CUBIC", "INTER_LANCZOS4"]
+def get_upscale_method(method_str):
+ assert method_str in UPSCALE_METHODS, f"Method {method_str} not found in {UPSCALE_METHODS}"
+ return getattr(cv2, method_str)
+
+def pad64(x):
+ return int(np.ceil(float(x) / 64.0) * 64 - x)
+
+#https://github.com/Mikubill/sd-webui-controlnet/blob/main/scripts/processor.py#L17
+#Added upscale_method, mode params
+def resize_image_with_pad(input_image, resolution, upscale_method = "", skip_hwc3=False, mode='edge'):
+ if skip_hwc3:
+ img = input_image
+ else:
+ img = HWC3(input_image)
+ H_raw, W_raw, _ = img.shape
+ if resolution == 0:
+ return img, lambda x: x
+ k = float(resolution) / float(min(H_raw, W_raw))
+ H_target = int(np.round(float(H_raw) * k))
+ W_target = int(np.round(float(W_raw) * k))
+ img = cv2.resize(img, (W_target, H_target), interpolation=get_upscale_method(upscale_method) if k > 1 else cv2.INTER_AREA)
+ H_pad, W_pad = pad64(H_target), pad64(W_target)
+ img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode=mode)
+
+ def remove_pad(x):
+ return safer_memory(x[:H_target, :W_target, ...])
+
+ return safer_memory(img_padded), remove_pad
+
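+# resize_image_with_pad usage sketch (illustrative; `some_detector` is hypothetical):
+#   img_padded, remove_pad = resize_image_with_pad(image, 512, "INTER_CUBIC")
+#   result = some_detector(img_padded)            # run a model on the padded image
+#   result = remove_pad(result)                   # crop the padding away again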
+def common_input_validate(input_image, output_type, **kwargs):
+ if "img" in kwargs:
+ warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
+ input_image = kwargs.pop("img")
+
+ if "return_pil" in kwargs:
+ warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
+ output_type = "pil" if kwargs["return_pil"] else "np"
+
+ if type(output_type) is bool:
+ warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
+ if output_type:
+ output_type = "pil"
+
+ if input_image is None:
+ raise ValueError("input_image must be defined.")
+
+ if not isinstance(input_image, np.ndarray):
+ input_image = np.array(input_image, dtype=np.uint8)
+ output_type = output_type or "pil"
+ else:
+ output_type = output_type or "np"
+
+ return (input_image, output_type)
+
+def torch_gc():
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.ipc_collect()
+
+
+def ade_palette():
+ """ADE20K palette that maps each class to RGB values."""
+ return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+ [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+ [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+ [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+ [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+ [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+ [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+ [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+ [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+ [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
+ [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
+ [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
+ [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
+ [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
+ [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
+ [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
+ [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
+ [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
+ [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
+ [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
+ [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
+ [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
+ [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
+ [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
+ [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
+ [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
+ [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
+ [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
+ [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
+ [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
+ [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
+ [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
+ [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
+ [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
+ [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
+ [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
+ [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
+ [102, 255, 0], [92, 0, 255]]
+
+#https://stackoverflow.com/a/44873382
+#Assume that the minimum version of Python ppl use is 3.9
+def sha256sum(file_path):
+ import hashlib
+ h = hashlib.sha256()
+ b = bytearray(128*1024)
+ mv = memoryview(b)
+ with open(file_path, 'rb', buffering=0) as f:
+ while n := f.readinto(mv):
+ h.update(mv[:n])
+ return h.hexdigest()
+
+def check_hash_from_torch_hub(file_path, filename):
+ basename, _ = filename.split('.')
+ _, ref_hash = basename.split('-')
+ curr_hash = sha256sum(file_path)
+ return curr_hash[:len(ref_hash)] == ref_hash
+
+def custom_torch_download(filename, ckpts_dir=annotator_ckpts_path):
+ local_dir = os.path.join(get_dir(), 'checkpoints')
+ model_path = os.path.join(local_dir, filename)
+
+ if not os.path.exists(model_path):
+ print(f"Failed to find {model_path}.\n Downloading from pytorch.org")
+ local_dir = os.path.join(ckpts_dir, "torch")
+ if not os.path.exists(local_dir):
+ os.mkdir(local_dir)
+
+ model_path = os.path.join(local_dir, filename)
+
+ if not os.path.exists(model_path):
+ model_url = "https://download.pytorch.org/models/"+filename
+ try:
+ download_url_to_file(url = model_url, dst = model_path)
+ except:
+ warnings.warn(f"SSL verify failed, try use HTTP instead. {filename}'s hash will be checked")
+ download_url_to_file(url = model_url, dst = model_path)
+ assert check_hash_from_torch_hub(model_path, filename), f"Hash check failed as file {filename} is corrupted"
+ print("Hash check passed")
+
+ print(f"model_path is {model_path}")
+ return model_path
+
+def custom_hf_download(pretrained_model_or_path, filename, cache_dir=temp_dir, ckpts_dir=annotator_ckpts_path, subfolder='', use_symlinks=USE_SYMLINKS, repo_type="model"):
+
+ local_dir = os.path.join(ckpts_dir, pretrained_model_or_path)
+ model_path = os.path.join(local_dir, *subfolder.split('/'), filename)
+
+ if len(str(model_path)) >= 255:
+ warnings.warn(f"Path {model_path} is too long, \n please change annotator_ckpts_path in config.yaml")
+
+ if not os.path.exists(model_path):
+ print(f"Failed to find {model_path}.\n Downloading from huggingface.co")
+ print(f"cacher folder is {cache_dir}, you can change it by custom_tmp_path in config.yaml")
+ if use_symlinks:
+ cache_dir_d = constants.HF_HUB_CACHE # use huggingface newer env variables `HF_HUB_CACHE`
+ if cache_dir_d is None:
+ import platform
+ if platform.system() == "Windows":
+ cache_dir_d = os.path.join(os.getenv("USERPROFILE"), ".cache", "huggingface", "hub")
+ else:
+ cache_dir_d = os.path.join(os.getenv("HOME"), ".cache", "huggingface", "hub")
+ try:
+ # test_link
+ Path(cache_dir_d).mkdir(parents=True, exist_ok=True)
+ Path(ckpts_dir).mkdir(parents=True, exist_ok=True)
+ (Path(cache_dir_d) / f"linktest_{filename}.txt").touch()
+ # symlink instead of link avoid `invalid cross-device link` error.
+ os.symlink(os.path.join(cache_dir_d, f"linktest_{filename}.txt"), os.path.join(ckpts_dir, f"linktest_{filename}.txt"))
+ print("Using symlinks to download models. \n",\
+ "Make sure you have enough space on your cache folder. \n",\
+ "And do not purge the cache folder after downloading.\n",\
+ "Otherwise, you will have to re-download the models every time you run the script.\n",\
+ "You can use USE_SYMLINKS: False in config.yaml to avoid this behavior.")
+ except:
+ print("Maybe not able to create symlink. Disable using symlinks.")
+ use_symlinks = False
+ cache_dir_d = os.path.join(cache_dir, "ckpts", pretrained_model_or_path)
+ finally: # always remove test link files
+ with suppress(FileNotFoundError):
+ os.remove(os.path.join(ckpts_dir, f"linktest_{filename}.txt"))
+ os.remove(os.path.join(cache_dir_d, f"linktest_{filename}.txt"))
+ else:
+ cache_dir_d = os.path.join(cache_dir, "ckpts", pretrained_model_or_path)
+
+ model_path = hf_hub_download(repo_id=pretrained_model_or_path,
+ cache_dir=cache_dir_d,
+ local_dir=local_dir,
+ subfolder=subfolder,
+ filename=filename,
+ local_dir_use_symlinks=use_symlinks,
+ resume_download=True,
+ etag_timeout=100,
+ repo_type=repo_type
+ )
+ if not use_symlinks:
+ try:
+ import shutil
+ shutil.rmtree(os.path.join(cache_dir, "ckpts"))
+ except Exception as e:
+ print(e)
+
+ print(f"model_path is {model_path}")
+
+ return model_path
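+# custom_hf_download usage sketch (illustrative; the file name is only an example):
+#   ckpt_path = custom_hf_download(HF_MODEL_NAME, "body_pose_model.pth")
+#   # -> local path under annotator_ckpts_path/lllyasviel/Annotators/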
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/LICENSE b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..5dc42cbd843dbce4af5fe089fc97dbff9d404f57
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Intelligent Systems Lab Org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..62dd48c8ee8340b4db13ad4010da9992ea10962a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/__init__.py
@@ -0,0 +1,111 @@
+import os
+
+import cv2
+import numpy as np
+import torch
+from einops import rearrange
+from PIL import Image
+
+from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, HF_MODEL_NAME, DEPTH_ANYTHING_MODEL_NAME
+from .zoedepth.models.zoedepth.zoedepth_v1 import ZoeDepth
+from .zoedepth.models.zoedepth_anything.zoedepth_v1 import ZoeDepth as ZoeDepthAnything
+from .zoedepth.utils.config import get_config
+
+
+class ZoeDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=HF_MODEL_NAME, filename="ZoeD_M12_N.pt"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename)
+
+ conf = get_config("zoedepth", "infer")
+ model = ZoeDepth.build_from_config(conf)
+ model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))['model'])
+ model.eval()
+
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ image_depth = input_image
+ with torch.no_grad():
+ image_depth = torch.from_numpy(image_depth).float().to(self.device)
+ image_depth = image_depth / 255.0
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
+ depth = self.model.infer(image_depth)
+
+ depth = depth[0, 0].cpu().numpy()
+
+ vmin = np.percentile(depth, 2)
+ vmax = np.percentile(depth, 85)
+
+ depth -= vmin
+ depth /= vmax - vmin
+ depth = 1.0 - depth
+ depth_image = (depth * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = remove_pad(HWC3(depth_image))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
+
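+# Usage sketch for ZoeDetector (illustrative; device and image are assumptions):
+#   zoe = ZoeDetector.from_pretrained().to("cuda")
+#   depth_map = zoe(input_image, detect_resolution=512, output_type="pil")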
+class ZoeDepthAnythingDetector:
+ def __init__(self, model):
+ self.model = model
+ self.device = "cpu"
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_or_path=DEPTH_ANYTHING_MODEL_NAME, filename="depth_anything_metric_depth_indoor.pt"):
+ model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder="checkpoints_metric_depth", repo_type="space")
+
+ conf = get_config("zoedepth", "infer")
+ model = ZoeDepthAnything.build_from_config(conf)
+ model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))['model'])
+ model.eval()
+
+ return cls(model)
+
+ def to(self, device):
+ self.model.to(device)
+ self.device = device
+ return self
+
+ def __call__(self, input_image, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
+
+ image_depth = input_image
+ with torch.no_grad():
+ image_depth = torch.from_numpy(image_depth).float().to(self.device)
+ image_depth = image_depth / 255.0
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
+ depth = self.model.infer(image_depth)
+
+ depth = depth[0, 0].cpu().numpy()
+
+ vmin = np.percentile(depth, 2)
+ vmax = np.percentile(depth, 85)
+
+ depth -= vmin
+ depth /= vmax - vmin
+ depth = 1.0 - depth
+ depth_image = (depth * 255.0).clip(0, 255).astype(np.uint8)
+
+ detected_map = remove_pad(HWC3(depth_image))
+
+ if output_type == "pil":
+ detected_map = Image.fromarray(detected_map)
+
+ return detected_map
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae1a1e4e86d9a5b14586cd006ed43d2bbc9b4a6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/__init__.py
@@ -0,0 +1,24 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae1a1e4e86d9a5b14586cd006ed43d2bbc9b4a6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/__init__.py
@@ -0,0 +1,24 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/depth_anything.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/depth_anything.py
new file mode 100644
index 0000000000000000000000000000000000000000..9553f5bf896066d8cfbd68dc20f037b1f226c4c2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/depth_anything.py
@@ -0,0 +1,377 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import torch
+import torch.nn as nn
+import numpy as np
+from torchvision.transforms import Normalize
+from .dpt_dinov2.dpt import DPT_DINOv2
+from custom_controlnet_aux.util import custom_hf_download, DEPTH_ANYTHING_MODEL_NAME
+
+
+def denormalize(x):
+ """Reverses the imagenet normalization applied to the input.
+
+ Args:
+ x (torch.Tensor - shape(N,3,H,W)): input tensor
+
+ Returns:
+ torch.Tensor - shape(N,3,H,W): Denormalized input
+ """
+ mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
+ std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
+ return x * std + mean
+
+def get_activation(name, bank):
+ def hook(model, input, output):
+ bank[name] = output
+ return hook
+
+
+class Resize(object):
+ """Resize sample to given size (width, height).
+ """
+
+ def __init__(
+ self,
+ width,
+ height,
+ resize_target=True,
+ keep_aspect_ratio=False,
+ ensure_multiple_of=1,
+ resize_method="lower_bound",
+ ):
+ """Init.
+ Args:
+ width (int): desired output width
+ height (int): desired output height
+ resize_target (bool, optional):
+ True: Resize the full sample (image, mask, target).
+ False: Resize image only.
+ Defaults to True.
+ keep_aspect_ratio (bool, optional):
+ True: Keep the aspect ratio of the input sample.
+ Output sample might not have the given width and height, and
+ resize behaviour depends on the parameter 'resize_method'.
+ Defaults to False.
+ ensure_multiple_of (int, optional):
+ Output width and height is constrained to be multiple of this parameter.
+ Defaults to 1.
+ resize_method (str, optional):
+ "lower_bound": Output will be at least as large as the given size.
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
+ Defaults to "lower_bound".
+ """
+ print("Params passed to Resize transform:")
+ print("\twidth: ", width)
+ print("\theight: ", height)
+ print("\tresize_target: ", resize_target)
+ print("\tkeep_aspect_ratio: ", keep_aspect_ratio)
+ print("\tensure_multiple_of: ", ensure_multiple_of)
+ print("\tresize_method: ", resize_method)
+
+ self.__width = width
+ self.__height = height
+
+ self.__keep_aspect_ratio = keep_aspect_ratio
+ self.__multiple_of = ensure_multiple_of
+ self.__resize_method = resize_method
+
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ if max_val is not None and y > max_val:
+ y = (np.floor(x / self.__multiple_of)
+ * self.__multiple_of).astype(int)
+
+ if y < min_val:
+ y = (np.ceil(x / self.__multiple_of)
+ * self.__multiple_of).astype(int)
+
+ return y
+
+ def get_size(self, width, height):
+ # determine new height and width
+ scale_height = self.__height / height
+ scale_width = self.__width / width
+
+ if self.__keep_aspect_ratio:
+ if self.__resize_method == "lower_bound":
+ # scale such that output size is lower bound
+ if scale_width > scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "upper_bound":
+ # scale such that output size is upper bound
+ if scale_width < scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "minimal":
+ # scale as little as possible
+ if abs(1 - scale_width) < abs(1 - scale_height):
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ else:
+ raise ValueError(
+ f"resize_method {self.__resize_method} not implemented"
+ )
+
+ if self.__resize_method == "lower_bound":
+ new_height = self.constrain_to_multiple_of(
+ scale_height * height, min_val=self.__height
+ )
+ new_width = self.constrain_to_multiple_of(
+ scale_width * width, min_val=self.__width
+ )
+ elif self.__resize_method == "upper_bound":
+ new_height = self.constrain_to_multiple_of(
+ scale_height * height, max_val=self.__height
+ )
+ new_width = self.constrain_to_multiple_of(
+ scale_width * width, max_val=self.__width
+ )
+ elif self.__resize_method == "minimal":
+ new_height = self.constrain_to_multiple_of(scale_height * height)
+ new_width = self.constrain_to_multiple_of(scale_width * width)
+ else:
+ raise ValueError(
+ f"resize_method {self.__resize_method} not implemented")
+
+ return (new_width, new_height)
+
+ def __call__(self, x):
+ width, height = self.get_size(*x.shape[-2:][::-1])
+ return nn.functional.interpolate(x, (int(height), int(width)), mode='bilinear', align_corners=True)
+
+class PrepForMidas(object):
+ def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True):
+ if isinstance(img_size, int):
+ img_size = (img_size, img_size)
+ net_h, net_w = img_size
+ # self.normalization = Normalize(
+ # mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+ self.normalization = Normalize(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=14, resize_method=resize_mode) \
+ if do_resize else nn.Identity()
+
+ def __call__(self, x):
+ return self.normalization(self.resizer(x))
+
+
+class DepthAnythingCore(nn.Module):
+ def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True,
+ img_size=384, **kwargs):
+ """Midas Base model used for multi-scale feature extraction.
+
+ Args:
+ midas (torch.nn.Module): Midas model.
+ trainable (bool, optional): Train midas model. Defaults to False.
+ fetch_features (bool, optional): Extract multi-scale features. Defaults to True.
+ layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1').
+ freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False.
+ keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True.
+ img_size (int, tuple, optional): Input resolution. Defaults to 384.
+ """
+ super().__init__()
+ self.core = midas
+ self.output_channels = None
+ self.core_out = {}
+ self.trainable = trainable
+ self.fetch_features = fetch_features
+ # midas.scratch.output_conv = nn.Identity()
+ self.handles = []
+ # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1']
+ self.layer_names = layer_names
+
+ self.set_trainable(trainable)
+ self.set_fetch_features(fetch_features)
+
+ self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio,
+ img_size=img_size, do_resize=kwargs.get('do_resize', True))
+
+ if freeze_bn:
+ self.freeze_bn()
+
+ def set_trainable(self, trainable):
+ self.trainable = trainable
+ if trainable:
+ self.unfreeze()
+ else:
+ self.freeze()
+ return self
+
+ def set_fetch_features(self, fetch_features):
+ self.fetch_features = fetch_features
+ if fetch_features:
+ if len(self.handles) == 0:
+ self.attach_hooks(self.core)
+ else:
+ self.remove_hooks()
+ return self
+
+ def freeze(self):
+ for p in self.parameters():
+ p.requires_grad = False
+ self.trainable = False
+ return self
+
+ def unfreeze(self):
+ for p in self.parameters():
+ p.requires_grad = True
+ self.trainable = True
+ return self
+
+ def freeze_bn(self):
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ return self
+
+ def forward(self, x, denorm=False, return_rel_depth=False):
+ # print('input to midas:', x.shape)
+ with torch.no_grad():
+ if denorm:
+ x = denormalize(x)
+ x = self.prep(x)
+
+ with torch.set_grad_enabled(self.trainable):
+
+ rel_depth = self.core(x)
+ if not self.fetch_features:
+ return rel_depth
+ out = [self.core_out[k] for k in self.layer_names]
+
+ if return_rel_depth:
+ return rel_depth, out
+ return out
+
+ def get_rel_pos_params(self):
+ for name, p in self.core.pretrained.named_parameters():
+ if "pos_embed" in name:
+ yield p
+
+ def get_enc_params_except_rel_pos(self):
+ for name, p in self.core.pretrained.named_parameters():
+ if "pos_embed" not in name:
+ yield p
+
+ def freeze_encoder(self, freeze_rel_pos=False):
+ if freeze_rel_pos:
+ for p in self.core.pretrained.parameters():
+ p.requires_grad = False
+ else:
+ for p in self.get_enc_params_except_rel_pos():
+ p.requires_grad = False
+ return self
+
+ def attach_hooks(self, midas):
+ if len(self.handles) > 0:
+ self.remove_hooks()
+ if "out_conv" in self.layer_names:
+ self.handles.append(list(midas.depth_head.scratch.output_conv2.children())[
+ 1].register_forward_hook(get_activation("out_conv", self.core_out)))
+ if "r4" in self.layer_names:
+ self.handles.append(midas.depth_head.scratch.refinenet4.register_forward_hook(
+ get_activation("r4", self.core_out)))
+ if "r3" in self.layer_names:
+ self.handles.append(midas.depth_head.scratch.refinenet3.register_forward_hook(
+ get_activation("r3", self.core_out)))
+ if "r2" in self.layer_names:
+ self.handles.append(midas.depth_head.scratch.refinenet2.register_forward_hook(
+ get_activation("r2", self.core_out)))
+ if "r1" in self.layer_names:
+ self.handles.append(midas.depth_head.scratch.refinenet1.register_forward_hook(
+ get_activation("r1", self.core_out)))
+ if "l4_rn" in self.layer_names:
+ self.handles.append(midas.depth_head.scratch.layer4_rn.register_forward_hook(
+ get_activation("l4_rn", self.core_out)))
+
+ return self
+
+ def remove_hooks(self):
+ for h in self.handles:
+ h.remove()
+ return self
+
+ def __del__(self):
+ self.remove_hooks()
+
+ def set_output_channels(self):
+ self.output_channels = [256, 256, 256, 256, 256]
+
+ @staticmethod
+ def build(midas_model_type="dinov2_large", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs):
+ if "img_size" in kwargs:
+ kwargs = DepthAnythingCore.parse_img_size(kwargs)
+ img_size = kwargs.pop("img_size", [384, 384])
+
+ depth_anything = DPT_DINOv2(out_channels=[256, 512, 1024, 1024], use_clstoken=False)
+ depth_anything_path = custom_hf_download(DEPTH_ANYTHING_MODEL_NAME, "depth_anything_vitl14.pth", subfolder="checkpoints", repo_type="space")
+ state_dict = torch.load(depth_anything_path, map_location='cpu')
+ depth_anything.load_state_dict(state_dict)
+
+ kwargs.update({'keep_aspect_ratio': force_keep_ar})
+
+ depth_anything_core = DepthAnythingCore(depth_anything, trainable=train_midas, fetch_features=fetch_features,
+ freeze_bn=freeze_bn, img_size=img_size, **kwargs)
+
+ depth_anything_core.set_output_channels()
+ return depth_anything_core
+
+ @staticmethod
+ def parse_img_size(config):
+ assert 'img_size' in config
+ if isinstance(config['img_size'], str):
+ assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W"
+ config['img_size'] = list(map(int, config['img_size'].split(",")))
+ assert len(
+ config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W"
+ elif isinstance(config['img_size'], int):
+ config['img_size'] = [config['img_size'], config['img_size']]
+ else:
+ assert isinstance(config['img_size'], list) and len(
+ config['img_size']) == 2, "img_size should be a list of H,W"
+ return config
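+    # Example (illustrative): parse_img_size normalizes the img_size entry of the
+    # config, so "392,518" becomes [392, 518], a single int 518 becomes [518, 518],
+    # and an existing two-element [H, W] list is passed through unchanged.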
+
+
+nchannels2models = {
+ tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"],
+ (512, 256, 128, 64, 64): ["MiDaS_small"]
+}
+
+# Model name to number of output channels
+MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items()
+ for m in v
+ }
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/dpt_dinov2/blocks.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/dpt_dinov2/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc72ed8ac82129e69444f237982754638e84f60a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/dpt_dinov2/blocks.py
@@ -0,0 +1,153 @@
+import torch.nn as nn
+
+
+def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+ scratch = nn.Module()
+
+ out_shape1 = out_shape
+ out_shape2 = out_shape
+ out_shape3 = out_shape
+ if len(in_shape) >= 4:
+ out_shape4 = out_shape
+
+ if expand:
+ out_shape1 = out_shape
+ out_shape2 = out_shape*2
+ out_shape3 = out_shape*4
+ if len(in_shape) >= 4:
+ out_shape4 = out_shape*8
+
+ scratch.layer1_rn = nn.Conv2d(
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+ scratch.layer2_rn = nn.Conv2d(
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+ scratch.layer3_rn = nn.Conv2d(
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+ if len(in_shape) >= 4:
+ scratch.layer4_rn = nn.Conv2d(
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+ )
+
+ return scratch
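+# Example (illustrative): _make_scratch([256, 512, 1024, 1024], 256) creates four
+# 3x3 projection convolutions (layer1_rn .. layer4_rn) that bring every backbone
+# feature map to a common 256-channel width; with expand=True the output widths
+# would instead grow as 256, 512, 1024, 2048.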
+
+
+class ResidualConvUnit(nn.Module):
+ """Residual convolution module.
+ """
+
+ def __init__(self, features, activation, bn):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.bn = bn
+
+ self.groups=1
+
+ self.conv1 = nn.Conv2d(
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+ )
+
+ self.conv2 = nn.Conv2d(
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+ )
+
+ if self.bn==True:
+ self.bn1 = nn.BatchNorm2d(features)
+ self.bn2 = nn.BatchNorm2d(features)
+
+ self.activation = activation
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ """Forward pass.
+
+ Args:
+ x (tensor): input
+
+ Returns:
+ tensor: output
+ """
+
+ out = self.activation(x)
+ out = self.conv1(out)
+ if self.bn==True:
+ out = self.bn1(out)
+
+ out = self.activation(out)
+ out = self.conv2(out)
+ if self.bn==True:
+ out = self.bn2(out)
+
+ if self.groups > 1:
+ out = self.conv_merge(out)
+
+ return self.skip_add.add(out, x)
+
+
+class FeatureFusionBlock(nn.Module):
+ """Feature fusion block.
+ """
+
+ def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock, self).__init__()
+
+ self.deconv = deconv
+ self.align_corners = align_corners
+
+ self.groups=1
+
+ self.expand = expand
+ out_features = features
+ if self.expand==True:
+ out_features = features//2
+
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
+
+ self.skip_add = nn.quantized.FloatFunctional()
+
+ self.size=size
+
+ def forward(self, *xs, size=None):
+ """Forward pass.
+
+ Returns:
+ tensor: output
+ """
+ output = xs[0]
+
+ if len(xs) == 2:
+ res = self.resConfUnit1(xs[1])
+ output = self.skip_add.add(output, res)
+
+ output = self.resConfUnit2(output)
+
+ if (size is None) and (self.size is None):
+ modifier = {"scale_factor": 2}
+ elif size is None:
+ modifier = {"size": self.size}
+ else:
+ modifier = {"size": size}
+
+ output = nn.functional.interpolate(
+ output, **modifier, mode="bilinear", align_corners=self.align_corners
+ )
+
+ output = self.out_conv(output)
+
+ return output
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/dpt_dinov2/dpt.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/dpt_dinov2/dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0e8b7ccbc71e81b1360c2407ab31fed1f91fa8a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/dpt_dinov2/dpt.py
@@ -0,0 +1,158 @@
+import torch
+import torch.nn as nn
+
+from .blocks import FeatureFusionBlock, _make_scratch
+import torch.nn.functional as F
+from custom_controlnet_aux.util import TORCHHUB_PATH
+
+
+def _make_fusion_block(features, use_bn, size = None):
+ return FeatureFusionBlock(
+ features,
+ nn.ReLU(False),
+ deconv=False,
+ bn=use_bn,
+ expand=False,
+ align_corners=True,
+ size=size,
+ )
+
+
+class DPTHead(nn.Module):
+ def __init__(self, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False):
+ super(DPTHead, self).__init__()
+
+ self.use_clstoken = use_clstoken
+
+ # out_channels = [in_channels // 8, in_channels // 4, in_channels // 2, in_channels]
+ # out_channels = [in_channels // 4, in_channels // 2, in_channels, in_channels]
+ # out_channels = [in_channels, in_channels, in_channels, in_channels]
+
+ self.projects = nn.ModuleList([
+ nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_channel,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ) for out_channel in out_channels
+ ])
+
+ self.resize_layers = nn.ModuleList([
+ nn.ConvTranspose2d(
+ in_channels=out_channels[0],
+ out_channels=out_channels[0],
+ kernel_size=4,
+ stride=4,
+ padding=0),
+ nn.ConvTranspose2d(
+ in_channels=out_channels[1],
+ out_channels=out_channels[1],
+ kernel_size=2,
+ stride=2,
+ padding=0),
+ nn.Identity(),
+ nn.Conv2d(
+ in_channels=out_channels[3],
+ out_channels=out_channels[3],
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ ])
+
+ if use_clstoken:
+ self.readout_projects = nn.ModuleList()
+ for _ in range(len(self.projects)):
+ self.readout_projects.append(
+ nn.Sequential(
+ nn.Linear(2 * in_channels, in_channels),
+ nn.GELU()))
+
+ self.scratch = _make_scratch(
+ out_channels,
+ features,
+ groups=1,
+ expand=False,
+ )
+
+ self.scratch.stem_transpose = None
+
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
+
+ head_features_1 = features
+ head_features_2 = 32
+
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
+
+ self.scratch.output_conv2 = nn.Sequential(
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
+ nn.ReLU(True),
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
+ nn.ReLU(True),
+ nn.Identity(),
+ )
+
+ def forward(self, out_features, patch_h, patch_w):
+ out = []
+ for i, x in enumerate(out_features):
+ if self.use_clstoken:
+ x, cls_token = x[0], x[1]
+ readout = cls_token.unsqueeze(1).expand_as(x)
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
+ else:
+ x = x[0]
+
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
+
+ x = self.projects[i](x)
+ x = self.resize_layers[i](x)
+
+ out.append(x)
+
+ layer_1, layer_2, layer_3, layer_4 = out
+
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
+
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+
+ out = self.scratch.output_conv1(path_1)
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
+ out = self.scratch.output_conv2(out)
+
+ return out
+
+
+class DPT_DINOv2(nn.Module):
+ def __init__(self, encoder='vitl', features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False):
+
+ super(DPT_DINOv2, self).__init__()
+
+ torch.manual_seed(1)
+
+ self.pretrained = torch.hub.load(TORCHHUB_PATH / 'facebookresearch_dinov2_main', 'dinov2_{:}14'.format(encoder), source='local', pretrained=False)
+
+ dim = self.pretrained.blocks[0].attn.qkv.in_features
+
+ self.depth_head = DPTHead(dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
+
+ def forward(self, x):
+ h, w = x.shape[-2:]
+
+ features = self.pretrained.get_intermediate_layers(x, 4, return_class_token=True)
+
+ patch_h, patch_w = h // 14, w // 14
+
+ depth = self.depth_head(features, patch_h, patch_w)
+ depth = F.interpolate(depth, size=(h, w), mode="bilinear", align_corners=True)
+ depth = F.relu(depth)
+
+ return depth.squeeze(1)
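+# Shape note (illustrative): for an input of shape (N, 3, H, W) with H and W
+# divisible by 14 (the DINOv2 patch size), DPT_DINOv2.forward returns a relative
+# depth map of shape (N, H, W) at the original input resolution.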
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/midas.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/midas.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed406b08d61088f6f8d04cd30be15f69becc73a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/base_models/midas.py
@@ -0,0 +1,383 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import os
+import torch
+import torch.nn as nn
+import numpy as np
+from torchvision.transforms import Normalize
+import inspect
+from pathlib import Path
+
+
+def denormalize(x):
+ """Reverses the imagenet normalization applied to the input.
+
+ Args:
+ x (torch.Tensor - shape(N,3,H,W)): input tensor
+
+ Returns:
+ torch.Tensor - shape(N,3,H,W): Denormalized input
+ """
+ mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
+ std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
+ return x * std + mean
+
+def get_activation(name, bank):
+ def hook(model, input, output):
+ bank[name] = output
+ return hook
+
+
+class Resize(object):
+ """Resize sample to given size (width, height).
+ """
+
+ def __init__(
+ self,
+ width,
+ height,
+ resize_target=True,
+ keep_aspect_ratio=False,
+ ensure_multiple_of=1,
+ resize_method="lower_bound",
+ ):
+ """Init.
+ Args:
+ width (int): desired output width
+ height (int): desired output height
+ resize_target (bool, optional):
+ True: Resize the full sample (image, mask, target).
+ False: Resize image only.
+ Defaults to True.
+ keep_aspect_ratio (bool, optional):
+ True: Keep the aspect ratio of the input sample.
+ Output sample might not have the given width and height, and
+ resize behaviour depends on the parameter 'resize_method'.
+ Defaults to False.
+ ensure_multiple_of (int, optional):
+ Output width and height is constrained to be multiple of this parameter.
+ Defaults to 1.
+ resize_method (str, optional):
+ "lower_bound": Output will be at least as large as the given size.
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
+ Defaults to "lower_bound".
+ """
+ # print("Params passed to Resize transform:")
+ # print("\twidth: ", width)
+ # print("\theight: ", height)
+ # print("\tresize_target: ", resize_target)
+ # print("\tkeep_aspect_ratio: ", keep_aspect_ratio)
+ # print("\tensure_multiple_of: ", ensure_multiple_of)
+ # print("\tresize_method: ", resize_method)
+
+ self.__width = width
+ self.__height = height
+
+ self.__keep_aspect_ratio = keep_aspect_ratio
+ self.__multiple_of = ensure_multiple_of
+ self.__resize_method = resize_method
+
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+ if max_val is not None and y > max_val:
+ y = (np.floor(x / self.__multiple_of)
+ * self.__multiple_of).astype(int)
+
+ if y < min_val:
+ y = (np.ceil(x / self.__multiple_of)
+ * self.__multiple_of).astype(int)
+
+ return y
+
+ def get_size(self, width, height):
+ # determine new height and width
+ scale_height = self.__height / height
+ scale_width = self.__width / width
+
+ if self.__keep_aspect_ratio:
+ if self.__resize_method == "lower_bound":
+ # scale such that output size is lower bound
+ if scale_width > scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "upper_bound":
+ # scale such that output size is upper bound
+ if scale_width < scale_height:
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ elif self.__resize_method == "minimal":
+                # scale as little as possible
+ if abs(1 - scale_width) < abs(1 - scale_height):
+ # fit width
+ scale_height = scale_width
+ else:
+ # fit height
+ scale_width = scale_height
+ else:
+ raise ValueError(
+ f"resize_method {self.__resize_method} not implemented"
+ )
+
+ if self.__resize_method == "lower_bound":
+ new_height = self.constrain_to_multiple_of(
+ scale_height * height, min_val=self.__height
+ )
+ new_width = self.constrain_to_multiple_of(
+ scale_width * width, min_val=self.__width
+ )
+ elif self.__resize_method == "upper_bound":
+ new_height = self.constrain_to_multiple_of(
+ scale_height * height, max_val=self.__height
+ )
+ new_width = self.constrain_to_multiple_of(
+ scale_width * width, max_val=self.__width
+ )
+ elif self.__resize_method == "minimal":
+ new_height = self.constrain_to_multiple_of(scale_height * height)
+ new_width = self.constrain_to_multiple_of(scale_width * width)
+ else:
+ raise ValueError(
+ f"resize_method {self.__resize_method} not implemented")
+
+ return (new_width, new_height)
+
+ def __call__(self, x):
+ width, height = self.get_size(*x.shape[-2:][::-1])
+ return nn.functional.interpolate(x, (int(height), int(width)), mode='bilinear', align_corners=True)
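+# Worked example (illustrative): Resize(384, 384, keep_aspect_ratio=True,
+# ensure_multiple_of=32, resize_method="lower_bound") maps a 640x480 (WxH) image to
+# 512x384: the larger scale factor (384/480 = 0.8) is applied to both sides so each
+# dimension is at least 384, and both results are already multiples of 32.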
+
+class PrepForMidas(object):
+ def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True):
+ if isinstance(img_size, int):
+ img_size = (img_size, img_size)
+ net_h, net_w = img_size
+ self.normalization = Normalize(
+ mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+ self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=32, resize_method=resize_mode) \
+ if do_resize else nn.Identity()
+
+ def __call__(self, x):
+ return self.normalization(self.resizer(x))
+
+
+class MidasCore(nn.Module):
+ def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True,
+ img_size=384, **kwargs):
+ """Midas Base model used for multi-scale feature extraction.
+
+ Args:
+ midas (torch.nn.Module): Midas model.
+ trainable (bool, optional): Train midas model. Defaults to False.
+ fetch_features (bool, optional): Extract multi-scale features. Defaults to True.
+ layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1').
+ freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False.
+ keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True.
+ img_size (int, tuple, optional): Input resolution. Defaults to 384.
+ """
+ super().__init__()
+ self.core = midas
+ self.output_channels = None
+ self.core_out = {}
+ self.trainable = trainable
+ self.fetch_features = fetch_features
+ # midas.scratch.output_conv = nn.Identity()
+ self.handles = []
+ # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1']
+ self.layer_names = layer_names
+
+ self.set_trainable(trainable)
+ self.set_fetch_features(fetch_features)
+
+ self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio,
+ img_size=img_size, do_resize=kwargs.get('do_resize', True))
+
+ if freeze_bn:
+ self.freeze_bn()
+
+ def set_trainable(self, trainable):
+ self.trainable = trainable
+ if trainable:
+ self.unfreeze()
+ else:
+ self.freeze()
+ return self
+
+ def set_fetch_features(self, fetch_features):
+ self.fetch_features = fetch_features
+ if fetch_features:
+ if len(self.handles) == 0:
+ self.attach_hooks(self.core)
+ else:
+ self.remove_hooks()
+ return self
+
+ def freeze(self):
+ for p in self.parameters():
+ p.requires_grad = False
+ self.trainable = False
+ return self
+
+ def unfreeze(self):
+ for p in self.parameters():
+ p.requires_grad = True
+ self.trainable = True
+ return self
+
+ def freeze_bn(self):
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ return self
+
+ def forward(self, x, denorm=False, return_rel_depth=False):
+ with torch.no_grad():
+ if denorm:
+ x = denormalize(x)
+ x = self.prep(x)
+ # print("Shape after prep: ", x.shape)
+
+ with torch.set_grad_enabled(self.trainable):
+
+ # print("Input size to Midascore", x.shape)
+ rel_depth = self.core(x)
+ # print("Output from custom_midas_repo.midas shape", rel_depth.shape)
+ if not self.fetch_features:
+ return rel_depth
+ out = [self.core_out[k] for k in self.layer_names]
+
+ if return_rel_depth:
+ return rel_depth, out
+ return out
+
+ def get_rel_pos_params(self):
+ for name, p in self.core.pretrained.named_parameters():
+ if "relative_position" in name:
+ yield p
+
+ def get_enc_params_except_rel_pos(self):
+ for name, p in self.core.pretrained.named_parameters():
+ if "relative_position" not in name:
+ yield p
+
+ def freeze_encoder(self, freeze_rel_pos=False):
+ if freeze_rel_pos:
+ for p in self.core.pretrained.parameters():
+ p.requires_grad = False
+ else:
+ for p in self.get_enc_params_except_rel_pos():
+ p.requires_grad = False
+ return self
+
+ def attach_hooks(self, midas):
+ if len(self.handles) > 0:
+ self.remove_hooks()
+ if "out_conv" in self.layer_names:
+ self.handles.append(list(midas.scratch.output_conv.children())[
+ 3].register_forward_hook(get_activation("out_conv", self.core_out)))
+ if "r4" in self.layer_names:
+ self.handles.append(midas.scratch.refinenet4.register_forward_hook(
+ get_activation("r4", self.core_out)))
+ if "r3" in self.layer_names:
+ self.handles.append(midas.scratch.refinenet3.register_forward_hook(
+ get_activation("r3", self.core_out)))
+ if "r2" in self.layer_names:
+ self.handles.append(midas.scratch.refinenet2.register_forward_hook(
+ get_activation("r2", self.core_out)))
+ if "r1" in self.layer_names:
+ self.handles.append(midas.scratch.refinenet1.register_forward_hook(
+ get_activation("r1", self.core_out)))
+ if "l4_rn" in self.layer_names:
+ self.handles.append(midas.scratch.layer4_rn.register_forward_hook(
+ get_activation("l4_rn", self.core_out)))
+
+ return self
+
+ def remove_hooks(self):
+ for h in self.handles:
+ h.remove()
+ return self
+
+ def __del__(self):
+ self.remove_hooks()
+
+ def set_output_channels(self, model_type):
+ self.output_channels = MIDAS_SETTINGS[model_type]
+
+ @staticmethod
+ def build(midas_model_type="DPT_BEiT_L_384", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs):
+ if midas_model_type not in MIDAS_SETTINGS:
+ raise ValueError(
+ f"Invalid model type: {midas_model_type}. Must be one of {list(MIDAS_SETTINGS.keys())}")
+ if "img_size" in kwargs:
+ kwargs = MidasCore.parse_img_size(kwargs)
+ img_size = kwargs.pop("img_size", [384, 384])
+ # print("img_size", img_size)
+ import custom_midas_repo
+ midas_path = Path(inspect.getfile(custom_midas_repo)).parent.resolve()
+ del custom_midas_repo
+ midas = torch.hub.load(midas_path, midas_model_type,
+ pretrained=use_pretrained_midas, force_reload=force_reload, source='local')
+ kwargs.update({'keep_aspect_ratio': force_keep_ar})
+ midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features,
+ freeze_bn=freeze_bn, img_size=img_size, **kwargs)
+ midas_core.set_output_channels(midas_model_type)
+ return midas_core
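+    # Usage sketch (illustrative; model name and sizes are examples):
+    #   core = MidasCore.build("DPT_BEiT_L_384", fetch_features=True, img_size="384,512")
+    #   rel_depth, feats = core(img, return_rel_depth=True)  # img: (N, 3, H, W) in [0, 1]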
+
+ @staticmethod
+ def build_from_config(config):
+ return MidasCore.build(**config)
+
+ @staticmethod
+ def parse_img_size(config):
+ assert 'img_size' in config
+ if isinstance(config['img_size'], str):
+ assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W"
+ config['img_size'] = list(map(int, config['img_size'].split(",")))
+ assert len(
+ config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W"
+ elif isinstance(config['img_size'], int):
+ config['img_size'] = [config['img_size'], config['img_size']]
+ else:
+ assert isinstance(config['img_size'], list) and len(
+ config['img_size']) == 2, "img_size should be a list of H,W"
+ return config
+
+
+nchannels2models = {
+ tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"],
+ (512, 256, 128, 64, 64): ["MiDaS_small"]
+}
+
+# Model name to number of output channels
+MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items()
+ for m in v
+ }
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/builder.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c6f7bb6fa39af071b4d6ca267b53084b3ee5f6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/builder.py
@@ -0,0 +1,51 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+from importlib import import_module
+from .depth_model import DepthModel
+
+def build_model(config) -> DepthModel:
+ """Builds a model from a config. The model is specified by the model name and version in the config. The model is then constructed using the build_from_config function of the model interface.
+ This function should be used to construct models for training and evaluation.
+
+ Args:
+ config (dict): Config dict. Config is constructed in utils/config.py. Each model has its own config file(s) saved in its root model folder.
+
+ Returns:
+ torch.nn.Module: Model corresponding to name and version as specified in config
+ """
+ module_name = f"zoedepth.models.{config.model}"
+ try:
+ module = import_module(module_name)
+ except ModuleNotFoundError as e:
+ # print the original error message
+ print(e)
+ raise ValueError(
+ f"Model {config.model} not found. Refer above error for details.") from e
+ try:
+ get_version = getattr(module, "get_version")
+ except AttributeError as e:
+ raise ValueError(
+ f"Model {config.model} has no get_version function.") from e
+ return get_version(config.version_name).build_from_config(config)
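+# Resolution sketch (illustrative, example values): a config object with
+# model="zoedepth" and version_name="v1" resolves to
+# zoedepth.models.zoedepth.get_version("v1").build_from_config(config);
+# an unknown model name raises ValueError instead of ModuleNotFoundError.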
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/depth_model.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/depth_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4301a4f65755a011f5384b59fa4d931cd977948
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/depth_model.py
@@ -0,0 +1,152 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+import PIL.Image
+from PIL import Image
+from typing import Union
+
+
+class DepthModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.device = 'cpu'
+
+ def to(self, device) -> nn.Module:
+ self.device = device
+ return super().to(device)
+
+ def forward(self, x, *args, **kwargs):
+ raise NotImplementedError
+
+ def _infer(self, x: torch.Tensor):
+ """
+ Inference interface for the model
+ Args:
+ x (torch.Tensor): input tensor of shape (b, c, h, w)
+ Returns:
+ torch.Tensor: output tensor of shape (b, 1, h, w)
+ """
+ return self(x)['metric_depth']
+
+ def _infer_with_pad_aug(self, x: torch.Tensor, pad_input: bool=True, fh: float=3, fw: float=3, upsampling_mode: str='bicubic', padding_mode="reflect", **kwargs) -> torch.Tensor:
+ """
+ Inference interface for the model with padding augmentation
+ Padding augmentation fixes the boundary artifacts in the output depth map.
+        Boundary artifacts are sometimes caused by the model having been trained on the NYU raw dataset, which has a black or white border around the image.
+ This augmentation pads the input image and crops the prediction back to the original size / view.
+
+ Note: This augmentation is not required for the models trained with 'avoid_boundary'=True.
+ Args:
+ x (torch.Tensor): input tensor of shape (b, c, h, w)
+ pad_input (bool, optional): whether to pad the input or not. Defaults to True.
+ fh (float, optional): height padding factor. The padding is calculated as sqrt(h/2) * fh. Defaults to 3.
+ fw (float, optional): width padding factor. The padding is calculated as sqrt(w/2) * fw. Defaults to 3.
+ upsampling_mode (str, optional): upsampling mode. Defaults to 'bicubic'.
+ padding_mode (str, optional): padding mode. Defaults to "reflect".
+ Returns:
+ torch.Tensor: output tensor of shape (b, 1, h, w)
+ """
+ # assert x is nchw and c = 3
+ assert x.dim() == 4, "x must be 4 dimensional, got {}".format(x.dim())
+ assert x.shape[1] == 3, "x must have 3 channels, got {}".format(x.shape[1])
+
+ if pad_input:
+            assert fh > 0 or fw > 0, "at least one of fh and fw must be greater than 0"
+ pad_h = int(np.sqrt(x.shape[2]/2) * fh)
+ pad_w = int(np.sqrt(x.shape[3]/2) * fw)
+ padding = [pad_w, pad_w]
+ if pad_h > 0:
+ padding += [pad_h, pad_h]
+
+ x = F.pad(x, padding, mode=padding_mode, **kwargs)
+ out = self._infer(x)
+ if out.shape[-2:] != x.shape[-2:]:
+ out = F.interpolate(out, size=(x.shape[2], x.shape[3]), mode=upsampling_mode, align_corners=False)
+ if pad_input:
+ # crop to the original size, handling the case where pad_h and pad_w is 0
+ if pad_h > 0:
+ out = out[:, :, pad_h:-pad_h,:]
+ if pad_w > 0:
+ out = out[:, :, :, pad_w:-pad_w]
+ return out
+
+ def infer_with_flip_aug(self, x, pad_input: bool=True, **kwargs) -> torch.Tensor:
+ """
+ Inference interface for the model with horizontal flip augmentation
+ Horizontal flip augmentation improves the accuracy of the model by averaging the output of the model with and without horizontal flip.
+ Args:
+ x (torch.Tensor): input tensor of shape (b, c, h, w)
+ pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
+ Returns:
+ torch.Tensor: output tensor of shape (b, 1, h, w)
+ """
+ # infer with horizontal flip and average
+ out = self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)
+ out_flip = self._infer_with_pad_aug(torch.flip(x, dims=[3]), pad_input=pad_input, **kwargs)
+ out = (out + torch.flip(out_flip, dims=[3])) / 2
+ return out
+
+ def infer(self, x, pad_input: bool=True, with_flip_aug: bool=True, **kwargs) -> torch.Tensor:
+ """
+ Inference interface for the model
+ Args:
+ x (torch.Tensor): input tensor of shape (b, c, h, w)
+ pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
+ with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
+ Returns:
+ torch.Tensor: output tensor of shape (b, 1, h, w)
+ """
+ if with_flip_aug:
+ return self.infer_with_flip_aug(x, pad_input=pad_input, **kwargs)
+ else:
+ return self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)
+
+ @torch.no_grad()
+ def infer_pil(self, pil_img, pad_input: bool=True, with_flip_aug: bool=True, output_type: str="numpy", **kwargs) -> Union[np.ndarray, PIL.Image.Image, torch.Tensor]:
+ """
+ Inference interface for the model for PIL image
+ Args:
+ pil_img (PIL.Image.Image): input PIL image
+ pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
+ with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
+ output_type (str, optional): output type. Supported values are 'numpy', 'pil' and 'tensor'. Defaults to "numpy".
+ """
+ x = transforms.ToTensor()(pil_img).unsqueeze(0).to(self.device)
+ out_tensor = self.infer(x, pad_input=pad_input, with_flip_aug=with_flip_aug, **kwargs)
+ if output_type == "numpy":
+ return out_tensor.squeeze().cpu().numpy()
+ elif output_type == "pil":
+ # uint16 is required for depth pil image
+ out_16bit_numpy = (out_tensor.squeeze().cpu().numpy()*256).astype(np.uint16)
+ return Image.fromarray(out_16bit_numpy)
+ elif output_type == "tensor":
+ return out_tensor.squeeze().cpu()
+ else:
+ raise ValueError(f"output_type {output_type} not supported. Supported values are 'numpy', 'pil' and 'tensor'")
+
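+# Usage sketch (illustrative; DepthModel itself is abstract, so a concrete model
+# such as the one returned by models.builder.build_model is assumed):
+#   model = build_model(config).to(device).eval()
+#   depth = model.infer_pil(Image.open("example.jpg"))  # float32 numpy array, shape (H, W)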
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e793398b6d348429b7035e1bbac3e070dcb5b9c2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/__init__.py
@@ -0,0 +1,23 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/attractor.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/attractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e3473ca6e2271dc28666314cf8f92f52f7e3c6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/attractor.py
@@ -0,0 +1,208 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import torch
+import torch.nn as nn
+
+
+@torch.jit.script
+def exp_attractor(dx, alpha: float = 300, gamma: int = 2):
+ """Exponential attractor: dc = exp(-alpha*|dx|^gamma) * dx , where dx = a - c, a = attractor point, c = bin center, dc = shift in bin centermmary for exp_attractor
+
+ Args:
+ dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
+ alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300.
+ gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2.
+
+ Returns:
+ torch.Tensor : Delta shifts - dc; New bin centers = Old bin centers + dc
+ """
+ return torch.exp(-alpha*(torch.abs(dx)**gamma)) * (dx)
+
+
+@torch.jit.script
+def inv_attractor(dx, alpha: float = 300, gamma: int = 2):
+ """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
+ This is the default one according to the accompanying paper.
+
+ Args:
+ dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
+ alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300.
+ gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2.
+
+ Returns:
+ torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc
+ """
+ return dx.div(1+alpha*dx.pow(gamma))
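+# Worked example (illustrative): with the defaults alpha=300 and gamma=2, a bin
+# center sitting dx = 0.1 away from an attractor is shifted by
+# inv_attractor(0.1) = 0.1 / (1 + 300 * 0.1**2) = 0.025, i.e. a quarter of the way
+# towards the attractor; larger alpha values therefore yield smaller shifts.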
+
+
+class AttractorLayer(nn.Module):
+ def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10,
+ alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False):
+ """
+ Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth)
+ """
+ super().__init__()
+
+ self.n_attractors = n_attractors
+ self.n_bins = n_bins
+ self.min_depth = min_depth
+ self.max_depth = max_depth
+ self.alpha = alpha
+ self.gamma = gamma
+ self.kind = kind
+ self.attractor_type = attractor_type
+ self.memory_efficient = memory_efficient
+
+ self._net = nn.Sequential(
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(mlp_dim, n_attractors*2, 1, 1, 0), # x2 for linear norm
+ nn.ReLU(inplace=True)
+ )
+
+ def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
+ """
+ Args:
+ x (torch.Tensor) : feature block; shape - n, c, h, w
+ b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w
+
+ Returns:
+ tuple(torch.Tensor,torch.Tensor) : new bin centers normed and scaled; shape - n, nbins, h, w
+ """
+ if prev_b_embedding is not None:
+ if interpolate:
+ prev_b_embedding = nn.functional.interpolate(
+ prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
+ x = x + prev_b_embedding
+
+ A = self._net(x)
+ eps = 1e-3
+ A = A + eps
+ n, c, h, w = A.shape
+ A = A.view(n, self.n_attractors, 2, h, w)
+ A_normed = A / A.sum(dim=2, keepdim=True) # n, a, 2, h, w
+ A_normed = A[:, :, 0, ...] # n, na, h, w
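+        # NOTE: the normalized attractors computed above are immediately replaced by
+        # the raw first component of A, matching the reference ZoeDepth implementation.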
+
+ b_prev = nn.functional.interpolate(
+ b_prev, (h, w), mode='bilinear', align_corners=True)
+ b_centers = b_prev
+
+ if self.attractor_type == 'exp':
+ dist = exp_attractor
+ else:
+ dist = inv_attractor
+
+ if not self.memory_efficient:
+ func = {'mean': torch.mean, 'sum': torch.sum}[self.kind]
+ # .shape N, nbins, h, w
+ delta_c = func(dist(A_normed.unsqueeze(
+ 2) - b_centers.unsqueeze(1)), dim=1)
+ else:
+ delta_c = torch.zeros_like(b_centers, device=b_centers.device)
+ for i in range(self.n_attractors):
+ # .shape N, nbins, h, w
+ delta_c += dist(A_normed[:, i, ...].unsqueeze(1) - b_centers)
+
+ if self.kind == 'mean':
+ delta_c = delta_c / self.n_attractors
+
+ b_new_centers = b_centers + delta_c
+ B_centers = (self.max_depth - self.min_depth) * \
+ b_new_centers + self.min_depth
+ B_centers, _ = torch.sort(B_centers, dim=1)
+ B_centers = torch.clip(B_centers, self.min_depth, self.max_depth)
+ return b_new_centers, B_centers
+
+
+class AttractorLayerUnnormed(nn.Module):
+ def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10,
+ alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False):
+ """
+ Attractor layer for bin centers. Bin centers are unbounded
+ """
+ super().__init__()
+
+ self.n_attractors = n_attractors
+ self.n_bins = n_bins
+ self.min_depth = min_depth
+ self.max_depth = max_depth
+ self.alpha = alpha
+ self.gamma = gamma
+ self.kind = kind
+ self.attractor_type = attractor_type
+ self.memory_efficient = memory_efficient
+
+ self._net = nn.Sequential(
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0),
+ nn.Softplus()
+ )
+
+ def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
+ """
+ Args:
+ x (torch.Tensor) : feature block; shape - n, c, h, w
+ b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w
+
+ Returns:
+ tuple(torch.Tensor,torch.Tensor) : new bin centers unbounded; shape - n, nbins, h, w. Two outputs just to keep the API consistent with the normed version
+ """
+ if prev_b_embedding is not None:
+ if interpolate:
+ prev_b_embedding = nn.functional.interpolate(
+ prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
+ x = x + prev_b_embedding
+
+ A = self._net(x)
+ n, c, h, w = A.shape
+
+ b_prev = nn.functional.interpolate(
+ b_prev, (h, w), mode='bilinear', align_corners=True)
+ b_centers = b_prev
+
+ if self.attractor_type == 'exp':
+ dist = exp_attractor
+ else:
+ dist = inv_attractor
+
+ if not self.memory_efficient:
+ func = {'mean': torch.mean, 'sum': torch.sum}[self.kind]
+ # .shape N, nbins, h, w
+ delta_c = func(
+ dist(A.unsqueeze(2) - b_centers.unsqueeze(1)), dim=1)
+ else:
+ delta_c = torch.zeros_like(b_centers, device=b_centers.device)
+ for i in range(self.n_attractors):
+ delta_c += dist(A[:, i, ...].unsqueeze(1) -
+ b_centers) # .shape N, nbins, h, w
+
+ if self.kind == 'mean':
+ delta_c = delta_c / self.n_attractors
+
+ b_new_centers = b_centers + delta_c
+ B_centers = b_new_centers
+
+ return b_new_centers, B_centers
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/dist_layers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/dist_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5ff793e94aeb43aa554ef8c2392080df5572e19
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/dist_layers.py
@@ -0,0 +1,121 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import torch
+import torch.nn as nn
+
+
+def log_binom(n, k, eps=1e-7):
+ """ log(nCk) using stirling approximation """
+ n = n + eps
+ k = k + eps
+ return n * torch.log(n) - k * torch.log(k) - (n-k) * torch.log(n-k+eps)
+
+
+class LogBinomial(nn.Module):
+ def __init__(self, n_classes=256, act=torch.softmax):
+ """Compute log binomial distribution for n_classes
+
+ Args:
+ n_classes (int, optional): number of output classes. Defaults to 256.
+ """
+ super().__init__()
+ self.K = n_classes
+ self.act = act
+ self.register_buffer('k_idx', torch.arange(
+ 0, n_classes).view(1, -1, 1, 1))
+ self.register_buffer('K_minus_1', torch.Tensor(
+ [self.K-1]).view(1, -1, 1, 1))
+
+ def forward(self, x, t=1., eps=1e-4):
+ """Compute log binomial distribution for x
+
+ Args:
+ x (torch.Tensor - NCHW): probabilities
+ t (float, torch.Tensor - NCHW, optional): Temperature of distribution. Defaults to 1..
+ eps (float, optional): Small number for numerical stability. Defaults to 1e-4.
+
+ Returns:
+ torch.Tensor -NCHW: log binomial distribution logbinomial(p;t)
+ """
+ if x.ndim == 3:
+ x = x.unsqueeze(1) # make it nchw
+
+ one_minus_x = torch.clamp(1 - x, eps, 1)
+ x = torch.clamp(x, eps, 1)
+ y = log_binom(self.K_minus_1, self.k_idx) + self.k_idx * \
+ torch.log(x) + (self.K - 1 - self.k_idx) * torch.log(one_minus_x)
+ return self.act(y/t, dim=1)
+
+
+class ConditionalLogBinomial(nn.Module):
+ def __init__(self, in_features, condition_dim, n_classes=256, bottleneck_factor=2, p_eps=1e-4, max_temp=50, min_temp=1e-7, act=torch.softmax):
+ """Conditional Log Binomial distribution
+
+ Args:
+ in_features (int): number of input channels in main feature
+ condition_dim (int): number of input channels in condition feature
+ n_classes (int, optional): Number of classes. Defaults to 256.
+ bottleneck_factor (int, optional): Hidden dim factor. Defaults to 2.
+ p_eps (float, optional): small eps value. Defaults to 1e-4.
+ max_temp (float, optional): Maximum temperature of output distribution. Defaults to 50.
+ min_temp (float, optional): Minimum temperature of output distribution. Defaults to 1e-7.
+ """
+ super().__init__()
+ self.p_eps = p_eps
+ self.max_temp = max_temp
+ self.min_temp = min_temp
+ self.log_binomial_transform = LogBinomial(n_classes, act=act)
+ bottleneck = (in_features + condition_dim) // bottleneck_factor
+ self.mlp = nn.Sequential(
+ nn.Conv2d(in_features + condition_dim, bottleneck,
+ kernel_size=1, stride=1, padding=0),
+ nn.GELU(),
+ # 2 for p linear norm, 2 for t linear norm
+ nn.Conv2d(bottleneck, 2+2, kernel_size=1, stride=1, padding=0),
+ nn.Softplus()
+ )
+
+ def forward(self, x, cond):
+ """Forward pass
+
+ Args:
+ x (torch.Tensor - NCHW): Main feature
+ cond (torch.Tensor - NCHW): condition feature
+
+ Returns:
+ torch.Tensor: Output log binomial distribution
+ """
+ pt = self.mlp(torch.concat((x, cond), dim=1))
+ p, t = pt[:, :2, ...], pt[:, 2:, ...]
+
+ p = p + self.p_eps
+ p = p[:, 0, ...] / (p[:, 0, ...] + p[:, 1, ...])
+
+ t = t + self.p_eps
+ t = t[:, 0, ...] / (t[:, 0, ...] + t[:, 1, ...])
+ t = t.unsqueeze(1)
+ t = (self.max_temp - self.min_temp) * t + self.min_temp
+
+ return self.log_binomial_transform(p, t)
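+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative, not part of the upstream ZoeDepth code):
+    # the conditional head maps a feature map plus a condition map to a per-pixel
+    # probability distribution over n_classes depth bins (softmax along dim 1).
+    head = ConditionalLogBinomial(in_features=32, condition_dim=8, n_classes=64)
+    x, cond = torch.rand(1, 32, 12, 16), torch.rand(1, 8, 12, 16)
+    probs = head(x, cond)
+    print(probs.shape, float(probs.sum(dim=1).mean()))  # torch.Size([1, 64, 12, 16]) ~1.0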
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/localbins_layers.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/localbins_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..cda7b1f9c74ac0d54733ef6d1dcadc1c62ab4647
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/localbins_layers.py
@@ -0,0 +1,169 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import torch
+import torch.nn as nn
+
+
+class SeedBinRegressor(nn.Module):
+ def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
+ """Bin center regressor network. Bin centers are bounded on (min_depth, max_depth) interval.
+
+ Args:
+ in_features (int): input channels
+ n_bins (int, optional): Number of bin centers. Defaults to 16.
+ mlp_dim (int, optional): Hidden dimension. Defaults to 256.
+ min_depth (float, optional): Min depth value. Defaults to 1e-3.
+ max_depth (float, optional): Max depth value. Defaults to 10.
+ """
+ super().__init__()
+ self.version = "1_1"
+ self.min_depth = min_depth
+ self.max_depth = max_depth
+
+ self._net = nn.Sequential(
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(mlp_dim, n_bins, 1, 1, 0),
+ nn.ReLU(inplace=True)
+ )
+
+ def forward(self, x):
+ """
+ Returns tensor of bin_width vectors (centers). One vector b for every pixel
+ """
+ B = self._net(x)
+ eps = 1e-3
+ B = B + eps
+ B_widths_normed = B / B.sum(dim=1, keepdim=True)
+ B_widths = (self.max_depth - self.min_depth) * \
+ B_widths_normed # .shape NCHW
+ # pad has the form (left, right, top, bottom, front, back)
+ B_widths = nn.functional.pad(
+ B_widths, (0, 0, 0, 0, 1, 0), mode='constant', value=self.min_depth)
+ B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW
+
+ B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:, 1:, ...])
+ return B_widths_normed, B_centers
+
+
+class SeedBinRegressorUnnormed(nn.Module):
+ def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
+ """Bin center regressor network. Bin centers are unbounded
+
+ Args:
+ in_features (int): input channels
+ n_bins (int, optional): Number of bin centers. Defaults to 16.
+ mlp_dim (int, optional): Hidden dimension. Defaults to 256.
+ min_depth (float, optional): Not used. (for compatibility with SeedBinRegressor)
+ max_depth (float, optional): Not used. (for compatibility with SeedBinRegressor)
+ """
+ super().__init__()
+ self.version = "1_1"
+ self._net = nn.Sequential(
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(mlp_dim, n_bins, 1, 1, 0),
+ nn.Softplus()
+ )
+
+ def forward(self, x):
+ """
+ Returns tensor of bin_width vectors (centers). One vector b for every pixel
+ """
+ B_centers = self._net(x)
+ return B_centers, B_centers
+
+
+class Projector(nn.Module):
+ def __init__(self, in_features, out_features, mlp_dim=128):
+ """Projector MLP
+
+ Args:
+ in_features (int): input channels
+ out_features (int): output channels
+ mlp_dim (int, optional): hidden dimension. Defaults to 128.
+ """
+ super().__init__()
+
+ self._net = nn.Sequential(
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(mlp_dim, out_features, 1, 1, 0),
+ )
+
+ def forward(self, x):
+ return self._net(x)
+
+
+
+class LinearSplitter(nn.Module):
+ def __init__(self, in_features, prev_nbins, split_factor=2, mlp_dim=128, min_depth=1e-3, max_depth=10):
+ super().__init__()
+
+ self.prev_nbins = prev_nbins
+ self.split_factor = split_factor
+ self.min_depth = min_depth
+ self.max_depth = max_depth
+
+ self._net = nn.Sequential(
+ nn.Conv2d(in_features, mlp_dim, 1, 1, 0),
+ nn.GELU(),
+ nn.Conv2d(mlp_dim, prev_nbins * split_factor, 1, 1, 0),
+ nn.ReLU()
+ )
+
+ def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False):
+ """
+ x : feature block; shape - n, c, h, w
+ b_prev : previous bin widths normed; shape - n, prev_nbins, h, w
+ """
+ if prev_b_embedding is not None:
+ if interpolate:
+ prev_b_embedding = nn.functional.interpolate(prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True)
+ x = x + prev_b_embedding
+ S = self._net(x)
+ eps = 1e-3
+ S = S + eps
+ n, c, h, w = S.shape
+ S = S.view(n, self.prev_nbins, self.split_factor, h, w)
+ S_normed = S / S.sum(dim=2, keepdim=True) # fractional splits
+
+ b_prev = nn.functional.interpolate(b_prev, (h,w), mode='bilinear', align_corners=True)
+
+
+        b_prev = b_prev / b_prev.sum(dim=1, keepdim=True) # renormalize for guarantees
+ # print(b_prev.shape, S_normed.shape)
+ # if is_for_query:(1).expand(-1, b_prev.size(0)//n, -1, -1, -1, -1).flatten(0,1) # TODO ? can replace all this with a single torch.repeat?
+ b = b_prev.unsqueeze(2) * S_normed
+ b = b.flatten(1,2) # .shape n, prev_nbins * split_factor, h, w
+
+ # calculate bin centers for loss calculation
+ B_widths = (self.max_depth - self.min_depth) * b # .shape N, nprev * splitfactor, H, W
+ # pad has the form (left, right, top, bottom, front, back)
+ B_widths = nn.functional.pad(B_widths, (0,0,0,0,1,0), mode='constant', value=self.min_depth)
+ B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW
+
+ B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:,1:,...])
+ return b, B_centers
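+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative, not part of the upstream ZoeDepth code):
+    # the seed regressor predicts, per pixel, 16 normalized bin widths plus the
+    # matching bin centers inside the (min_depth, max_depth) range.
+    reg = SeedBinRegressor(in_features=32, n_bins=16, min_depth=1e-3, max_depth=10)
+    widths_normed, centers = reg(torch.rand(1, 32, 24, 32))
+    print(widths_normed.shape, centers.shape)  # both torch.Size([1, 16, 24, 32])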
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/patch_transformer.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/patch_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..eacac4d38809db2b3c669d0134f91645225af531
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/layers/patch_transformer.py
@@ -0,0 +1,91 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import torch
+import torch.nn as nn
+
+
+class PatchTransformerEncoder(nn.Module):
+ def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4, use_class_token=False):
+ """ViT-like transformer block
+
+ Args:
+ in_channels (int): Input channels
+ patch_size (int, optional): patch size. Defaults to 10.
+ embedding_dim (int, optional): Embedding dimension in transformer model. Defaults to 128.
+ num_heads (int, optional): number of attention heads. Defaults to 4.
+            use_class_token (bool, optional): Whether to use an extra token at the start for global accumulation (the "class token"). Defaults to False.
+ """
+ super(PatchTransformerEncoder, self).__init__()
+ self.use_class_token = use_class_token
+ encoder_layers = nn.TransformerEncoderLayer(
+ embedding_dim, num_heads, dim_feedforward=1024)
+ self.transformer_encoder = nn.TransformerEncoder(
+ encoder_layers, num_layers=4) # takes shape S,N,E
+
+ self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim,
+ kernel_size=patch_size, stride=patch_size, padding=0)
+
+ def positional_encoding_1d(self, sequence_length, batch_size, embedding_dim, device='cpu'):
+ """Generate positional encodings
+
+ Args:
+ sequence_length (int): Sequence length
+ embedding_dim (int): Embedding dimension
+
+ Returns:
+ torch.Tensor SBE: Positional encodings
+ """
+ position = torch.arange(
+ 0, sequence_length, dtype=torch.float32, device=device).unsqueeze(1)
+ index = torch.arange(
+ 0, embedding_dim, 2, dtype=torch.float32, device=device).unsqueeze(0)
+ div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim))
+ pos_encoding = position * div_term
+ pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1)
+ pos_encoding = pos_encoding.unsqueeze(1).repeat(1, batch_size, 1)
+ return pos_encoding
+
+
+ def forward(self, x):
+ """Forward pass
+
+ Args:
+ x (torch.Tensor - NCHW): Input feature tensor
+
+ Returns:
+ torch.Tensor - SNE: Transformer output embeddings. S - sequence length (=HW/patch_size^2), N - batch size, E - embedding dim
+ """
+ embeddings = self.embedding_convPxP(x).flatten(
+ 2) # .shape = n,c,s = n, embedding_dim, s
+ if self.use_class_token:
+ # extra special token at start ?
+ embeddings = nn.functional.pad(embeddings, (1, 0))
+
+ # change to S,N,E format required by transformer
+ embeddings = embeddings.permute(2, 0, 1)
+ S, N, E = embeddings.shape
+ embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device)
+ x = self.transformer_encoder(embeddings) # .shape = S, N, E
+ return x
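A quick shape sanity check for PatchTransformerEncoder: with patch_size=10, a 30x40 feature map yields 3x4 = 12 patch tokens in S,N,E layout. The import path below assumes the vendored package layout of this patch; the channel count and feature-map size are arbitrary.

    # Shape sanity check (import path assumes the vendored layout; sizes are illustrative).
    import torch
    from custom_controlnet_aux.zoe.zoedepth.models.layers.patch_transformer import PatchTransformerEncoder

    enc = PatchTransformerEncoder(in_channels=256, patch_size=10, embedding_dim=128, num_heads=4)
    feat = torch.rand(2, 256, 30, 40)   # N, C, H, W bottleneck-like feature map
    out = enc(feat)                     # S, N, E with S = (30 // 10) * (40 // 10) = 12
    print(out.shape)                    # torch.Size([12, 2, 128])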
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/model_io.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/model_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..f63a9a1ff09e98da78a9a3da63e58509471f728d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/model_io.py
@@ -0,0 +1,92 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import torch
+
+def load_state_dict(model, state_dict):
+ """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict.
+
+ DataParallel prefixes state_dict keys with 'module.' when saving.
+ If the model is not a DataParallel model but the state_dict is, then prefixes are removed.
+ If the model is a DataParallel model but the state_dict is not, then prefixes are added.
+ """
+ state_dict = state_dict.get('model', state_dict)
+ # if model is a DataParallel model, then state_dict keys are prefixed with 'module.'
+
+ do_prefix = isinstance(
+ model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel))
+ state = {}
+ for k, v in state_dict.items():
+ if k.startswith('module.') and not do_prefix:
+ k = k[7:]
+
+ if not k.startswith('module.') and do_prefix:
+ k = 'module.' + k
+
+ state[k] = v
+
+ model.load_state_dict(state)
+ print("Loaded successfully")
+ return model
+
+
+def load_wts(model, checkpoint_path):
+ ckpt = torch.load(checkpoint_path, map_location='cpu')
+ return load_state_dict(model, ckpt)
+
+
+def load_state_dict_from_url(model, url, **kwargs):
+ state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu', **kwargs)
+ return load_state_dict(model, state_dict)
+
+
+def load_state_from_resource(model, resource: str):
+ """Loads weights to the model from a given resource. A resource can be of following types:
+ 1. URL. Prefixed with "url::"
+ e.g. url::http(s)://url.resource.com/ckpt.pt
+
+ 2. Local path. Prefixed with "local::"
+ e.g. local::/path/to/ckpt.pt
+
+
+ Args:
+ model (torch.nn.Module): Model
+ resource (str): resource string
+
+ Returns:
+ torch.nn.Module: Model with loaded weights
+ """
+ print(f"Using pretrained resource {resource}")
+
+ if resource.startswith('url::'):
+ url = resource.split('url::')[1]
+ return load_state_dict_from_url(model, url, progress=True)
+
+ elif resource.startswith('local::'):
+ path = resource.split('local::')[1]
+ return load_wts(model, path)
+
+ else:
+ raise ValueError("Invalid resource type, only url:: and local:: are supported")
+
\ No newline at end of file
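The prefix handling in load_state_dict goes both ways: 'module.'-prefixed keys from a DataParallel checkpoint are stripped for a plain model, and added when loading a plain checkpoint into a wrapped model. A minimal sketch with a toy nn.Linear standing in for the real depth model (import path assumed from the vendored layout):

    # Minimal sketch of the 'module.' prefix handling (toy module, not the real ZoeDepth model).
    import torch.nn as nn
    from custom_controlnet_aux.zoe.zoedepth.models.model_io import load_state_dict

    plain = nn.Linear(4, 2)
    wrapped = nn.DataParallel(nn.Linear(4, 2))

    # A checkpoint saved from a DataParallel model carries 'module.'-prefixed keys;
    # they are stripped before loading into the plain model.
    dp_ckpt = {"model": wrapped.state_dict()}
    load_state_dict(plain, dp_ckpt)

    # A plain checkpoint gets the prefix added when loading into a DataParallel model.
    plain_ckpt = {"model": plain.state_dict()}
    load_state_dict(wrapped, plain_ckpt)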
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e9a694852aaa28c500419d413ea8a572338e18
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/__init__.py
@@ -0,0 +1,31 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+from .zoedepth_v1 import ZoeDepth
+
+all_versions = {
+ "v1": ZoeDepth,
+}
+
+get_version = lambda v : all_versions[v]
\ No newline at end of file
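The tiny registry above maps a version string to the model class, which is how a builder can stay version-agnostic. A hypothetical lookup (import path assumes the vendored layout):

    from custom_controlnet_aux.zoe.zoedepth.models.zoedepth import get_version

    ZoeDepthCls = get_version("v1")   # -> the ZoeDepth class
    print(ZoeDepthCls.__name__)       # "ZoeDepth"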
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/config_zoedepth.json b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/config_zoedepth.json
new file mode 100644
index 0000000000000000000000000000000000000000..dfc9fa7b17615cf557b6ad01c8fedd6c0c32e88f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/config_zoedepth.json
@@ -0,0 +1,58 @@
+{
+ "model": {
+ "name": "ZoeDepth",
+ "version_name": "v1",
+ "n_bins": 64,
+ "bin_embedding_dim": 128,
+ "bin_centers_type": "softplus",
+ "n_attractors":[16, 8, 4, 1],
+ "attractor_alpha": 1000,
+ "attractor_gamma": 2,
+ "attractor_kind" : "mean",
+ "attractor_type" : "inv",
+ "midas_model_type" : "DPT_BEiT_L_384",
+ "min_temp": 0.0212,
+ "max_temp": 50.0,
+ "output_distribution": "logbinomial",
+ "memory_efficient": true,
+ "inverse_midas": false,
+ "img_size": [384, 512]
+ },
+
+ "train": {
+ "train_midas": true,
+ "use_pretrained_midas": true,
+ "trainer": "zoedepth",
+ "epochs": 5,
+ "bs": 16,
+ "optim_kwargs": {"lr": 0.000161, "wd": 0.01},
+ "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true},
+ "same_lr": false,
+ "w_si": 1,
+ "w_domain": 0.2,
+ "w_reg": 0,
+ "w_grad": 0,
+ "avoid_boundary": false,
+ "random_crop": false,
+ "input_width": 640,
+ "input_height": 480,
+ "midas_lr_factor": 1,
+ "encoder_lr_factor":10,
+ "pos_enc_lr_factor":10,
+ "freeze_midas_bn": true
+
+ },
+
+ "infer":{
+ "train_midas": false,
+ "use_pretrained_midas": false,
+ "pretrained_resource" : null,
+ "force_keep_ar": true
+ },
+
+ "eval":{
+ "train_midas": false,
+ "use_pretrained_midas": false,
+ "pretrained_resource" : null
+ }
+}
\ No newline at end of file
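The "model" block of this JSON maps almost one-to-one onto the ZoeDepth constructor/build kwargs. The sketch below only illustrates that mapping under the assumption that the JSON file is readable from the current directory; the real code path resolves configs through zoedepth/utils/config.py, and the actual build call would also fetch the MiDaS backbone via torch.hub, so it is left commented out.

    # Hedged sketch: turn the "model" block into build kwargs (illustrative shortcut, not the real pipeline).
    import json
    from custom_controlnet_aux.zoe.zoedepth.models.zoedepth import get_version

    with open("config_zoedepth.json") as f:           # path assumed to be this file
        model_conf = json.load(f)["model"]

    ZoeDepthCls = get_version(model_conf["version_name"])   # -> ZoeDepth
    build_kwargs = {k: v for k, v in model_conf.items() if k not in ("name", "version_name")}
    # model = ZoeDepthCls.build(**build_kwargs)   # would download the MiDaS backbone, so not run here
    print(sorted(build_kwargs))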
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e7266ec2d7e918143f54ee728ea4d8d4e9adb11
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json
@@ -0,0 +1,22 @@
+{
+ "model": {
+ "bin_centers_type": "normed",
+ "img_size": [384, 768]
+ },
+
+ "train": {
+ },
+
+ "infer":{
+ "train_midas": false,
+ "use_pretrained_midas": false,
+ "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt",
+ "force_keep_ar": true
+ },
+
+ "eval":{
+ "train_midas": false,
+ "use_pretrained_midas": false,
+ "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt"
+ }
+}
\ No newline at end of file
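This per-dataset file only overrides a handful of keys on top of config_zoedepth.json. The dict merge below is purely illustrative (the real merging lives in zoedepth/utils/config.py) and assumes both JSON files sit in the current directory; it shows the effective KITTI model settings.

    # Illustration only: effect of the KITTI overrides on the base "model" block.
    import json

    base = json.load(open("config_zoedepth.json"))["model"]
    kitti = json.load(open("config_zoedepth_kitti.json"))["model"]
    effective = {**base, **kitti}

    print(effective["bin_centers_type"])   # "normed"   (overridden)
    print(effective["img_size"])           # [384, 768] (overridden)
    print(effective["n_bins"])             # 64         (inherited from the base config)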
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/zoedepth_v1.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/zoedepth_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d5e8c2d272c3ae5f5ff1025aa96e6653b23d82f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth/zoedepth_v1.py
@@ -0,0 +1,250 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import itertools
+
+import torch
+import torch.nn as nn
+from ..depth_model import DepthModel
+from ..base_models.midas import MidasCore
+from ..layers.attractor import AttractorLayer, AttractorLayerUnnormed
+from ..layers.dist_layers import ConditionalLogBinomial
+from ..layers.localbins_layers import (Projector, SeedBinRegressor,
+ SeedBinRegressorUnnormed)
+from ..model_io import load_state_from_resource
+
+
+class ZoeDepth(DepthModel):
+ def __init__(self, core, n_bins=64, bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10,
+ n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True,
+ midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs):
+ """ZoeDepth model. This is the version of ZoeDepth that has a single metric head
+
+ Args:
+ core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features
+ n_bins (int, optional): Number of bin centers. Defaults to 64.
+ bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers.
+ For "softplus", softplus activation is used and thus are unbounded. Defaults to "softplus".
+ bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128.
+ min_depth (float, optional): Lower bound for normed bin centers. Defaults to 1e-3.
+ max_depth (float, optional): Upper bound for normed bin centers. Defaults to 10.
+ n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1].
+ attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300.
+ attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2.
+ attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'.
+ attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'.
+ min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5.
+ max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50.
+ train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True.
+ midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10.
+ encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10.
+ pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10.
+ """
+ super().__init__()
+
+ self.core = core
+ self.max_depth = max_depth
+ self.min_depth = min_depth
+ self.min_temp = min_temp
+ self.bin_centers_type = bin_centers_type
+
+ self.midas_lr_factor = midas_lr_factor
+ self.encoder_lr_factor = encoder_lr_factor
+ self.pos_enc_lr_factor = pos_enc_lr_factor
+ self.train_midas = train_midas
+ self.inverse_midas = inverse_midas
+
+ if self.encoder_lr_factor <= 0:
+ self.core.freeze_encoder(
+ freeze_rel_pos=self.pos_enc_lr_factor <= 0)
+
+ N_MIDAS_OUT = 32
+ btlnck_features = self.core.output_channels[0]
+ num_out_features = self.core.output_channels[1:]
+
+ self.conv2 = nn.Conv2d(btlnck_features, btlnck_features,
+ kernel_size=1, stride=1, padding=0) # btlnck conv
+
+ if bin_centers_type == "normed":
+ SeedBinRegressorLayer = SeedBinRegressor
+ Attractor = AttractorLayer
+ elif bin_centers_type == "softplus":
+ SeedBinRegressorLayer = SeedBinRegressorUnnormed
+ Attractor = AttractorLayerUnnormed
+ elif bin_centers_type == "hybrid1":
+ SeedBinRegressorLayer = SeedBinRegressor
+ Attractor = AttractorLayerUnnormed
+ elif bin_centers_type == "hybrid2":
+ SeedBinRegressorLayer = SeedBinRegressorUnnormed
+ Attractor = AttractorLayer
+ else:
+ raise ValueError(
+ "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'")
+
+ self.seed_bin_regressor = SeedBinRegressorLayer(
+ btlnck_features, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth)
+ self.seed_projector = Projector(btlnck_features, bin_embedding_dim)
+ self.projectors = nn.ModuleList([
+ Projector(num_out, bin_embedding_dim)
+ for num_out in num_out_features
+ ])
+ self.attractors = nn.ModuleList([
+ Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=min_depth, max_depth=max_depth,
+ alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type)
+ for i in range(len(num_out_features))
+ ])
+
+ last_in = N_MIDAS_OUT + 1 # +1 for relative depth
+
+ # use log binomial instead of softmax
+ self.conditional_log_binomial = ConditionalLogBinomial(
+ last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp)
+
+ def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs):
+ """
+ Args:
+ x (torch.Tensor): Input image tensor of shape (B, C, H, W)
+ return_final_centers (bool, optional): Whether to return the final bin centers. Defaults to False.
+ denorm (bool, optional): Whether to denormalize the input image. This reverses ImageNet normalization as midas normalization is different. Defaults to False.
+ return_probs (bool, optional): Whether to return the output probability distribution. Defaults to False.
+
+ Returns:
+ dict: Dictionary containing the following keys:
+ - rel_depth (torch.Tensor): Relative depth map of shape (B, H, W)
+ - metric_depth (torch.Tensor): Metric depth map of shape (B, 1, H, W)
+ - bin_centers (torch.Tensor): Bin centers of shape (B, n_bins, H, W). Present only if return_final_centers is True
+ - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). Present only if return_probs is True
+
+ """
+ b, c, h, w = x.shape
+ # print("input shape ", x.shape)
+ self.orig_input_width = w
+ self.orig_input_height = h
+ rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True)
+ # print("output shapes", rel_depth.shape, out.shape)
+
+ outconv_activation = out[0]
+ btlnck = out[1]
+ x_blocks = out[2:]
+
+ x_d0 = self.conv2(btlnck)
+ x = x_d0
+ _, seed_b_centers = self.seed_bin_regressor(x)
+
+ if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2':
+ b_prev = (seed_b_centers - self.min_depth) / \
+ (self.max_depth - self.min_depth)
+ else:
+ b_prev = seed_b_centers
+
+ prev_b_embedding = self.seed_projector(x)
+
+ # unroll this loop for better performance
+ for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks):
+ b_embedding = projector(x)
+ b, b_centers = attractor(
+ b_embedding, b_prev, prev_b_embedding, interpolate=True)
+ b_prev = b.clone()
+ prev_b_embedding = b_embedding.clone()
+
+ last = outconv_activation
+
+ if self.inverse_midas:
+ # invert depth followed by normalization
+ rel_depth = 1.0 / (rel_depth + 1e-6)
+ rel_depth = (rel_depth - rel_depth.min()) / \
+ (rel_depth.max() - rel_depth.min())
+ # concat rel depth with last. First interpolate rel depth to last size
+ rel_cond = rel_depth.unsqueeze(1)
+ rel_cond = nn.functional.interpolate(
+ rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True)
+ last = torch.cat([last, rel_cond], dim=1)
+
+ b_embedding = nn.functional.interpolate(
+ b_embedding, last.shape[-2:], mode='bilinear', align_corners=True)
+ x = self.conditional_log_binomial(last, b_embedding)
+
+ # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
+ # print(x.shape, b_centers.shape)
+ b_centers = nn.functional.interpolate(
+ b_centers, x.shape[-2:], mode='bilinear', align_corners=True)
+ out = torch.sum(x * b_centers, dim=1, keepdim=True)
+
+ # Structure output dict
+ output = dict(metric_depth=out)
+ if return_final_centers or return_probs:
+ output['bin_centers'] = b_centers
+
+ if return_probs:
+ output['probs'] = x
+
+ return output
+
+ def get_lr_params(self, lr):
+ """
+ Learning rate configuration for different layers of the model
+ Args:
+ lr (float) : Base learning rate
+ Returns:
+ list : list of parameters to optimize and their learning rates, in the format required by torch optimizers.
+ """
+ param_conf = []
+ if self.train_midas:
+ if self.encoder_lr_factor > 0:
+ param_conf.append({'params': self.core.get_enc_params_except_rel_pos(
+ ), 'lr': lr / self.encoder_lr_factor})
+
+ if self.pos_enc_lr_factor > 0:
+ param_conf.append(
+ {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor})
+
+ midas_params = self.core.core.scratch.parameters()
+ midas_lr_factor = self.midas_lr_factor
+ param_conf.append(
+ {'params': midas_params, 'lr': lr / midas_lr_factor})
+
+ remaining_modules = []
+ for name, child in self.named_children():
+ if name != 'core':
+ remaining_modules.append(child)
+ remaining_params = itertools.chain(
+ *[child.parameters() for child in remaining_modules])
+
+ param_conf.append({'params': remaining_params, 'lr': lr})
+
+ return param_conf
+
+ @staticmethod
+ def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs):
+ core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas,
+ train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs)
+ model = ZoeDepth(core, **kwargs)
+ if pretrained_resource:
+ assert isinstance(pretrained_resource, str), "pretrained_resource must be a string"
+ model = load_state_from_resource(model, pretrained_resource)
+ return model
+
+ @staticmethod
+ def build_from_config(config):
+ return ZoeDepth.build(**config)
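The final metric depth in forward() is the expectation of the bin centers under the per-pixel log-binomial probabilities, i.e. torch.sum(x * b_centers, dim=1, keepdim=True). A standalone sketch with dummy tensors (shapes and depth range are illustrative only) reproduces that last step:

    # Standalone sketch of the expectation step: metric depth = sum_k p_k * c_k over the bin dimension.
    import torch

    B, n_bins, H, W = 2, 64, 48, 64
    probs = torch.softmax(torch.randn(B, n_bins, H, W), dim=1)   # stand-in for the log-binomial output x
    bin_centers = torch.linspace(1e-3, 10.0, n_bins).view(1, n_bins, 1, 1).expand(B, -1, H, W)

    metric_depth = torch.sum(probs * bin_centers, dim=1, keepdim=True)
    print(metric_depth.shape)   # torch.Size([2, 1, 48, 64])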
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_anything/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_anything/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e9a694852aaa28c500419d413ea8a572338e18
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_anything/__init__.py
@@ -0,0 +1,31 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+from .zoedepth_v1 import ZoeDepth
+
+all_versions = {
+ "v1": ZoeDepth,
+}
+
+get_version = lambda v : all_versions[v]
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_anything/zoedepth_v1.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_anything/zoedepth_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e9b351f7fe1ec653dd4aee4ba517c44eb0ba60
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_anything/zoedepth_v1.py
@@ -0,0 +1,264 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import itertools
+
+import torch
+import torch.nn as nn
+from ..depth_model import DepthModel
+from ..base_models.midas import MidasCore
+from ..base_models.depth_anything import DepthAnythingCore
+from ..layers.attractor import AttractorLayer, AttractorLayerUnnormed
+from ..layers.dist_layers import ConditionalLogBinomial
+from ..layers.localbins_layers import (Projector, SeedBinRegressor,
+ SeedBinRegressorUnnormed)
+from ..model_io import load_state_from_resource
+
+
+class ZoeDepth(DepthModel):
+ def __init__(self, core, n_bins=64, bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10,
+ n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True,
+ midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs):
+ """ZoeDepth model. This is the version of ZoeDepth that has a single metric head
+
+ Args:
+ core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features
+ n_bins (int, optional): Number of bin centers. Defaults to 64.
+ bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers.
+ For "softplus", softplus activation is used and thus are unbounded. Defaults to "softplus".
+ bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128.
+ min_depth (float, optional): Lower bound for normed bin centers. Defaults to 1e-3.
+ max_depth (float, optional): Upper bound for normed bin centers. Defaults to 10.
+ n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1].
+ attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300.
+ attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2.
+ attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'.
+ attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'.
+ min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5.
+ max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50.
+ train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True.
+ midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10.
+ encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10.
+ pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10.
+ """
+ super().__init__()
+
+ self.core = core
+ self.max_depth = max_depth
+ self.min_depth = min_depth
+ self.min_temp = min_temp
+ self.bin_centers_type = bin_centers_type
+
+ self.midas_lr_factor = midas_lr_factor
+ self.encoder_lr_factor = encoder_lr_factor
+ self.pos_enc_lr_factor = pos_enc_lr_factor
+ self.train_midas = train_midas
+ self.inverse_midas = inverse_midas
+
+ if self.encoder_lr_factor <= 0:
+ self.core.freeze_encoder(
+ freeze_rel_pos=self.pos_enc_lr_factor <= 0)
+
+ N_MIDAS_OUT = 32
+ btlnck_features = self.core.output_channels[0]
+ num_out_features = self.core.output_channels[1:]
+
+ # print('core output channels:', self.core.output_channels)
+
+ self.conv2 = nn.Conv2d(btlnck_features, btlnck_features,
+ kernel_size=1, stride=1, padding=0) # btlnck conv
+
+ if bin_centers_type == "normed":
+ SeedBinRegressorLayer = SeedBinRegressor
+ Attractor = AttractorLayer
+ elif bin_centers_type == "softplus":
+ SeedBinRegressorLayer = SeedBinRegressorUnnormed
+ Attractor = AttractorLayerUnnormed
+ elif bin_centers_type == "hybrid1":
+ SeedBinRegressorLayer = SeedBinRegressor
+ Attractor = AttractorLayerUnnormed
+ elif bin_centers_type == "hybrid2":
+ SeedBinRegressorLayer = SeedBinRegressorUnnormed
+ Attractor = AttractorLayer
+ else:
+ raise ValueError(
+ "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'")
+
+ self.seed_bin_regressor = SeedBinRegressorLayer(
+ btlnck_features, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth)
+ self.seed_projector = Projector(btlnck_features, bin_embedding_dim)
+ self.projectors = nn.ModuleList([
+ Projector(num_out, bin_embedding_dim)
+ for num_out in num_out_features
+ ])
+ self.attractors = nn.ModuleList([
+ Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=min_depth, max_depth=max_depth,
+ alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type)
+ for i in range(len(num_out_features))
+ ])
+
+ last_in = N_MIDAS_OUT + 1 # +1 for relative depth
+
+ # use log binomial instead of softmax
+ self.conditional_log_binomial = ConditionalLogBinomial(
+ last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp)
+
+ def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs):
+ """
+ Args:
+ x (torch.Tensor): Input image tensor of shape (B, C, H, W)
+ return_final_centers (bool, optional): Whether to return the final bin centers. Defaults to False.
+ denorm (bool, optional): Whether to denormalize the input image. This reverses ImageNet normalization as midas normalization is different. Defaults to False.
+ return_probs (bool, optional): Whether to return the output probability distribution. Defaults to False.
+
+ Returns:
+ dict: Dictionary containing the following keys:
+ - rel_depth (torch.Tensor): Relative depth map of shape (B, H, W)
+ - metric_depth (torch.Tensor): Metric depth map of shape (B, 1, H, W)
+ - bin_centers (torch.Tensor): Bin centers of shape (B, n_bins, H, W). Present only if return_final_centers is True
+ - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). Present only if return_probs is True
+
+ """
+ # print('input shape', x.shape)
+
+ b, c, h, w = x.shape
+ # print("input shape:", x.shape)
+ self.orig_input_width = w
+ self.orig_input_height = h
+ rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True)
+ # print("output shapes", rel_depth.shape, out.shape)
+ # print('rel_depth shape:', rel_depth.shape)
+ # print('out type:', type(out))
+ # for k in range(len(out)):
+ # print(k, out[k].shape)
+
+ outconv_activation = out[0]
+ btlnck = out[1]
+ x_blocks = out[2:]
+
+ x_d0 = self.conv2(btlnck)
+ x = x_d0
+ _, seed_b_centers = self.seed_bin_regressor(x)
+
+ if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2':
+ b_prev = (seed_b_centers - self.min_depth) / \
+ (self.max_depth - self.min_depth)
+ else:
+ b_prev = seed_b_centers
+
+ prev_b_embedding = self.seed_projector(x)
+
+ # unroll this loop for better performance
+ for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks):
+ b_embedding = projector(x)
+ b, b_centers = attractor(
+ b_embedding, b_prev, prev_b_embedding, interpolate=True)
+ b_prev = b.clone()
+ prev_b_embedding = b_embedding.clone()
+
+ last = outconv_activation
+
+ if self.inverse_midas:
+ # invert depth followed by normalization
+ rel_depth = 1.0 / (rel_depth + 1e-6)
+ rel_depth = (rel_depth - rel_depth.min()) / \
+ (rel_depth.max() - rel_depth.min())
+ # concat rel depth with last. First interpolate rel depth to last size
+ rel_cond = rel_depth.unsqueeze(1)
+ rel_cond = nn.functional.interpolate(
+ rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True)
+ last = torch.cat([last, rel_cond], dim=1)
+
+ b_embedding = nn.functional.interpolate(
+ b_embedding, last.shape[-2:], mode='bilinear', align_corners=True)
+ x = self.conditional_log_binomial(last, b_embedding)
+
+ # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
+ # print(x.shape, b_centers.shape)
+ b_centers = nn.functional.interpolate(
+ b_centers, x.shape[-2:], mode='bilinear', align_corners=True)
+ out = torch.sum(x * b_centers, dim=1, keepdim=True)
+
+ # Structure output dict
+ output = dict(metric_depth=out)
+ if return_final_centers or return_probs:
+ output['bin_centers'] = b_centers
+
+ if return_probs:
+ output['probs'] = x
+
+ return output
+
+ def get_lr_params(self, lr):
+ """
+ Learning rate configuration for different layers of the model
+ Args:
+ lr (float) : Base learning rate
+ Returns:
+ list : list of parameters to optimize and their learning rates, in the format required by torch optimizers.
+ """
+ param_conf = []
+ if self.train_midas:
+ if self.encoder_lr_factor > 0:
+ param_conf.append({'params': self.core.get_enc_params_except_rel_pos(
+ ), 'lr': lr / self.encoder_lr_factor})
+
+ if self.pos_enc_lr_factor > 0:
+ param_conf.append(
+ {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor})
+
+ # midas_params = self.core.core.scratch.parameters()
+ midas_params = self.core.core.depth_head.parameters()
+ midas_lr_factor = self.midas_lr_factor
+ param_conf.append(
+ {'params': midas_params, 'lr': lr / midas_lr_factor})
+
+ remaining_modules = []
+ for name, child in self.named_children():
+ if name != 'core':
+ remaining_modules.append(child)
+ remaining_params = itertools.chain(
+ *[child.parameters() for child in remaining_modules])
+
+ param_conf.append({'params': remaining_params, 'lr': lr})
+
+ return param_conf
+
+ @staticmethod
+ def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs):
+ # core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas,
+ # train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs)
+
+ core = DepthAnythingCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas,
+ train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs)
+
+ model = ZoeDepth(core, **kwargs)
+ if pretrained_resource:
+ assert isinstance(pretrained_resource, str), "pretrained_resource must be a string"
+ model = load_state_from_resource(model, pretrained_resource)
+ return model
+
+ @staticmethod
+ def build_from_config(config):
+ return ZoeDepth.build(**config)
\ No newline at end of file
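Both ZoeDepth variants expose get_lr_params(lr), which returns optimizer parameter groups with reduced learning rates for the backbone parts. The sketch below shows how such groups plug into a torch optimizer; a toy module stands in for the model, since building the real one requires the Depth Anything backbone and its weights.

    # Sketch of consuming get_lr_params-style parameter groups (toy module, illustrative lr factors).
    import torch
    import torch.nn as nn

    toy = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Conv2d(8, 1, 3))
    base_lr = 1.61e-4
    param_groups = [
        {"params": toy[0].parameters(), "lr": base_lr / 10},   # analogous to encoder params at lr / encoder_lr_factor
        {"params": toy[1].parameters(), "lr": base_lr},        # analogous to the metric head at the base lr
    ]
    optimizer = torch.optim.AdamW(param_groups, weight_decay=0.01)
    print([g["lr"] for g in optimizer.param_groups])   # [1.61e-05, 0.000161]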
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..56692da246c65d3c390236faa9ee1bf97040b824
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/__init__.py
@@ -0,0 +1,31 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+from .zoedepth_nk_v1 import ZoeDepthNK
+
+all_versions = {
+ "v1": ZoeDepthNK,
+}
+
+get_version = lambda v : all_versions[v]
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae036e38243566e0bb79a4821e4897d9bc4aaae1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json
@@ -0,0 +1,67 @@
+{
+ "model": {
+ "name": "ZoeDepthNK",
+ "version_name": "v1",
+ "bin_conf" : [
+ {
+ "name": "nyu",
+ "n_bins": 64,
+ "min_depth": 1e-3,
+ "max_depth": 10.0
+ },
+ {
+ "name": "kitti",
+ "n_bins": 64,
+ "min_depth": 1e-3,
+ "max_depth": 80.0
+ }
+ ],
+ "bin_embedding_dim": 128,
+ "bin_centers_type": "softplus",
+ "n_attractors":[16, 8, 4, 1],
+ "attractor_alpha": 1000,
+ "attractor_gamma": 2,
+ "attractor_kind" : "mean",
+ "attractor_type" : "inv",
+ "min_temp": 0.0212,
+ "max_temp": 50.0,
+ "memory_efficient": true,
+ "midas_model_type" : "DPT_BEiT_L_384",
+ "img_size": [384, 512]
+ },
+
+ "train": {
+ "train_midas": true,
+ "use_pretrained_midas": true,
+ "trainer": "zoedepth_nk",
+ "epochs": 5,
+ "bs": 16,
+ "optim_kwargs": {"lr": 0.0002512, "wd": 0.01},
+ "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true},
+ "same_lr": false,
+ "w_si": 1,
+ "w_domain": 100,
+ "avoid_boundary": false,
+ "random_crop": false,
+ "input_width": 640,
+ "input_height": 480,
+ "w_grad": 0,
+ "w_reg": 0,
+ "midas_lr_factor": 10,
+ "encoder_lr_factor":10,
+ "pos_enc_lr_factor":10
+ },
+
+ "infer": {
+ "train_midas": false,
+ "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
+ "use_pretrained_midas": false,
+ "force_keep_ar": true
+ },
+
+ "eval": {
+ "train_midas": false,
+ "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
+ "use_pretrained_midas": false
+ }
+}
\ No newline at end of file
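The bin_conf list above is what gives ZoeDepthNK its two metric heads: each entry becomes a key in the per-domain ModuleDicts, and its length determines the number of heads. The snippet below just iterates the same structure (values copied from the JSON) to make the mapping explicit.

    # bin_conf entries -> per-domain head names and depth ranges (values copied from the config above).
    bin_conf = [
        {"name": "nyu",   "n_bins": 64, "min_depth": 1e-3, "max_depth": 10.0},
        {"name": "kitti", "n_bins": 64, "min_depth": 1e-3, "max_depth": 80.0},
    ]
    for conf in bin_conf:
        print(f"head '{conf['name']}': {conf['n_bins']} bins in [{conf['min_depth']}, {conf['max_depth']}] m")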
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..af0bfd914c005e06ab67638941d423ef2058e242
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py
@@ -0,0 +1,333 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import itertools
+
+import torch
+import torch.nn as nn
+
+# Use package-relative imports (as in zoedepth_v1.py) so this vendored copy resolves
+# without a top-level "zoedepth" package on sys.path.
+from ..depth_model import DepthModel
+from ..base_models.midas import MidasCore
+from ..layers.attractor import AttractorLayer, AttractorLayerUnnormed
+from ..layers.dist_layers import ConditionalLogBinomial
+from ..layers.localbins_layers import (Projector, SeedBinRegressor,
+ SeedBinRegressorUnnormed)
+from ..layers.patch_transformer import PatchTransformerEncoder
+from ..model_io import load_state_from_resource
+
+
+class ZoeDepthNK(DepthModel):
+ def __init__(self, core, bin_conf, bin_centers_type="softplus", bin_embedding_dim=128,
+ n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp',
+ min_temp=5, max_temp=50,
+ memory_efficient=False, train_midas=True,
+ is_midas_pretrained=True, midas_lr_factor=1, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs):
+ """ZoeDepthNK model. This is the version of ZoeDepth that has two metric heads and uses a learned router to route to experts.
+
+ Args:
+ core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features
+
+ bin_conf (List[dict]): A list of dictionaries that contain the bin configuration for each metric head. Each dictionary should contain the following keys:
+ "name" (str, typically same as the dataset name), "n_bins" (int), "min_depth" (float), "max_depth" (float)
+
+ The length of this list determines the number of metric heads.
+ bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers.
+ For "softplus", softplus activation is used and thus are unbounded. Defaults to "normed".
+ bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128.
+
+ n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1].
+ attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300.
+ attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2.
+ attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'.
+ attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'.
+
+ min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5.
+ max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50.
+
+ memory_efficient (bool, optional): Whether to use the memory efficient version of the attractor layers. The memory efficient version is slower but is recommended in case of multiple metric heads in order to save GPU memory. Defaults to False.
+
+ train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True.
+ is_midas_pretrained (bool, optional): Is "core" pretrained? Defaults to True.
+ midas_lr_factor (int, optional): Learning rate reduction factor for the base midas model except its encoder and positional encodings. Defaults to 1.
+ encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10.
+ pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10.
+
+ """
+
+ super().__init__()
+
+ self.core = core
+ self.bin_conf = bin_conf
+ self.min_temp = min_temp
+ self.max_temp = max_temp
+ self.memory_efficient = memory_efficient
+ self.train_midas = train_midas
+ self.is_midas_pretrained = is_midas_pretrained
+ self.midas_lr_factor = midas_lr_factor
+ self.encoder_lr_factor = encoder_lr_factor
+ self.pos_enc_lr_factor = pos_enc_lr_factor
+ self.inverse_midas = inverse_midas
+
+ N_MIDAS_OUT = 32
+ btlnck_features = self.core.output_channels[0]
+ num_out_features = self.core.output_channels[1:]
+ # self.scales = [16, 8, 4, 2] # spatial scale factors
+
+ self.conv2 = nn.Conv2d(
+ btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0)
+
+ # Transformer classifier on the bottleneck
+ self.patch_transformer = PatchTransformerEncoder(
+ btlnck_features, 1, 128, use_class_token=True)
+ self.mlp_classifier = nn.Sequential(
+ nn.Linear(128, 128),
+ nn.ReLU(),
+ nn.Linear(128, 2)
+ )
+
+ if bin_centers_type == "normed":
+ SeedBinRegressorLayer = SeedBinRegressor
+ Attractor = AttractorLayer
+ elif bin_centers_type == "softplus":
+ SeedBinRegressorLayer = SeedBinRegressorUnnormed
+ Attractor = AttractorLayerUnnormed
+ elif bin_centers_type == "hybrid1":
+ SeedBinRegressorLayer = SeedBinRegressor
+ Attractor = AttractorLayerUnnormed
+ elif bin_centers_type == "hybrid2":
+ SeedBinRegressorLayer = SeedBinRegressorUnnormed
+ Attractor = AttractorLayer
+ else:
+ raise ValueError(
+ "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'")
+ self.bin_centers_type = bin_centers_type
+ # We have bins for each bin conf.
+ # Create a map (ModuleDict) of 'name' -> seed_bin_regressor
+ self.seed_bin_regressors = nn.ModuleDict(
+ {conf['name']: SeedBinRegressorLayer(btlnck_features, conf["n_bins"], mlp_dim=bin_embedding_dim//2, min_depth=conf["min_depth"], max_depth=conf["max_depth"])
+ for conf in bin_conf}
+ )
+
+ self.seed_projector = Projector(
+ btlnck_features, bin_embedding_dim, mlp_dim=bin_embedding_dim//2)
+ self.projectors = nn.ModuleList([
+ Projector(num_out, bin_embedding_dim, mlp_dim=bin_embedding_dim//2)
+ for num_out in num_out_features
+ ])
+
+ # Create a map (ModuleDict) of 'name' -> attractors (ModuleList)
+ self.attractors = nn.ModuleDict(
+ {conf['name']: nn.ModuleList([
+ Attractor(bin_embedding_dim, n_attractors[i],
+ mlp_dim=bin_embedding_dim, alpha=attractor_alpha,
+ gamma=attractor_gamma, kind=attractor_kind,
+ attractor_type=attractor_type, memory_efficient=memory_efficient,
+ min_depth=conf["min_depth"], max_depth=conf["max_depth"])
+ for i in range(len(n_attractors))
+ ])
+ for conf in bin_conf}
+ )
+
+ last_in = N_MIDAS_OUT
+ # conditional log binomial for each bin conf
+ self.conditional_log_binomial = nn.ModuleDict(
+ {conf['name']: ConditionalLogBinomial(last_in, bin_embedding_dim, conf['n_bins'], bottleneck_factor=4, min_temp=self.min_temp, max_temp=self.max_temp)
+ for conf in bin_conf}
+ )
+
+ def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs):
+ """
+ Args:
+ x (torch.Tensor): Input image tensor of shape (B, C, H, W). Assumes all images are from the same domain.
+ return_final_centers (bool, optional): Whether to return the final centers of the attractors. Defaults to False.
+ denorm (bool, optional): Whether to denormalize the input image. Defaults to False.
+ return_probs (bool, optional): Whether to return the probabilities of the bins. Defaults to False.
+
+ Returns:
+ dict: Dictionary of outputs with keys:
+ - "rel_depth": Relative depth map of shape (B, 1, H, W)
+ - "metric_depth": Metric depth map of shape (B, 1, H, W)
+ - "domain_logits": Domain logits of shape (B, 2)
+ - "bin_centers": Bin centers of shape (B, N, H, W). Present only if return_final_centers is True
+ - "probs": Bin probabilities of shape (B, N, H, W). Present only if return_probs is True
+ """
+ b, c, h, w = x.shape
+ self.orig_input_width = w
+ self.orig_input_height = h
+ rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True)
+
+ outconv_activation = out[0]
+ btlnck = out[1]
+ x_blocks = out[2:]
+
+ x_d0 = self.conv2(btlnck)
+ x = x_d0
+
+ # Predict which path to take
+ embedding = self.patch_transformer(x)[0] # N, E
+ domain_logits = self.mlp_classifier(embedding) # N, 2
+ domain_vote = torch.softmax(domain_logits.sum(
+ dim=0, keepdim=True), dim=-1) # 1, 2
+
+ # Get the path
+ bin_conf_name = ["nyu", "kitti"][torch.argmax(
+ domain_vote, dim=-1).squeeze().item()]
+
+ try:
+ conf = [c for c in self.bin_conf if c.name == bin_conf_name][0]
+ except IndexError:
+ raise ValueError(
+ f"bin_conf_name {bin_conf_name} not found in bin_confs")
+
+ min_depth = conf['min_depth']
+ max_depth = conf['max_depth']
+
+ seed_bin_regressor = self.seed_bin_regressors[bin_conf_name]
+ _, seed_b_centers = seed_bin_regressor(x)
+ if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2':
+ b_prev = (seed_b_centers - min_depth)/(max_depth - min_depth)
+ else:
+ b_prev = seed_b_centers
+ prev_b_embedding = self.seed_projector(x)
+
+ attractors = self.attractors[bin_conf_name]
+ for projector, attractor, x in zip(self.projectors, attractors, x_blocks):
+ b_embedding = projector(x)
+ b, b_centers = attractor(
+ b_embedding, b_prev, prev_b_embedding, interpolate=True)
+ b_prev = b
+ prev_b_embedding = b_embedding
+
+ last = outconv_activation
+
+ b_centers = nn.functional.interpolate(
+ b_centers, last.shape[-2:], mode='bilinear', align_corners=True)
+ b_embedding = nn.functional.interpolate(
+ b_embedding, last.shape[-2:], mode='bilinear', align_corners=True)
+
+ clb = self.conditional_log_binomial[bin_conf_name]
+ x = clb(last, b_embedding)
+
+ # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
+ # print(x.shape, b_centers.shape)
+ # b_centers = nn.functional.interpolate(b_centers, x.shape[-2:], mode='bilinear', align_corners=True)
+ out = torch.sum(x * b_centers, dim=1, keepdim=True)
+
+ output = dict(domain_logits=domain_logits, metric_depth=out)
+ if return_final_centers or return_probs:
+ output['bin_centers'] = b_centers
+
+ if return_probs:
+ output['probs'] = x
+ return output
+
+ def get_lr_params(self, lr):
+ """
+ Learning rate configuration for different layers of the model
+
+ Args:
+ lr (float) : Base learning rate
+ Returns:
+ list : list of parameters to optimize and their learning rates, in the format required by torch optimizers.
+ """
+ param_conf = []
+ if self.train_midas:
+ def get_rel_pos_params():
+ for name, p in self.core.core.pretrained.named_parameters():
+ if "relative_position" in name:
+ yield p
+
+ def get_enc_params_except_rel_pos():
+ for name, p in self.core.core.pretrained.named_parameters():
+ if "relative_position" not in name:
+ yield p
+
+ encoder_params = get_enc_params_except_rel_pos()
+ rel_pos_params = get_rel_pos_params()
+ midas_params = self.core.core.scratch.parameters()
+ midas_lr_factor = self.midas_lr_factor if self.is_midas_pretrained else 1.0
+ param_conf.extend([
+ {'params': encoder_params, 'lr': lr / self.encoder_lr_factor},
+ {'params': rel_pos_params, 'lr': lr / self.pos_enc_lr_factor},
+ {'params': midas_params, 'lr': lr / midas_lr_factor}
+ ])
+
+ remaining_modules = []
+ for name, child in self.named_children():
+ if name != 'core':
+ remaining_modules.append(child)
+ remaining_params = itertools.chain(
+ *[child.parameters() for child in remaining_modules])
+ param_conf.append({'params': remaining_params, 'lr': lr})
+ return param_conf
+
+ def get_conf_parameters(self, conf_name):
+ """
+ Returns parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+ """
+ params = []
+ for name, child in self.named_children():
+ if isinstance(child, nn.ModuleDict):
+ for bin_conf_name, module in child.items():
+ if bin_conf_name == conf_name:
+ params += list(module.parameters())
+ return params
+
+ def freeze_conf(self, conf_name):
+ """
+ Freezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+ """
+ for p in self.get_conf_parameters(conf_name):
+ p.requires_grad = False
+
+ def unfreeze_conf(self, conf_name):
+ """
+ Unfreezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration
+ """
+ for p in self.get_conf_parameters(conf_name):
+ p.requires_grad = True
+
+ def freeze_all_confs(self):
+ """
+ Freezes all the parameters of all the ModuleDicts children
+ """
+ for name, child in self.named_children():
+ if isinstance(child, nn.ModuleDict):
+ for bin_conf_name, module in child.items():
+ for p in module.parameters():
+ p.requires_grad = False
+
+ @staticmethod
+ def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs):
+ core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas,
+ train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs)
+ model = ZoeDepthNK(core, **kwargs)
+ if pretrained_resource:
+ assert isinstance(pretrained_resource, str), "pretrained_resource must be a string"
+ model = load_state_from_resource(model, pretrained_resource)
+ return model
+
+ @staticmethod
+ def build_from_config(config):
+ return ZoeDepthNK.build(**config)
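The routing step in ZoeDepthNK.forward sums the per-image domain logits over the batch (the batch is assumed to come from a single domain), softmaxes the sum into one vote, and uses argmax to pick the "nyu" or "kitti" head. A standalone sketch with dummy logits reproduces the vote without building the model:

    # Standalone sketch of the domain routing vote (dummy logits, no model needed).
    import torch

    N = 4                               # batch size
    domain_logits = torch.randn(N, 2)   # per-image logits for (nyu, kitti)
    domain_vote = torch.softmax(domain_logits.sum(dim=0, keepdim=True), dim=-1)   # 1 x 2, one vote per batch
    bin_conf_name = ["nyu", "kitti"][torch.argmax(domain_vote, dim=-1).squeeze().item()]
    print(domain_vote, "->", bin_conf_name)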
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae1a1e4e86d9a5b14586cd006ed43d2bbc9b4a6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/__init__.py
@@ -0,0 +1,24 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/arg_utils.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/arg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5124e8c617874b6457b7dbaebeec61c166577933
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/arg_utils.py
@@ -0,0 +1,33 @@
+
+
+def infer_type(x): # hacky way to infer type from string args
+ if not isinstance(x, str):
+ return x
+
+ try:
+ x = int(x)
+ return x
+ except ValueError:
+ pass
+
+ try:
+ x = float(x)
+ return x
+ except ValueError:
+ pass
+
+ return x
+
+
+def parse_unknown(unknown_args):
+ clean = []
+ for a in unknown_args:
+ if "=" in a:
+ # split only on the first '=' so values may themselves contain '='
+ k, v = a.split("=", 1)
+ clean.extend([k, v])
+ else:
+ clean.append(a)
+
+ keys = clean[::2]
+ values = clean[1::2]
+ return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)}
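parse_unknown turns leftover argparse tokens (both "--key value" and "--key=value" forms) into a dict with types inferred by infer_type. A quick check with made-up argument names (import path assumes the vendored layout):

    # Quick check of parse_unknown on made-up leftover argparse tokens.
    from custom_controlnet_aux.zoe.zoedepth.utils.arg_utils import parse_unknown

    unknown = ["--n_bins=64", "--lr", "0.000161", "--bin_centers_type", "softplus"]
    print(parse_unknown(unknown))
    # {'n_bins': 64, 'lr': 0.000161, 'bin_centers_type': 'softplus'}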
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/config.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff5cb7765994211de10946932a572e05901777d6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/config.py
@@ -0,0 +1,437 @@
+# MIT License
+
+# Copyright (c) 2022 Intelligent Systems Lab Org
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File author: Shariq Farooq Bhat
+
+import json
+import os
+
+from .easydict import EasyDict as edict
+from .arg_utils import infer_type
+
+import pathlib
+import platform
+
+ROOT = pathlib.Path(__file__).parent.parent.resolve()
+
+HOME_DIR = os.path.expanduser("~")
+
+COMMON_CONFIG = {
+ "save_dir": os.path.expanduser("~/shortcuts/monodepth3_checkpoints"),
+ "project": "ZoeDepth",
+ "tags": '',
+ "notes": "",
+ "gpu": None,
+ "root": ".",
+ "uid": None,
+ "print_losses": False
+}
+
+DATASETS_CONFIG = {
+ "kitti": {
+ "dataset": "kitti",
+ "min_depth": 0.001,
+ "max_depth": 80,
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
+ "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
+ "input_height": 352,
+ "input_width": 1216, # 704
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
+ "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",
+
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 80,
+
+ "do_random_rotate": True,
+ "degree": 1.0,
+ "do_kb_crop": True,
+ "garg_crop": True,
+ "eigen_crop": False,
+ "use_right": False
+ },
+ "kitti_test": {
+ "dataset": "kitti",
+ "min_depth": 0.001,
+ "max_depth": 80,
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
+ "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
+ "input_height": 352,
+ "input_width": 1216,
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
+ "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",
+
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 80,
+
+ "do_random_rotate": False,
+ "degree": 1.0,
+ "do_kb_crop": True,
+ "garg_crop": True,
+ "eigen_crop": False,
+ "use_right": False
+ },
+ "nyu": {
+ "dataset": "nyu",
+ "avoid_boundary": False,
+ "min_depth": 1e-3, # originally 0.1
+ "max_depth": 10,
+ "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"),
+ "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"),
+ "filenames_file": "./train_test_inputs/nyudepthv2_train_files_with_gt.txt",
+ "input_height": 480,
+ "input_width": 640,
+ "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"),
+ "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"),
+ "filenames_file_eval": "./train_test_inputs/nyudepthv2_test_files_with_gt.txt",
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 10,
+ "min_depth_diff": -10,
+ "max_depth_diff": 10,
+
+ "do_random_rotate": True,
+ "degree": 1.0,
+ "do_kb_crop": False,
+ "garg_crop": False,
+ "eigen_crop": True
+ },
+ "ibims": {
+ "dataset": "ibims",
+ "ibims_root": os.path.join(HOME_DIR, "shortcuts/datasets/ibims/ibims1_core_raw/"),
+ "eigen_crop": True,
+ "garg_crop": False,
+ "do_kb_crop": False,
+ "min_depth_eval": 0,
+ "max_depth_eval": 10,
+ "min_depth": 1e-3,
+ "max_depth": 10
+ },
+ "sunrgbd": {
+ "dataset": "sunrgbd",
+ "sunrgbd_root": os.path.join(HOME_DIR, "shortcuts/datasets/SUNRGBD/test/"),
+ "eigen_crop": True,
+ "garg_crop": False,
+ "do_kb_crop": False,
+ "min_depth_eval": 0,
+ "max_depth_eval": 8,
+ "min_depth": 1e-3,
+ "max_depth": 10
+ },
+ "diml_indoor": {
+ "dataset": "diml_indoor",
+ "diml_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_indoor_test/"),
+ "eigen_crop": True,
+ "garg_crop": False,
+ "do_kb_crop": False,
+ "min_depth_eval": 0,
+ "max_depth_eval": 10,
+ "min_depth": 1e-3,
+ "max_depth": 10
+ },
+ "diml_outdoor": {
+ "dataset": "diml_outdoor",
+ "diml_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_outdoor_test/"),
+ "eigen_crop": False,
+ "garg_crop": True,
+ "do_kb_crop": False,
+ "min_depth_eval": 2,
+ "max_depth_eval": 80,
+ "min_depth": 1e-3,
+ "max_depth": 80
+ },
+ "diode_indoor": {
+ "dataset": "diode_indoor",
+ "diode_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_indoor/"),
+ "eigen_crop": True,
+ "garg_crop": False,
+ "do_kb_crop": False,
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 10,
+ "min_depth": 1e-3,
+ "max_depth": 10
+ },
+ "diode_outdoor": {
+ "dataset": "diode_outdoor",
+ "diode_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_outdoor/"),
+ "eigen_crop": False,
+ "garg_crop": True,
+ "do_kb_crop": False,
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 80,
+ "min_depth": 1e-3,
+ "max_depth": 80
+ },
+ "hypersim_test": {
+ "dataset": "hypersim_test",
+ "hypersim_test_root": os.path.join(HOME_DIR, "shortcuts/datasets/hypersim_test/"),
+ "eigen_crop": True,
+ "garg_crop": False,
+ "do_kb_crop": False,
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 80,
+ "min_depth": 1e-3,
+ "max_depth": 10
+ },
+ "vkitti": {
+ "dataset": "vkitti",
+ "vkitti_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti_test/"),
+ "eigen_crop": False,
+ "garg_crop": True,
+ "do_kb_crop": True,
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 80,
+ "min_depth": 1e-3,
+ "max_depth": 80
+ },
+ "vkitti2": {
+ "dataset": "vkitti2",
+ "vkitti2_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti2/"),
+ "eigen_crop": False,
+ "garg_crop": True,
+ "do_kb_crop": True,
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 80,
+ "min_depth": 1e-3,
+ "max_depth": 80,
+ },
+ "ddad": {
+ "dataset": "ddad",
+ "ddad_root": os.path.join(HOME_DIR, "shortcuts/datasets/ddad/ddad_val/"),
+ "eigen_crop": False,
+ "garg_crop": True,
+ "do_kb_crop": True,
+ "min_depth_eval": 1e-3,
+ "max_depth_eval": 80,
+ "min_depth": 1e-3,
+ "max_depth": 80,
+ },
+}
+
+ALL_INDOOR = ["nyu", "ibims", "sunrgbd", "diode_indoor", "hypersim_test"]
+ALL_OUTDOOR = ["kitti", "diml_outdoor", "diode_outdoor", "vkitti2", "ddad"]
+ALL_EVAL_DATASETS = ALL_INDOOR + ALL_OUTDOOR
+
+COMMON_TRAINING_CONFIG = {
+ "dataset": "nyu",
+ "distributed": True,
+ "workers": 16,
+ "clip_grad": 0.1,
+ "use_shared_dict": False,
+ "shared_dict": None,
+ "use_amp": False,
+
+ "aug": True,
+ "random_crop": False,
+ "random_translate": False,
+ "translate_prob": 0.2,
+ "max_translation": 100,
+
+ "validate_every": 0.25,
+ "log_images_every": 0.1,
+ "prefetch": False,
+}
+
+
+def flatten(config, except_keys=('bin_conf',)):
+ def recurse(inp):
+ if isinstance(inp, dict):
+ for key, value in inp.items():
+ if key in except_keys:
+ yield (key, value)
+ if isinstance(value, dict):
+ yield from recurse(value)
+ else:
+ yield (key, value)
+
+ return dict(list(recurse(config)))
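+
+# Illustrative example (keys are hypothetical): only leaf key/value pairs are kept,
+# except for keys listed in except_keys, which are passed through untouched:
+#   flatten({"model": {"name": "zoedepth", "train": {"lr": 1e-4}}, "bin_conf": [{"n_bins": 64}]})
+#   -> {"name": "zoedepth", "lr": 1e-4, "bin_conf": [{"n_bins": 64}]}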
+
+
+def split_combined_args(kwargs):
+ """Splits the arguments that are combined with '__' into multiple arguments.
+ Combined arguments should have equal number of keys and values.
+ Keys are separated by '__' and Values are separated with ';'.
+ For example, '__n_bins__lr=256;0.001'
+
+ Args:
+ kwargs (dict): key-value pairs of arguments where key-value is optionally combined according to the above format.
+
+ Returns:
+ dict: Parsed dict with the combined arguments split into individual key-value pairs.
+ """
+ new_kwargs = dict(kwargs)
+ for key, value in kwargs.items():
+ if key.startswith("__"):
+ keys = key.split("__")[1:]
+ values = value.split(";")
+ assert len(keys) == len(
+ values), f"Combined arguments should have equal number of keys and values. Keys are separated by '__' and Values are separated with ';'. For example, '__n_bins__lr=256;0.001. Given (keys,values) is ({keys}, {values})"
+ for k, v in zip(keys, values):
+ new_kwargs[k] = v
+ return new_kwargs
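+
+# Illustrative example: the combined key stays in the dict and the split values
+# remain strings until infer_type() is applied later in get_config():
+#   split_combined_args({"__n_bins__lr": "256;0.001"})
+#   -> {"__n_bins__lr": "256;0.001", "n_bins": "256", "lr": "0.001"}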
+
+
+def parse_list(config, key, dtype=int):
+ """Parse a list of values for the key if the value is a string. The values are separated by a comma.
+ Modifies the config in place.
+ """
+ if key in config:
+ if isinstance(config[key], str):
+ config[key] = list(map(dtype, config[key].split(',')))
+ assert isinstance(config[key], list) and all([isinstance(e, dtype) for e in config[key]]
+ ), f"{key} should be a list of values dtype {dtype}. Given {config[key]} of type {type(config[key])} with values of type {[type(e) for e in config[key]]}."
+
+
+def get_model_config(model_name, model_version=None):
+ """Find and parse the .json config file for the model.
+
+ Args:
+ model_name (str): name of the model. The config file should be named config_{model_name}[_{model_version}].json under the models/{model_name} directory.
+ model_version (str, optional): Specific config version. If specified config_{model_name}_{model_version}.json is searched for and used. Otherwise config_{model_name}.json is used. Defaults to None.
+
+ Returns:
+ easydict: the config dictionary for the model.
+ """
+ config_fname = f"config_{model_name}_{model_version}.json" if model_version is not None else f"config_{model_name}.json"
+ config_file = os.path.join(ROOT, "models", model_name, config_fname)
+ if not os.path.exists(config_file):
+ return None
+
+ with open(config_file, "r") as f:
+ config = edict(json.load(f))
+
+ # handle dictionary inheritance
+ # only training config is supported for inheritance
+ if "inherit" in config.train and config.train.inherit is not None:
+ inherit_config = get_model_config(config.train["inherit"]).train
+ for key, value in inherit_config.items():
+ if key not in config.train:
+ config.train[key] = value
+ return edict(config)
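+
+# Illustrative path resolution (the "v1" version name here is hypothetical):
+#   get_model_config("zoedepth", "v1") reads {ROOT}/models/zoedepth/config_zoedepth_v1.json
+# and returns None if that file does not exist.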
+
+
+def update_model_config(config, mode, model_name, model_version=None, strict=False):
+ model_config = get_model_config(model_name, model_version)
+ if model_config is not None:
+ config = {**config, **
+ flatten({**model_config.model, **model_config[mode]})}
+ elif strict:
+ raise ValueError(f"Config file for model {model_name} not found.")
+ return config
+
+
+def check_choices(name, value, choices):
+ # return # No checks in dev branch
+ if value not in choices:
+ raise ValueError(f"{name} {value} not in supported choices {choices}")
+
+
+KEYS_TYPE_BOOL = ["use_amp", "distributed", "use_shared_dict", "same_lr", "aug", "three_phase",
+ "prefetch", "cycle_momentum"] # Casting is not necessary as their int casted values in config are 0 or 1
+
+
+def get_config(model_name, mode='train', dataset=None, **overwrite_kwargs):
+ """Main entry point to get the config for the model.
+
+ Args:
+ model_name (str): name of the desired model.
+ mode (str, optional): "train" or "infer". Defaults to 'train'.
+ dataset (str, optional): If specified, the corresponding dataset configuration is loaded as well. Defaults to None.
+
+ Keyword Args: key-value pairs of arguments to overwrite the default config.
+
+ The order of precedence for overwriting the config is (Higher precedence first):
+ # 1. overwrite_kwargs
+ # 2. "config_version": Config file version if specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{config_version}.json
+ # 3. "version_name": Default Model version specific config specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{version_name}.json
+ # 4. common_config: Default config for all models specified in COMMON_CONFIG
+
+ Returns:
+ easydict: The config dictionary for the model.
+ """
+
+ check_choices("Model", model_name, ["zoedepth", "zoedepth_nk"])
+ check_choices("Mode", mode, ["train", "infer", "eval"])
+ if mode == "train":
+ check_choices("Dataset", dataset, ["nyu", "kitti", "mix", None])
+
+ config = flatten({**COMMON_CONFIG, **COMMON_TRAINING_CONFIG})
+ config = update_model_config(config, mode, model_name)
+
+ # update with model version specific config
+ version_name = overwrite_kwargs.get("version_name", config["version_name"])
+ config = update_model_config(config, mode, model_name, version_name)
+
+ # update with config version if specified
+ config_version = overwrite_kwargs.get("config_version", None)
+ if config_version is not None:
+ print("Overwriting config with config_version", config_version)
+ config = update_model_config(config, mode, model_name, config_version)
+
+ # update with overwrite_kwargs
+ # Combined args are useful for hyperparameter search
+ overwrite_kwargs = split_combined_args(overwrite_kwargs)
+ config = {**config, **overwrite_kwargs}
+
+ # Casting to bool # TODO: Not necessary. Remove and test
+ for key in KEYS_TYPE_BOOL:
+ if key in config:
+ config[key] = bool(config[key])
+
+ # Model specific post processing of config
+ parse_list(config, "n_attractors")
+
+ # adjust n_bins for each bin configuration if bin_conf is given and n_bins is passed in overwrite_kwargs
+ if 'bin_conf' in config and 'n_bins' in overwrite_kwargs:
+ bin_conf = config['bin_conf'] # list of dicts
+ n_bins = overwrite_kwargs['n_bins']
+ new_bin_conf = []
+ for conf in bin_conf:
+ conf['n_bins'] = n_bins
+ new_bin_conf.append(conf)
+ config['bin_conf'] = new_bin_conf
+
+ if mode == "train":
+ orig_dataset = dataset
+ if dataset == "mix":
+ dataset = 'nyu' # Use nyu as default for mix. Dataset config is changed accordingly while loading the dataloader
+ if dataset is not None:
+ config['project'] = f"MonoDepth3-{orig_dataset}" # Set project for wandb
+
+ if dataset is not None:
+ config['dataset'] = dataset
+ config = {**DATASETS_CONFIG[dataset], **config}
+
+ config['model'] = model_name
+ # add hostname to config before the final type-inference pass so it ends up in the returned dict
+ config['hostname'] = platform.node()
+ typed_config = {k: infer_type(v) for k, v in config.items()}
+ return edict(typed_config)
+
+
+def change_dataset(config, new_dataset):
+ config.update(DATASETS_CONFIG[new_dataset])
+ return config
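+
+# Usage sketch (assumes the bundled config_zoedepth.json ships with the package):
+#   conf = get_config("zoedepth", mode="infer")
+#   conf = change_dataset(conf, "nyu")  # overlay the NYU dataset defaults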
diff --git a/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/easydict/__init__.py b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/easydict/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d0c83507325193c88566ae0c6aa1347dd41c411
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_controlnet_aux/zoe/zoedepth/utils/easydict/__init__.py
@@ -0,0 +1,158 @@
+"""
+EasyDict
+Copy/pasted from https://github.com/makinacorpus/easydict
+Original author: Mathieu Leplatre
+"""
+
+class EasyDict(dict):
+ """
+ Get attributes
+
+ >>> d = EasyDict({'foo':3})
+ >>> d['foo']
+ 3
+ >>> d.foo
+ 3
+ >>> d.bar
+ Traceback (most recent call last):
+ ...
+ AttributeError: 'EasyDict' object has no attribute 'bar'
+
+ Works recursively
+
+ >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}})
+ >>> isinstance(d.bar, dict)
+ True
+ >>> d.bar.x
+ 1
+
+ Bullet-proof
+
+ >>> EasyDict({})
+ {}
+ >>> EasyDict(d={})
+ {}
+ >>> EasyDict(None)
+ {}
+ >>> d = {'a': 1}
+ >>> EasyDict(**d)
+ {'a': 1}
+ >>> EasyDict((('a', 1), ('b', 2)))
+ {'a': 1, 'b': 2}
+
+ Set attributes
+
+ >>> d = EasyDict()
+ >>> d.foo = 3
+ >>> d.foo
+ 3
+ >>> d.bar = {'prop': 'value'}
+ >>> d.bar.prop
+ 'value'
+ >>> d
+ {'foo': 3, 'bar': {'prop': 'value'}}
+ >>> d.bar.prop = 'newer'
+ >>> d.bar.prop
+ 'newer'
+
+
+ Values extraction
+
+ >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]})
+ >>> isinstance(d.bar, list)
+ True
+ >>> from operator import attrgetter
+ >>> list(map(attrgetter('x'), d.bar))
+ [1, 3]
+ >>> list(map(attrgetter('y'), d.bar))
+ [2, 4]
+ >>> d = EasyDict()
+ >>> list(d.keys())
+ []
+ >>> d = EasyDict(foo=3, bar=dict(x=1, y=2))
+ >>> d.foo
+ 3
+ >>> d.bar.x
+ 1
+
+ Still like a dict though
+
+ >>> o = EasyDict({'clean':True})
+ >>> list(o.items())
+ [('clean', True)]
+
+ And like a class
+
+ >>> class Flower(EasyDict):
+ ... power = 1
+ ...
+ >>> f = Flower()
+ >>> f.power
+ 1
+ >>> f = Flower({'height': 12})
+ >>> f.height
+ 12
+ >>> f['power']
+ 1
+ >>> sorted(f.keys())
+ ['height', 'power']
+
+ update and pop items
+ >>> d = EasyDict(a=1, b='2')
+ >>> e = EasyDict(c=3.0, a=9.0)
+ >>> d.update(e)
+ >>> d.c
+ 3.0
+ >>> d['c']
+ 3.0
+ >>> d.get('c')
+ 3.0
+ >>> d.update(a=4, b=4)
+ >>> d.b
+ 4
+ >>> d.pop('a')
+ 4
+ >>> d.a
+ Traceback (most recent call last):
+ ...
+ AttributeError: 'EasyDict' object has no attribute 'a'
+ """
+ def __init__(self, d=None, **kwargs):
+ if d is None:
+ d = {}
+ else:
+ d = dict(d)
+ if kwargs:
+ d.update(**kwargs)
+ for k, v in d.items():
+ setattr(self, k, v)
+ # Class attributes
+ for k in self.__class__.__dict__.keys():
+ if not (k.startswith('__') and k.endswith('__')) and k not in ('update', 'pop'):
+ setattr(self, k, getattr(self, k))
+
+ def __setattr__(self, name, value):
+ if isinstance(value, (list, tuple)):
+ value = [self.__class__(x)
+ if isinstance(x, dict) else x for x in value]
+ elif isinstance(value, dict) and not isinstance(value, self.__class__):
+ value = self.__class__(value)
+ super(EasyDict, self).__setattr__(name, value)
+ super(EasyDict, self).__setitem__(name, value)
+
+ __setitem__ = __setattr__
+
+ def update(self, e=None, **f):
+ d = e or dict()
+ d.update(f)
+ for k in d:
+ setattr(self, k, d[k])
+
+ def pop(self, k, d=None):
+ delattr(self, k)
+ return super(EasyDict, self).pop(k, d)
+
+
+if __name__ == "__main__":
+ import doctest
+ doctest.testmod()
\ No newline at end of file
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7004e0fe43d772542d5cd74ee4fcd66b28949853
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .utils.env import setup_environment
+
+setup_environment()
+
+
+# This line will be programmatically read/written by setup.py.
+# Leave it at the bottom of this file and don't touch it.
+__version__ = "0.6"
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5bee26d20fafa6554df1b6b2e54b49e44feba4c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+# File:
+
+
+from . import catalog as _UNUSED # register the handler
+from .detection_checkpoint import DetectionCheckpointer
+from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
+
+__all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"]
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/c2_model_loading.py b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/c2_model_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..047ec29f1e2521f05eb1b04a9b5b5620e903819b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/c2_model_loading.py
@@ -0,0 +1,412 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import re
+from typing import Dict, List
+import torch
+from tabulate import tabulate
+
+
+def convert_basic_c2_names(original_keys):
+ """
+ Apply some basic name conversion to names in C2 weights.
+ It only deals with typical backbone models.
+
+ Args:
+ original_keys (list[str]):
+ Returns:
+ list[str]: The same number of strings matching those in original_keys.
+ """
+ layer_keys = copy.deepcopy(original_keys)
+ layer_keys = [
+ {"pred_b": "linear_b", "pred_w": "linear_w"}.get(k, k) for k in layer_keys
+ ] # some hard-coded mappings
+
+ layer_keys = [k.replace("_", ".") for k in layer_keys]
+ layer_keys = [re.sub("\\.b$", ".bias", k) for k in layer_keys]
+ layer_keys = [re.sub("\\.w$", ".weight", k) for k in layer_keys]
+ # Uniform both bn and gn names to "norm"
+ layer_keys = [re.sub("bn\\.s$", "norm.weight", k) for k in layer_keys]
+ layer_keys = [re.sub("bn\\.bias$", "norm.bias", k) for k in layer_keys]
+ layer_keys = [re.sub("bn\\.rm", "norm.running_mean", k) for k in layer_keys]
+ layer_keys = [re.sub("bn\\.running.mean$", "norm.running_mean", k) for k in layer_keys]
+ layer_keys = [re.sub("bn\\.riv$", "norm.running_var", k) for k in layer_keys]
+ layer_keys = [re.sub("bn\\.running.var$", "norm.running_var", k) for k in layer_keys]
+ layer_keys = [re.sub("bn\\.gamma$", "norm.weight", k) for k in layer_keys]
+ layer_keys = [re.sub("bn\\.beta$", "norm.bias", k) for k in layer_keys]
+ layer_keys = [re.sub("gn\\.s$", "norm.weight", k) for k in layer_keys]
+ layer_keys = [re.sub("gn\\.bias$", "norm.bias", k) for k in layer_keys]
+
+ # stem
+ layer_keys = [re.sub("^res\\.conv1\\.norm\\.", "conv1.norm.", k) for k in layer_keys]
+ # to avoid mis-matching with "conv1" in other components (e.g. detection head)
+ layer_keys = [re.sub("^conv1\\.", "stem.conv1.", k) for k in layer_keys]
+
+ # layer1-4 is used by torchvision, however we follow the C2 naming strategy (res2-5)
+ # layer_keys = [re.sub("^res2.", "layer1.", k) for k in layer_keys]
+ # layer_keys = [re.sub("^res3.", "layer2.", k) for k in layer_keys]
+ # layer_keys = [re.sub("^res4.", "layer3.", k) for k in layer_keys]
+ # layer_keys = [re.sub("^res5.", "layer4.", k) for k in layer_keys]
+
+ # blocks
+ layer_keys = [k.replace(".branch1.", ".shortcut.") for k in layer_keys]
+ layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys]
+ layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys]
+ layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys]
+
+ # DensePose substitutions
+ layer_keys = [re.sub("^body.conv.fcn", "body_conv_fcn", k) for k in layer_keys]
+ layer_keys = [k.replace("AnnIndex.lowres", "ann_index_lowres") for k in layer_keys]
+ layer_keys = [k.replace("Index.UV.lowres", "index_uv_lowres") for k in layer_keys]
+ layer_keys = [k.replace("U.lowres", "u_lowres") for k in layer_keys]
+ layer_keys = [k.replace("V.lowres", "v_lowres") for k in layer_keys]
+ return layer_keys
+
+
+def convert_c2_detectron_names(weights):
+ """
+ Map Caffe2 Detectron weight names to Detectron2 names.
+
+ Args:
+ weights (dict): name -> tensor
+
+ Returns:
+ dict: detectron2 names -> tensor
+ dict: detectron2 names -> C2 names
+ """
+ logger = logging.getLogger(__name__)
+ logger.info("Renaming Caffe2 weights ......")
+ original_keys = sorted(weights.keys())
+ layer_keys = copy.deepcopy(original_keys)
+
+ layer_keys = convert_basic_c2_names(layer_keys)
+
+ # --------------------------------------------------------------------------
+ # RPN hidden representation conv
+ # --------------------------------------------------------------------------
+ # FPN case
+ # In the C2 model, the RPN hidden layer conv is defined for FPN level 2 and then
+ # shared for all other levels, hence the appearance of "fpn2"
+ layer_keys = [
+ k.replace("conv.rpn.fpn2", "proposal_generator.rpn_head.conv") for k in layer_keys
+ ]
+ # Non-FPN case
+ layer_keys = [k.replace("conv.rpn", "proposal_generator.rpn_head.conv") for k in layer_keys]
+
+ # --------------------------------------------------------------------------
+ # RPN box transformation conv
+ # --------------------------------------------------------------------------
+ # FPN case (see note above about "fpn2")
+ layer_keys = [
+ k.replace("rpn.bbox.pred.fpn2", "proposal_generator.rpn_head.anchor_deltas")
+ for k in layer_keys
+ ]
+ layer_keys = [
+ k.replace("rpn.cls.logits.fpn2", "proposal_generator.rpn_head.objectness_logits")
+ for k in layer_keys
+ ]
+ # Non-FPN case
+ layer_keys = [
+ k.replace("rpn.bbox.pred", "proposal_generator.rpn_head.anchor_deltas") for k in layer_keys
+ ]
+ layer_keys = [
+ k.replace("rpn.cls.logits", "proposal_generator.rpn_head.objectness_logits")
+ for k in layer_keys
+ ]
+
+ # --------------------------------------------------------------------------
+ # Fast R-CNN box head
+ # --------------------------------------------------------------------------
+ layer_keys = [re.sub("^bbox\\.pred", "bbox_pred", k) for k in layer_keys]
+ layer_keys = [re.sub("^cls\\.score", "cls_score", k) for k in layer_keys]
+ layer_keys = [re.sub("^fc6\\.", "box_head.fc1.", k) for k in layer_keys]
+ layer_keys = [re.sub("^fc7\\.", "box_head.fc2.", k) for k in layer_keys]
+ # 4conv1fc head tensor names: head_conv1_w, head_conv1_gn_s
+ layer_keys = [re.sub("^head\\.conv", "box_head.conv", k) for k in layer_keys]
+
+ # --------------------------------------------------------------------------
+ # FPN lateral and output convolutions
+ # --------------------------------------------------------------------------
+ def fpn_map(name):
+ """
+ Look for keys with the following patterns:
+ 1) Starts with "fpn.inner."
+ Example: "fpn.inner.res2.2.sum.lateral.weight"
+ Meaning: These are lateral pathway convolutions
+ 2) Starts with "fpn.res"
+ Example: "fpn.res2.2.sum.weight"
+ Meaning: These are FPN output convolutions
+ """
+ splits = name.split(".")
+ norm = ".norm" if "norm" in splits else ""
+ if name.startswith("fpn.inner."):
+ # splits example: ['fpn', 'inner', 'res2', '2', 'sum', 'lateral', 'weight']
+ stage = int(splits[2][len("res") :])
+ return "fpn_lateral{}{}.{}".format(stage, norm, splits[-1])
+ elif name.startswith("fpn.res"):
+ # splits example: ['fpn', 'res2', '2', 'sum', 'weight']
+ stage = int(splits[1][len("res") :])
+ return "fpn_output{}{}.{}".format(stage, norm, splits[-1])
+ return name
+
+ layer_keys = [fpn_map(k) for k in layer_keys]
+
+ # --------------------------------------------------------------------------
+ # Mask R-CNN mask head
+ # --------------------------------------------------------------------------
+ # roi_heads.StandardROIHeads case
+ layer_keys = [k.replace(".[mask].fcn", "mask_head.mask_fcn") for k in layer_keys]
+ layer_keys = [re.sub("^\\.mask\\.fcn", "mask_head.mask_fcn", k) for k in layer_keys]
+ layer_keys = [k.replace("mask.fcn.logits", "mask_head.predictor") for k in layer_keys]
+ # roi_heads.Res5ROIHeads case
+ layer_keys = [k.replace("conv5.mask", "mask_head.deconv") for k in layer_keys]
+
+ # --------------------------------------------------------------------------
+ # Keypoint R-CNN head
+ # --------------------------------------------------------------------------
+ # interestingly, the keypoint head convs have blob names that are simply "conv_fcnX"
+ layer_keys = [k.replace("conv.fcn", "roi_heads.keypoint_head.conv_fcn") for k in layer_keys]
+ layer_keys = [
+ k.replace("kps.score.lowres", "roi_heads.keypoint_head.score_lowres") for k in layer_keys
+ ]
+ layer_keys = [k.replace("kps.score.", "roi_heads.keypoint_head.score.") for k in layer_keys]
+
+ # --------------------------------------------------------------------------
+ # Done with replacements
+ # --------------------------------------------------------------------------
+ assert len(set(layer_keys)) == len(layer_keys)
+ assert len(original_keys) == len(layer_keys)
+
+ new_weights = {}
+ new_keys_to_original_keys = {}
+ for orig, renamed in zip(original_keys, layer_keys):
+ new_keys_to_original_keys[renamed] = orig
+ if renamed.startswith("bbox_pred.") or renamed.startswith("mask_head.predictor."):
+ # remove the meaningless prediction weight for background class
+ new_start_idx = 4 if renamed.startswith("bbox_pred.") else 1
+ new_weights[renamed] = weights[orig][new_start_idx:]
+ logger.info(
+ "Remove prediction weight for background class in {}. The shape changes from "
+ "{} to {}.".format(
+ renamed, tuple(weights[orig].shape), tuple(new_weights[renamed].shape)
+ )
+ )
+ elif renamed.startswith("cls_score."):
+ # move weights of bg class from original index 0 to last index
+ logger.info(
+ "Move classification weights for background class in {} from index 0 to "
+ "index {}.".format(renamed, weights[orig].shape[0] - 1)
+ )
+ new_weights[renamed] = torch.cat([weights[orig][1:], weights[orig][:1]])
+ else:
+ new_weights[renamed] = weights[orig]
+
+ return new_weights, new_keys_to_original_keys
+
+
+# Note the current matching is not symmetric.
+# It assumes model_state_dict will have longer names.
+def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True):
+ """
+ Match names between the two state dicts, and return a new ckpt_state_dict with names
+ converted to match model_state_dict with heuristics. The returned dict can be later
+ loaded with fvcore checkpointer.
+ If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2
+ model and will be renamed at first.
+
+ Strategy: suppose that the models that we will create will have prefixes appended
+ to each of its keys, for example due to an extra level of nesting that the original
+ pre-trained weights from ImageNet won't contain. For example, model.state_dict()
+ might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains
+ res2.conv1.weight. We thus want to match both parameters together.
+ For that, we look for each model weight, look among all loaded keys if there is one
+ that is a suffix of the current weight name, and use it if that's the case.
+ If multiple matches exist, take the one whose matching name is longest.
+ For example, for the same model as before, the pretrained
+ weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case,
+ we want to match backbone[0].body.conv1.weight to conv1.weight, and
+ backbone[0].body.res2.conv1.weight to res2.conv1.weight.
+ """
+ model_keys = sorted(model_state_dict.keys())
+ if c2_conversion:
+ ckpt_state_dict, original_keys = convert_c2_detectron_names(ckpt_state_dict)
+ # original_keys: the name in the original dict (before renaming)
+ else:
+ original_keys = {x: x for x in ckpt_state_dict.keys()}
+ ckpt_keys = sorted(ckpt_state_dict.keys())
+
+ def match(a, b):
+ # Matched ckpt_key should be a complete (starts with '.') suffix.
+ # For example, roi_heads.mesh_head.whatever_conv1 does not match conv1,
+ # but matches whatever_conv1 or mesh_head.whatever_conv1.
+ return a == b or a.endswith("." + b)
+
+ # get a matrix of string matches, where each (i, j) entry corresponds to the size of the
+ # ckpt_key string, if it matches
+ match_matrix = [len(j) if match(i, j) else 0 for i in model_keys for j in ckpt_keys]
+ match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(ckpt_keys))
+ # use the matched one with longest size in case of multiple matches
+ max_match_size, idxs = match_matrix.max(1)
+ # remove indices that correspond to no-match
+ idxs[max_match_size == 0] = -1
+
+ logger = logging.getLogger(__name__)
+ # matched_pairs (matched checkpoint key --> matched model key)
+ matched_keys = {}
+ result_state_dict = {}
+ for idx_model, idx_ckpt in enumerate(idxs.tolist()):
+ if idx_ckpt == -1:
+ continue
+ key_model = model_keys[idx_model]
+ key_ckpt = ckpt_keys[idx_ckpt]
+ value_ckpt = ckpt_state_dict[key_ckpt]
+ shape_in_model = model_state_dict[key_model].shape
+
+ if shape_in_model != value_ckpt.shape:
+ logger.warning(
+ "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
+ key_ckpt, value_ckpt.shape, key_model, shape_in_model
+ )
+ )
+ logger.warning(
+ "{} will not be loaded. Please double check and see if this is desired.".format(
+ key_ckpt
+ )
+ )
+ continue
+
+ assert key_model not in result_state_dict
+ result_state_dict[key_model] = value_ckpt
+ if key_ckpt in matched_keys: # already added to matched_keys
+ logger.error(
+ "Ambiguity found for {} in checkpoint!"
+ "It matches at least two keys in the model ({} and {}).".format(
+ key_ckpt, key_model, matched_keys[key_ckpt]
+ )
+ )
+ raise ValueError("Cannot match one checkpoint key to multiple keys in the model.")
+
+ matched_keys[key_ckpt] = key_model
+
+ # logging:
+ matched_model_keys = sorted(matched_keys.values())
+ if len(matched_model_keys) == 0:
+ logger.warning("No weights in checkpoint matched with model.")
+ return ckpt_state_dict
+ common_prefix = _longest_common_prefix(matched_model_keys)
+ rev_matched_keys = {v: k for k, v in matched_keys.items()}
+ original_keys = {k: original_keys[rev_matched_keys[k]] for k in matched_model_keys}
+
+ model_key_groups = _group_keys_by_module(matched_model_keys, original_keys)
+ table = []
+ memo = set()
+ for key_model in matched_model_keys:
+ if key_model in memo:
+ continue
+ if key_model in model_key_groups:
+ group = model_key_groups[key_model]
+ memo |= set(group)
+ shapes = [tuple(model_state_dict[k].shape) for k in group]
+ table.append(
+ (
+ _longest_common_prefix([k[len(common_prefix) :] for k in group]) + "*",
+ _group_str([original_keys[k] for k in group]),
+ " ".join([str(x).replace(" ", "") for x in shapes]),
+ )
+ )
+ else:
+ key_checkpoint = original_keys[key_model]
+ shape = str(tuple(model_state_dict[key_model].shape))
+ table.append((key_model[len(common_prefix) :], key_checkpoint, shape))
+ table_str = tabulate(
+ table, tablefmt="pipe", headers=["Names in Model", "Names in Checkpoint", "Shapes"]
+ )
+ logger.info(
+ "Following weights matched with "
+ + (f"submodule {common_prefix[:-1]}" if common_prefix else "model")
+ + ":\n"
+ + table_str
+ )
+
+ unmatched_ckpt_keys = [k for k in ckpt_keys if k not in set(matched_keys.keys())]
+ for k in unmatched_ckpt_keys:
+ result_state_dict[k] = ckpt_state_dict[k]
+ return result_state_dict
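+
+# Illustrative suffix matching (hypothetical keys): a model key
+# "backbone.bottom_up.res2.0.conv1.weight" matches both checkpoint keys
+# "res2.0.conv1.weight" and "conv1.weight"; the longer match,
+# "res2.0.conv1.weight", is the one that gets loaded.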
+
+
+def _group_keys_by_module(keys: List[str], original_names: Dict[str, str]):
+ """
+ Params in the same submodule are grouped together.
+
+ Args:
+ keys: names of all parameters
+ original_names: mapping from parameter name to their name in the checkpoint
+
+ Returns:
+ dict[name -> all other names in the same group]
+ """
+
+ def _submodule_name(key):
+ pos = key.rfind(".")
+ if pos < 0:
+ return None
+ prefix = key[: pos + 1]
+ return prefix
+
+ all_submodules = [_submodule_name(k) for k in keys]
+ all_submodules = [x for x in all_submodules if x]
+ all_submodules = sorted(all_submodules, key=len)
+
+ ret = {}
+ for prefix in all_submodules:
+ group = [k for k in keys if k.startswith(prefix)]
+ if len(group) <= 1:
+ continue
+ original_name_lcp = _longest_common_prefix_str([original_names[k] for k in group])
+ if len(original_name_lcp) == 0:
+ # don't group weights if original names don't share prefix
+ continue
+
+ for k in group:
+ if k in ret:
+ continue
+ ret[k] = group
+ return ret
+
+
+def _longest_common_prefix(names: List[str]) -> str:
+ """
+ ["abc.zfg", "abc.zef"] -> "abc."
+ """
+ names = [n.split(".") for n in names]
+ m1, m2 = min(names), max(names)
+ ret = [a for a, b in zip(m1, m2) if a == b]
+ ret = ".".join(ret) + "." if len(ret) else ""
+ return ret
+
+
+def _longest_common_prefix_str(names: List[str]) -> str:
+ m1, m2 = min(names), max(names)
+ lcp = []
+ for a, b in zip(m1, m2):
+ if a == b:
+ lcp.append(a)
+ else:
+ break
+ lcp = "".join(lcp)
+ return lcp
+
+
+def _group_str(names: List[str]) -> str:
+ """
+ Turn "common1", "common2", "common3" into "common{1,2,3}"
+ """
+ lcp = _longest_common_prefix_str(names)
+ rest = [x[len(lcp) :] for x in names]
+ rest = "{" + ",".join(rest) + "}"
+ ret = lcp + rest
+
+ # add some simplification for BN specifically
+ ret = ret.replace("bn_{beta,running_mean,running_var,gamma}", "bn_*")
+ ret = ret.replace("bn_beta,bn_running_mean,bn_running_var,bn_gamma", "bn_*")
+ return ret
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/catalog.py b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..7426c3c8e43009073a2e2e3abf5d9b82856159e8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/catalog.py
@@ -0,0 +1,114 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+
+from custom_detectron2.utils.file_io import PathHandler, PathManager
+
+class ModelCatalog(object):
+ """
+ Store mappings from names to third-party models.
+ """
+
+ S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron"
+
+ # MSRA models have STRIDE_IN_1X1=True. False otherwise.
+ # NOTE: all BN models here have fused BN into an affine layer.
+ # As a result, you should only load them to a model with "FrozenBN".
+ # Loading them to a model with regular BN or SyncBN is wrong.
+ # Even when loaded to FrozenBN, it is still different from affine by an epsilon,
+ # which should be negligible for training.
+ # NOTE: all models here use PIXEL_STD=[1,1,1]
+ # NOTE: Most of the BN models here are no longer used. We use the
+ # re-converted pre-trained models under detectron2 model zoo instead.
+ C2_IMAGENET_MODELS = {
+ "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+ "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+ "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+ "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+ "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+ "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+ "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl",
+ }
+
+ C2_DETECTRON_PATH_FORMAT = (
+ "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl" # noqa B950
+ )
+
+ C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival"
+ C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival"
+
+ # format: {model_name} -> part of the url
+ C2_DETECTRON_MODELS = {
+ "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW", # noqa B950
+ "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I", # noqa B950
+ "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7", # noqa B950
+ "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ", # noqa B950
+ "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB", # noqa B950
+ "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC", # noqa B950
+ "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT", # noqa B950
+ "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI", # noqa B950
+ "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q", # noqa B950
+ "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao", # noqa B950
+ "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L", # noqa B950
+ "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179", # noqa B950
+ "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2", # noqa B950
+ }
+
+ @staticmethod
+ def get(name):
+ if name.startswith("Caffe2Detectron/COCO"):
+ return ModelCatalog._get_c2_detectron_baseline(name)
+ if name.startswith("ImageNetPretrained/"):
+ return ModelCatalog._get_c2_imagenet_pretrained(name)
+ raise RuntimeError("model not present in the catalog: {}".format(name))
+
+ @staticmethod
+ def _get_c2_imagenet_pretrained(name):
+ prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX
+ name = name[len("ImageNetPretrained/") :]
+ name = ModelCatalog.C2_IMAGENET_MODELS[name]
+ url = "/".join([prefix, name])
+ return url
+
+ @staticmethod
+ def _get_c2_detectron_baseline(name):
+ name = name[len("Caffe2Detectron/COCO/") :]
+ url = ModelCatalog.C2_DETECTRON_MODELS[name]
+ if "keypoint_rcnn" in name:
+ dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS
+ else:
+ dataset = ModelCatalog.C2_DATASET_COCO
+
+ if "35998355/rpn_R-50-C4_1x" in name:
+ # this one model is somehow different from others ..
+ type = "rpn"
+ else:
+ type = "generalized_rcnn"
+
+ # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`.
+ url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format(
+ prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset
+ )
+ return url
+
+
+class ModelCatalogHandler(PathHandler):
+ """
+ Resolve URLs of the form catalog://.
+ """
+
+ PREFIX = "catalog://"
+
+ def _get_supported_prefixes(self):
+ return [self.PREFIX]
+
+ def _get_local_path(self, path, **kwargs):
+ logger = logging.getLogger(__name__)
+ catalog_path = ModelCatalog.get(path[len(self.PREFIX) :])
+ logger.info("Catalog entry {} points to {}".format(path, catalog_path))
+ return PathManager.get_local_path(catalog_path, **kwargs)
+
+ def _open(self, path, mode="r", **kwargs):
+ return PathManager.open(self._get_local_path(path), mode, **kwargs)
+
+
+PathManager.register_handler(ModelCatalogHandler())
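+
+# Usage sketch: once the handler is registered, PathManager can resolve
+# catalog:// URIs, e.g.
+#   PathManager.get_local_path("catalog://ImageNetPretrained/MSRA/R-50")
+# maps the name through ModelCatalog to its dl.fbaipublicfiles.com URL and then
+# resolves that URL like any other remote path.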
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/detection_checkpoint.py b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/detection_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d43fc0194e5fc0a781c8772809d400f4734a9e0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/checkpoint/detection_checkpoint.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import os
+import pickle
+from urllib.parse import parse_qs, urlparse
+import torch
+from fvcore.common.checkpoint import Checkpointer
+from torch.nn.parallel import DistributedDataParallel
+
+import custom_detectron2.utils.comm as comm
+from custom_detectron2.utils.file_io import PathManager
+
+from .c2_model_loading import align_and_update_state_dicts
+
+
+class DetectionCheckpointer(Checkpointer):
+ """
+ Same as :class:`Checkpointer`, but is able to:
+ 1. handle models in detectron & detectron2 model zoo, and apply conversions for legacy models.
+ 2. correctly load checkpoints that are only available on the master worker
+ """
+
+ def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
+ is_main_process = comm.is_main_process()
+ super().__init__(
+ model,
+ save_dir,
+ save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
+ **checkpointables,
+ )
+ self.path_manager = PathManager
+ self._parsed_url_during_load = None
+
+ def load(self, path, *args, **kwargs):
+ assert self._parsed_url_during_load is None
+ need_sync = False
+ logger = logging.getLogger(__name__)
+ logger.info("[DetectionCheckpointer] Loading from {} ...".format(path))
+
+ if path and isinstance(self.model, DistributedDataParallel):
+ path = self.path_manager.get_local_path(path)
+ has_file = os.path.isfile(path)
+ all_has_file = comm.all_gather(has_file)
+ if not all_has_file[0]:
+ raise OSError(f"File {path} not found on main worker.")
+ if not all(all_has_file):
+ logger.warning(
+ f"Not all workers can read checkpoint {path}. "
+ "Training may fail to fully resume."
+ )
+ # TODO: broadcast the checkpoint file contents from main
+ # worker, and load from it instead.
+ need_sync = True
+ if not has_file:
+ path = None # don't load if not readable
+
+ if path:
+ parsed_url = urlparse(path)
+ self._parsed_url_during_load = parsed_url
+ path = parsed_url._replace(query="").geturl() # remove query from filename
+ path = self.path_manager.get_local_path(path)
+
+ self.logger.setLevel('CRITICAL')
+ ret = super().load(path, *args, **kwargs)
+
+ if need_sync:
+ logger.info("Broadcasting model states from main worker ...")
+ self.model._sync_params_and_buffers()
+ self._parsed_url_during_load = None # reset to None
+ return ret
+
+ def _load_file(self, filename):
+ if filename.endswith(".pkl"):
+ with PathManager.open(filename, "rb") as f:
+ data = pickle.load(f, encoding="latin1")
+ if "model" in data and "__author__" in data:
+ # file is in Detectron2 model zoo format
+ self.logger.info("Reading a file from '{}'".format(data["__author__"]))
+ return data
+ else:
+ # assume file is from Caffe2 / Detectron1 model zoo
+ if "blobs" in data:
+ # Detection models have "blobs", but ImageNet models don't
+ data = data["blobs"]
+ data = {k: v for k, v in data.items() if not k.endswith("_momentum")}
+ return {"model": data, "__author__": "Caffe2", "matching_heuristics": True}
+ elif filename.endswith(".pyth"):
+ # assume file is from pycls; no one else seems to use the ".pyth" extension
+ with PathManager.open(filename, "rb") as f:
+ data = torch.load(f)
+ assert (
+ "model_state" in data
+ ), f"Cannot load .pyth file {filename}; pycls checkpoints must contain 'model_state'."
+ model_state = {
+ k: v
+ for k, v in data["model_state"].items()
+ if not k.endswith("num_batches_tracked")
+ }
+ return {"model": model_state, "__author__": "pycls", "matching_heuristics": True}
+
+ loaded = self._torch_load(filename)
+ if "model" not in loaded:
+ loaded = {"model": loaded}
+ assert self._parsed_url_during_load is not None, "`_load_file` must be called inside `load`"
+ parsed_url = self._parsed_url_during_load
+ queries = parse_qs(parsed_url.query)
+ if queries.pop("matching_heuristics", "False") == ["True"]:
+ loaded["matching_heuristics"] = True
+ if len(queries) > 0:
+ raise ValueError(
+ f"Unsupported query remaining: f{queries}, orginal filename: {parsed_url.geturl()}"
+ )
+ return loaded
+
+ def _torch_load(self, f):
+ return super()._load_file(f)
+
+ def _load_model(self, checkpoint):
+ if checkpoint.get("matching_heuristics", False):
+ self._convert_ndarray_to_tensor(checkpoint["model"])
+ # convert weights by name-matching heuristics
+ checkpoint["model"] = align_and_update_state_dicts(
+ self.model.state_dict(),
+ checkpoint["model"],
+ c2_conversion=checkpoint.get("__author__", None) == "Caffe2",
+ )
+ # for non-caffe2 models, use standard ways to load it
+ incompatible = super()._load_model(checkpoint)
+
+ model_buffers = dict(self.model.named_buffers(recurse=False))
+ for k in ["pixel_mean", "pixel_std"]:
+ # Ignore missing key message about pixel_mean/std.
+ # Though they may be missing in old checkpoints, they will be correctly
+ # initialized from config anyway.
+ if k in model_buffers:
+ try:
+ incompatible.missing_keys.remove(k)
+ except ValueError:
+ pass
+ for k in incompatible.unexpected_keys[:]:
+ # Ignore unexpected keys about cell anchors. They exist in old checkpoints
+ # but now they are non-persistent buffers and will not be in new checkpoints.
+ if "anchor_generator.cell_anchors" in k:
+ incompatible.unexpected_keys.remove(k)
+ return incompatible
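+
+# Usage sketch (illustrative; `model` is any torch.nn.Module built from a detectron2 config):
+#   checkpointer = DetectionCheckpointer(model, save_dir="output")
+#   checkpointer.load("path/to/model_final.pkl")  # .pkl, .pyth and regular torch checkpoints are handled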
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/config/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b209d8ec84bbe177958920c3b53127ae42e5b68e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/config/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .compat import downgrade_config, upgrade_config
+from .config import CfgNode, get_cfg, global_cfg, set_global_cfg, configurable
+from .instantiate import instantiate
+from .lazy import LazyCall, LazyConfig
+
+__all__ = [
+ "CfgNode",
+ "get_cfg",
+ "global_cfg",
+ "set_global_cfg",
+ "downgrade_config",
+ "upgrade_config",
+ "configurable",
+ "instantiate",
+ "LazyCall",
+ "LazyConfig",
+]
+
+
+from custom_detectron2.utils.env import fixup_module_metadata
+
+fixup_module_metadata(__name__, globals(), __all__)
+del fixup_module_metadata
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/config/compat.py b/comfyui_controlnet_aux/src/custom_detectron2/config/compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad60c9360c2474d07f4c0d17ec5f37fb0dce36cd
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/config/compat.py
@@ -0,0 +1,229 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Backward compatibility of configs.
+
+Instructions to bump version:
++ It's not needed to bump version if new keys are added.
+ It's only needed when backward-incompatible changes happen
+ (i.e., some existing keys disappear, or the meaning of a key changes)
++ To bump version, do the following:
+ 1. Increment _C.VERSION in defaults.py
+ 2. Add a converter in this file.
+
+ Each ConverterVX has a function "upgrade" which in-place upgrades config from X-1 to X,
+ and a function "downgrade" which in-place downgrades config from X to X-1
+
+ In each function, VERSION is left unchanged.
+
+ Each converter assumes that its input has the relevant keys
+ (i.e., the input is not a partial config).
+ 3. Run the tests (test_config.py) to make sure the upgrade & downgrade
+ functions are consistent.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+from .config import CfgNode as CN
+from .defaults import _C
+
+__all__ = ["upgrade_config", "downgrade_config"]
+
+
+def upgrade_config(cfg: CN, to_version: Optional[int] = None) -> CN:
+ """
+ Upgrade a config from its current version to a newer version.
+
+ Args:
+ cfg (CfgNode):
+ to_version (int): defaults to the latest version.
+ """
+ cfg = cfg.clone()
+ if to_version is None:
+ to_version = _C.VERSION
+
+ assert cfg.VERSION <= to_version, "Cannot upgrade from v{} to v{}!".format(
+ cfg.VERSION, to_version
+ )
+ for k in range(cfg.VERSION, to_version):
+ converter = globals()["ConverterV" + str(k + 1)]
+ converter.upgrade(cfg)
+ cfg.VERSION = k + 1
+ return cfg
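+
+# Usage sketch: a config saved at VERSION 1 is brought up to date with
+#   cfg = upgrade_config(old_cfg)
+# which runs ConverterV2.upgrade(cfg) for each missing step and bumps cfg.VERSION accordingly.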
+
+
+def downgrade_config(cfg: CN, to_version: int) -> CN:
+ """
+ Downgrade a config from its current version to an older version.
+
+ Args:
+ cfg (CfgNode):
+ to_version (int):
+
+ Note:
+ A general downgrade of arbitrary configs is not always possible due to the
+ different functionalities in different versions.
+ The purpose of downgrade is only to recover the defaults in old versions,
+ allowing it to load an old partial yaml config.
+ Therefore, the implementation only needs to fill in the default values
+ in the old version when a general downgrade is not possible.
+ """
+ cfg = cfg.clone()
+ assert cfg.VERSION >= to_version, "Cannot downgrade from v{} to v{}!".format(
+ cfg.VERSION, to_version
+ )
+ for k in range(cfg.VERSION, to_version, -1):
+ converter = globals()["ConverterV" + str(k)]
+ converter.downgrade(cfg)
+ cfg.VERSION = k - 1
+ return cfg
+
+
+def guess_version(cfg: CN, filename: str) -> int:
+ """
+ Guess the version of a partial config where the VERSION field is not specified.
+ Returns the version, or the latest version if it cannot make a guess.
+
+ This makes it easier for users to migrate.
+ """
+ logger = logging.getLogger(__name__)
+
+ def _has(name: str) -> bool:
+ cur = cfg
+ for n in name.split("."):
+ if n not in cur:
+ return False
+ cur = cur[n]
+ return True
+
+ # Most users' partial configs have "MODEL.WEIGHT", so guess on it
+ ret = None
+ if _has("MODEL.WEIGHT") or _has("TEST.AUG_ON"):
+ ret = 1
+
+ if ret is not None:
+ logger.warning("Config '{}' has no VERSION. Assuming it to be v{}.".format(filename, ret))
+ else:
+ ret = _C.VERSION
+ logger.warning(
+ "Config '{}' has no VERSION. Assuming it to be compatible with latest v{}.".format(
+ filename, ret
+ )
+ )
+ return ret
+
+
+def _rename(cfg: CN, old: str, new: str) -> None:
+ old_keys = old.split(".")
+ new_keys = new.split(".")
+
+ def _set(key_seq: List[str], val: str) -> None:
+ cur = cfg
+ for k in key_seq[:-1]:
+ if k not in cur:
+ cur[k] = CN()
+ cur = cur[k]
+ cur[key_seq[-1]] = val
+
+ def _get(key_seq: List[str]) -> CN:
+ cur = cfg
+ for k in key_seq:
+ cur = cur[k]
+ return cur
+
+ def _del(key_seq: List[str]) -> None:
+ cur = cfg
+ for k in key_seq[:-1]:
+ cur = cur[k]
+ del cur[key_seq[-1]]
+ if len(cur) == 0 and len(key_seq) > 1:
+ _del(key_seq[:-1])
+
+ _set(new_keys, _get(old_keys))
+ _del(old_keys)
+
+
+class _RenameConverter:
+ """
+ A converter that handles simple rename.
+ """
+
+ RENAME: List[Tuple[str, str]] = [] # list of tuples of (old name, new name)
+
+ @classmethod
+ def upgrade(cls, cfg: CN) -> None:
+ for old, new in cls.RENAME:
+ _rename(cfg, old, new)
+
+ @classmethod
+ def downgrade(cls, cfg: CN) -> None:
+ for old, new in cls.RENAME[::-1]:
+ _rename(cfg, new, old)
+
+
+class ConverterV1(_RenameConverter):
+ RENAME = [("MODEL.RPN_HEAD.NAME", "MODEL.RPN.HEAD_NAME")]
+
+
+class ConverterV2(_RenameConverter):
+ """
+ A large bulk of rename, before public release.
+ """
+
+ RENAME = [
+ ("MODEL.WEIGHT", "MODEL.WEIGHTS"),
+ ("MODEL.PANOPTIC_FPN.SEMANTIC_LOSS_SCALE", "MODEL.SEM_SEG_HEAD.LOSS_WEIGHT"),
+ ("MODEL.PANOPTIC_FPN.RPN_LOSS_SCALE", "MODEL.RPN.LOSS_WEIGHT"),
+ ("MODEL.PANOPTIC_FPN.INSTANCE_LOSS_SCALE", "MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT"),
+ ("MODEL.PANOPTIC_FPN.COMBINE_ON", "MODEL.PANOPTIC_FPN.COMBINE.ENABLED"),
+ (
+ "MODEL.PANOPTIC_FPN.COMBINE_OVERLAP_THRESHOLD",
+ "MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH",
+ ),
+ (
+ "MODEL.PANOPTIC_FPN.COMBINE_STUFF_AREA_LIMIT",
+ "MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT",
+ ),
+ (
+ "MODEL.PANOPTIC_FPN.COMBINE_INSTANCES_CONFIDENCE_THRESHOLD",
+ "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH",
+ ),
+ ("MODEL.ROI_HEADS.SCORE_THRESH", "MODEL.ROI_HEADS.SCORE_THRESH_TEST"),
+ ("MODEL.ROI_HEADS.NMS", "MODEL.ROI_HEADS.NMS_THRESH_TEST"),
+ ("MODEL.RETINANET.INFERENCE_SCORE_THRESHOLD", "MODEL.RETINANET.SCORE_THRESH_TEST"),
+ ("MODEL.RETINANET.INFERENCE_TOPK_CANDIDATES", "MODEL.RETINANET.TOPK_CANDIDATES_TEST"),
+ ("MODEL.RETINANET.INFERENCE_NMS_THRESHOLD", "MODEL.RETINANET.NMS_THRESH_TEST"),
+ ("TEST.DETECTIONS_PER_IMG", "TEST.DETECTIONS_PER_IMAGE"),
+ ("TEST.AUG_ON", "TEST.AUG.ENABLED"),
+ ("TEST.AUG_MIN_SIZES", "TEST.AUG.MIN_SIZES"),
+ ("TEST.AUG_MAX_SIZE", "TEST.AUG.MAX_SIZE"),
+ ("TEST.AUG_FLIP", "TEST.AUG.FLIP"),
+ ]
+
+ @classmethod
+ def upgrade(cls, cfg: CN) -> None:
+ super().upgrade(cfg)
+
+ if cfg.MODEL.META_ARCHITECTURE == "RetinaNet":
+ _rename(
+ cfg, "MODEL.RETINANET.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS"
+ )
+ _rename(cfg, "MODEL.RETINANET.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+ del cfg["MODEL"]["RPN"]["ANCHOR_SIZES"]
+ del cfg["MODEL"]["RPN"]["ANCHOR_ASPECT_RATIOS"]
+ else:
+ _rename(cfg, "MODEL.RPN.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS")
+ _rename(cfg, "MODEL.RPN.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+ del cfg["MODEL"]["RETINANET"]["ANCHOR_SIZES"]
+ del cfg["MODEL"]["RETINANET"]["ANCHOR_ASPECT_RATIOS"]
+ del cfg["MODEL"]["RETINANET"]["ANCHOR_STRIDES"]
+
+ @classmethod
+ def downgrade(cls, cfg: CN) -> None:
+ super().downgrade(cfg)
+
+ _rename(cfg, "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS", "MODEL.RPN.ANCHOR_ASPECT_RATIOS")
+ _rename(cfg, "MODEL.ANCHOR_GENERATOR.SIZES", "MODEL.RPN.ANCHOR_SIZES")
+ cfg.MODEL.RETINANET.ANCHOR_ASPECT_RATIOS = cfg.MODEL.RPN.ANCHOR_ASPECT_RATIOS
+ cfg.MODEL.RETINANET.ANCHOR_SIZES = cfg.MODEL.RPN.ANCHOR_SIZES
+ cfg.MODEL.RETINANET.ANCHOR_STRIDES = [] # this is not used anywhere in any version
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/config/config.py b/comfyui_controlnet_aux/src/custom_detectron2/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..926dfd568a17d2824258c71a25df17994b00fe7e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/config/config.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import functools
+import inspect
+import logging
+from fvcore.common.config import CfgNode as _CfgNode
+
+from custom_detectron2.utils.file_io import PathManager
+
+
+class CfgNode(_CfgNode):
+ """
+ The same as `fvcore.common.config.CfgNode`, but different in:
+
+ 1. Use unsafe yaml loading by default.
+ Note that this may lead to arbitrary code execution: you must not
+ load a config file from untrusted sources before manually inspecting
+ the content of the file.
+ 2. Support config versioning.
+ When attempting to merge an old config, it will convert the old config automatically.
+
+ .. automethod:: clone
+ .. automethod:: freeze
+ .. automethod:: defrost
+ .. automethod:: is_frozen
+ .. automethod:: load_yaml_with_base
+ .. automethod:: merge_from_list
+ .. automethod:: merge_from_other_cfg
+ """
+
+ @classmethod
+ def _open_cfg(cls, filename):
+ return PathManager.open(filename, "r")
+
+ # Note that the default value of allow_unsafe is changed to True
+ def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None:
+ """
+ Load content from the given config file and merge it into self.
+
+ Args:
+ cfg_filename: config filename
+ allow_unsafe: allow unsafe yaml syntax
+ """
+ assert PathManager.isfile(cfg_filename), f"Config file '{cfg_filename}' does not exist!"
+ loaded_cfg = self.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe)
+ loaded_cfg = type(self)(loaded_cfg)
+
+ # defaults.py needs to import CfgNode
+ from .defaults import _C
+
+ latest_ver = _C.VERSION
+ assert (
+ latest_ver == self.VERSION
+ ), "CfgNode.merge_from_file is only allowed on a config object of latest version!"
+
+ logger = logging.getLogger(__name__)
+
+ loaded_ver = loaded_cfg.get("VERSION", None)
+ if loaded_ver is None:
+ from .compat import guess_version
+
+ loaded_ver = guess_version(loaded_cfg, cfg_filename)
+ assert loaded_ver <= self.VERSION, "Cannot merge a v{} config into a v{} config.".format(
+ loaded_ver, self.VERSION
+ )
+
+ if loaded_ver == self.VERSION:
+ self.merge_from_other_cfg(loaded_cfg)
+ else:
+ # compat.py needs to import CfgNode
+ from .compat import upgrade_config, downgrade_config
+
+ logger.warning(
+ "Loading an old v{} config file '{}' by automatically upgrading to v{}. "
+ "See docs/CHANGELOG.md for instructions to update your files.".format(
+ loaded_ver, cfg_filename, self.VERSION
+ )
+ )
+ # To convert, first obtain a full config at an old version
+ old_self = downgrade_config(self, to_version=loaded_ver)
+ old_self.merge_from_other_cfg(loaded_cfg)
+ new_config = upgrade_config(old_self)
+ self.clear()
+ self.update(new_config)
+
+ def dump(self, *args, **kwargs):
+ """
+ Returns:
+ str: a yaml string representation of the config
+ """
+ # to make it show up in docs
+ return super().dump(*args, **kwargs)
+
+
+global_cfg = CfgNode()
+
+
+def get_cfg() -> CfgNode:
+ """
+ Get a copy of the default config.
+
+ Returns:
+ a detectron2 CfgNode instance.
+ """
+ from .defaults import _C
+
+ return _C.clone()
+
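+# Usage sketch (illustrative; the yaml path is a placeholder, not a file shipped with
+# this package): a typical way the pieces above fit together.
+#
+#   cfg = get_cfg()
+#   cfg.merge_from_file("path/to/your_config.yaml")  # old versions are upgraded automatically
+#   cfg.MODEL.DEVICE = "cpu"
+#   cfg.freeze()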
+
+def set_global_cfg(cfg: CfgNode) -> None:
+ """
+ Let the global config point to the given cfg.
+
+    Assume that the given "cfg" has the key "KEY"; after calling
+    `set_global_cfg(cfg)`, the key can be accessed by:
+ ::
+ from custom_detectron2.config import global_cfg
+ print(global_cfg.KEY)
+
+ By using a hacky global config, you can access these configs anywhere,
+ without having to pass the config object or the values deep into the code.
+ This is a hacky feature introduced for quick prototyping / research exploration.
+ """
+ global global_cfg
+ global_cfg.clear()
+ global_cfg.update(cfg)
+
+
+def configurable(init_func=None, *, from_config=None):
+ """
+ Decorate a function or a class's __init__ method so that it can be called
+ with a :class:`CfgNode` object using a :func:`from_config` function that translates
+ :class:`CfgNode` to arguments.
+
+ Examples:
+ ::
+ # Usage 1: Decorator on __init__:
+ class A:
+ @configurable
+ def __init__(self, a, b=2, c=3):
+ pass
+
+ @classmethod
+ def from_config(cls, cfg): # 'cfg' must be the first argument
+ # Returns kwargs to be passed to __init__
+ return {"a": cfg.A, "b": cfg.B}
+
+ a1 = A(a=1, b=2) # regular construction
+ a2 = A(cfg) # construct with a cfg
+ a3 = A(cfg, b=3, c=4) # construct with extra overwrite
+
+ # Usage 2: Decorator on any function. Needs an extra from_config argument:
+        @configurable(from_config=lambda cfg: {"a": cfg.A, "b": cfg.B})
+ def a_func(a, b=2, c=3):
+ pass
+
+ a1 = a_func(a=1, b=2) # regular call
+ a2 = a_func(cfg) # call with a cfg
+ a3 = a_func(cfg, b=3, c=4) # call with extra overwrite
+
+ Args:
+ init_func (callable): a class's ``__init__`` method in usage 1. The
+ class must have a ``from_config`` classmethod which takes `cfg` as
+ the first argument.
+ from_config (callable): the from_config function in usage 2. It must take `cfg`
+ as its first argument.
+ """
+
+ if init_func is not None:
+ assert (
+ inspect.isfunction(init_func)
+ and from_config is None
+ and init_func.__name__ == "__init__"
+ ), "Incorrect use of @configurable. Check API documentation for examples."
+
+ @functools.wraps(init_func)
+ def wrapped(self, *args, **kwargs):
+ try:
+ from_config_func = type(self).from_config
+ except AttributeError as e:
+ raise AttributeError(
+ "Class with @configurable must have a 'from_config' classmethod."
+ ) from e
+ if not inspect.ismethod(from_config_func):
+ raise TypeError("Class with @configurable must have a 'from_config' classmethod.")
+
+ if _called_with_cfg(*args, **kwargs):
+ explicit_args = _get_args_from_config(from_config_func, *args, **kwargs)
+ init_func(self, **explicit_args)
+ else:
+ init_func(self, *args, **kwargs)
+
+ return wrapped
+
+ else:
+ if from_config is None:
+ return configurable # @configurable() is made equivalent to @configurable
+ assert inspect.isfunction(
+ from_config
+ ), "from_config argument of configurable must be a function!"
+
+ def wrapper(orig_func):
+ @functools.wraps(orig_func)
+ def wrapped(*args, **kwargs):
+ if _called_with_cfg(*args, **kwargs):
+ explicit_args = _get_args_from_config(from_config, *args, **kwargs)
+ return orig_func(**explicit_args)
+ else:
+ return orig_func(*args, **kwargs)
+
+ wrapped.from_config = from_config
+ return wrapped
+
+ return wrapper
+
+
+def _get_args_from_config(from_config_func, *args, **kwargs):
+ """
+ Use `from_config` to obtain explicit arguments.
+
+ Returns:
+ dict: arguments to be used for cls.__init__
+ """
+ signature = inspect.signature(from_config_func)
+ if list(signature.parameters.keys())[0] != "cfg":
+ if inspect.isfunction(from_config_func):
+ name = from_config_func.__name__
+ else:
+ name = f"{from_config_func.__self__}.from_config"
+ raise TypeError(f"{name} must take 'cfg' as the first argument!")
+ support_var_arg = any(
+ param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD]
+ for param in signature.parameters.values()
+ )
+ if support_var_arg: # forward all arguments to from_config, if from_config accepts them
+ ret = from_config_func(*args, **kwargs)
+ else:
+ # forward supported arguments to from_config
+ supported_arg_names = set(signature.parameters.keys())
+ extra_kwargs = {}
+ for name in list(kwargs.keys()):
+ if name not in supported_arg_names:
+ extra_kwargs[name] = kwargs.pop(name)
+ ret = from_config_func(*args, **kwargs)
+ # forward the other arguments to __init__
+ ret.update(extra_kwargs)
+ return ret
+
+
+def _called_with_cfg(*args, **kwargs):
+ """
+ Returns:
+ bool: whether the arguments contain CfgNode and should be considered
+ forwarded to from_config.
+ """
+ from omegaconf import DictConfig
+
+ if len(args) and isinstance(args[0], (_CfgNode, DictConfig)):
+ return True
+ if isinstance(kwargs.pop("cfg", None), (_CfgNode, DictConfig)):
+ return True
+ # `from_config`'s first argument is forced to be "cfg".
+ # So the above check covers all cases.
+ return False
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/config/defaults.py b/comfyui_controlnet_aux/src/custom_detectron2/config/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..98ca185425170a823f623c7dae9abef07f49752f
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/config/defaults.py
@@ -0,0 +1,650 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .config import CfgNode as CN
+
+# NOTE: given the new config system
+# (https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html),
+# we will stop adding new functionalities to default CfgNode.
+
+# -----------------------------------------------------------------------------
+# Convention about Training / Test specific parameters
+# -----------------------------------------------------------------------------
+# Whenever an argument can be either used for training or for testing, the
+# corresponding name will be post-fixed by a _TRAIN for a training parameter,
+# or _TEST for a test-specific parameter.
+# For example, the number of images during training will be
+# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
+# IMAGES_PER_BATCH_TEST
+
+# -----------------------------------------------------------------------------
+# Config definition
+# -----------------------------------------------------------------------------
+
+_C = CN()
+
+# The version number, to upgrade from old configs to new ones if any
+# changes happen. It's recommended to keep a VERSION in your config file.
+_C.VERSION = 2
+
+_C.MODEL = CN()
+_C.MODEL.LOAD_PROPOSALS = False
+_C.MODEL.MASK_ON = False
+_C.MODEL.KEYPOINT_ON = False
+_C.MODEL.DEVICE = "cuda"
+_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
+
+# Path (a file path, or URL like detectron2://.., https://..) to a checkpoint file
+# to be loaded to the model. You can find available models in the model zoo.
+_C.MODEL.WEIGHTS = ""
+
+# Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR).
+# To train on images with a different number of channels, just set different mean & std.
+# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
+_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675]
+# When using pre-trained models in Detectron1 or any MSRA models,
+# std has been absorbed into its conv1 weights, so the std needs to be set to 1.
+# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
+_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0]
+
+
+# -----------------------------------------------------------------------------
+# INPUT
+# -----------------------------------------------------------------------------
+_C.INPUT = CN()
+# By default, {MIN,MAX}_SIZE options are used in transforms.ResizeShortestEdge.
+# Please refer to ResizeShortestEdge for detailed definition.
+# Size of the smallest side of the image during training
+_C.INPUT.MIN_SIZE_TRAIN = (800,)
+# Sample the size of the smallest side by choice or random selection from the range given by
+# INPUT.MIN_SIZE_TRAIN
+_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
+# Maximum size of the side of the image during training
+_C.INPUT.MAX_SIZE_TRAIN = 1333
+# Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
+_C.INPUT.MIN_SIZE_TEST = 800
+# Maximum size of the side of the image during testing
+_C.INPUT.MAX_SIZE_TEST = 1333
+# Mode for flipping images used in data augmentation during training
+# choose one of ["horizontal, "vertical", "none"]
+_C.INPUT.RANDOM_FLIP = "horizontal"
+
+# `True` if cropping is used for data augmentation during training
+_C.INPUT.CROP = CN({"ENABLED": False})
+# Cropping type. See documentation of `detectron2.data.transforms.RandomCrop` for explanation.
+_C.INPUT.CROP.TYPE = "relative_range"
+# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of
+# pixels if CROP.TYPE is "absolute"
+_C.INPUT.CROP.SIZE = [0.9, 0.9]
+
+
+# Whether the model needs RGB, YUV, HSV etc.
+# Should be one of the modes defined here, as we use PIL to read the image:
+# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
+# with BGR being the one exception. One can set the image format to BGR; we will
+# internally use RGB for conversion and flip the channels over.
+_C.INPUT.FORMAT = "BGR"
+# The ground truth mask format that the model will use.
+# Mask R-CNN supports either "polygon" or "bitmask" as ground truth.
+_C.INPUT.MASK_FORMAT = "polygon" # alternative: "bitmask"
+
+
+# -----------------------------------------------------------------------------
+# Dataset
+# -----------------------------------------------------------------------------
+_C.DATASETS = CN()
+# List of the dataset names for training. Must be registered in DatasetCatalog
+# Samples from these datasets will be merged and used as one dataset.
+_C.DATASETS.TRAIN = ()
+# List of the pre-computed proposal files for training, which must be consistent
+# with datasets listed in DATASETS.TRAIN.
+_C.DATASETS.PROPOSAL_FILES_TRAIN = ()
+# Number of top scoring precomputed proposals to keep for training
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000
+# List of the dataset names for testing. Must be registered in DatasetCatalog
+_C.DATASETS.TEST = ()
+# List of the pre-computed proposal files for test, which must be consistent
+# with datasets listed in DATASETS.TEST.
+_C.DATASETS.PROPOSAL_FILES_TEST = ()
+# Number of top scoring precomputed proposals to keep for test
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000
+
+# -----------------------------------------------------------------------------
+# DataLoader
+# -----------------------------------------------------------------------------
+_C.DATALOADER = CN()
+# Number of data loading threads
+_C.DATALOADER.NUM_WORKERS = 4
+# If True, each batch should contain only images for which the aspect ratio
+# is compatible. This groups portrait images together, and landscape images
+# are not batched with portrait images.
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True
+# Options: TrainingSampler, RepeatFactorTrainingSampler
+_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler"
+# Repeat threshold for RepeatFactorTrainingSampler
+_C.DATALOADER.REPEAT_THRESHOLD = 0.0
+# If True, when working on datasets that have instance annotations, the
+# training dataloader will filter out images without associated annotations
+_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True
+
+# ---------------------------------------------------------------------------- #
+# Backbone options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BACKBONE = CN()
+
+_C.MODEL.BACKBONE.NAME = "build_resnet_backbone"
+# Freeze the first several stages so they are not trained.
+# There are 5 stages in ResNet. The first is a convolution, and the following
+# stages are each a group of residual blocks.
+_C.MODEL.BACKBONE.FREEZE_AT = 2
+
+
+# ---------------------------------------------------------------------------- #
+# FPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FPN = CN()
+# Names of the input feature maps to be used by FPN
+# They must have contiguous power of 2 strides
+# e.g., ["res2", "res3", "res4", "res5"]
+_C.MODEL.FPN.IN_FEATURES = []
+_C.MODEL.FPN.OUT_CHANNELS = 256
+
+# Options: "" (no norm), "GN"
+_C.MODEL.FPN.NORM = ""
+
+# Type for fusing the FPN top-down and lateral features. Can be either "sum" or "avg"
+_C.MODEL.FPN.FUSE_TYPE = "sum"
+
+
+# ---------------------------------------------------------------------------- #
+# Proposal generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.PROPOSAL_GENERATOR = CN()
+# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals"
+_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
+# Proposal height and width both need to be greater than MIN_SIZE
+# (at the scale used during training or inference)
+_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0
+
+
+# ---------------------------------------------------------------------------- #
+# Anchor generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ANCHOR_GENERATOR = CN()
+# The generator can be any name in the ANCHOR_GENERATOR registry
+_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
+# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input.
+# Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for
+# IN_FEATURES[i]; len(SIZES) must be equal to len(IN_FEATURES) or 1.
+# When len(SIZES) == 1, SIZES[0] is used for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
+# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect
+# ratios are generated by an anchor generator.
+# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W)
+# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) must equal len(IN_FEATURES) or 1.
+# When len(ASPECT_RATIOS) == 1, ASPECT_RATIOS[0] is used for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
+# Anchor angles.
+# list[list[float]], the angle in degrees, for each input feature map.
+# ANGLES[i] specifies the list of angles for IN_FEATURES[i].
+_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]
+# Relative offset between the center of the first anchor and the top-left corner of the image
+# Value has to be in [0, 1). It is recommended to use 0.5, which means half stride.
+# The value is not expected to affect model accuracy.
+_C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0
+
+# ---------------------------------------------------------------------------- #
+# RPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RPN = CN()
+_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead" # used by RPN_HEAD_REGISTRY
+
+# Names of the input feature maps to be used by RPN
+# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN
+_C.MODEL.RPN.IN_FEATURES = ["res4"]
+# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels
+# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
+_C.MODEL.RPN.BOUNDARY_THRESH = -1
+# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD]
+# Minimum overlap required between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
+# ==> positive RPN example: 1)
+# Maximum overlap allowed between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD
+# ==> negative RPN example: 0)
+# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD)
+# are ignored (-1)
+_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7]
+_C.MODEL.RPN.IOU_LABELS = [0, -1, 1]
+# Number of regions per image used to train RPN
+_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
+# Target fraction of foreground (positive) examples per RPN minibatch
+_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.RPN.BBOX_REG_LOSS_TYPE = "smooth_l1"
+_C.MODEL.RPN.BBOX_REG_LOSS_WEIGHT = 1.0
+# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets
+_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0
+_C.MODEL.RPN.LOSS_WEIGHT = 1.0
+# Number of top scoring RPN proposals to keep before applying NMS
+# When FPN is used, this is *per FPN level* (not total)
+_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000
+_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000
+# Number of top scoring RPN proposals to keep after applying NMS
+# When FPN is used, this limit is applied per level and then again to the union
+# of proposals from all levels
+# NOTE: When FPN is used, the meaning of this config is different from Detectron1.
+# It means per-batch topk in Detectron1, but per-image topk here.
+# See the "find_top_rpn_proposals" function for details.
+_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000
+_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000
+# NMS threshold used on RPN proposals
+_C.MODEL.RPN.NMS_THRESH = 0.7
+# Set this to -1 to use the same number of output channels as input channels.
+_C.MODEL.RPN.CONV_DIMS = [-1]
+
+# ---------------------------------------------------------------------------- #
+# ROI HEADS options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_HEADS = CN()
+_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads"
+# Number of foreground classes
+_C.MODEL.ROI_HEADS.NUM_CLASSES = 80
+# Names of the input feature maps to be used by ROI heads
+# Currently all heads (box, mask, ...) use the same input feature map list
+# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN
+_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"]
+# IOU overlap ratios [IOU_THRESHOLD]
+# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD)
+# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD)
+_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5]
+_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1]
+# RoI minibatch size *per image* (number of regions of interest [ROIs]) during training
+# Total number of RoIs per training minibatch =
+# ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
+# E.g., a common configuration is: 512 * 16 = 8192
+_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+# Only used in test mode
+
+# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+# balance obtaining high recall with not having too many low precision
+# detections that will slow down inference post processing steps (like NMS)
+# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down
+# inference.
+_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5
+# If True, augment proposals with ground-truth boxes before sampling proposals to
+# train ROI heads.
+_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True
+
+# ---------------------------------------------------------------------------- #
+# Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_HEAD = CN()
+# C4 models don't use the head name option
+# Options for non-C4 models: FastRCNNConvFCHead,
+_C.MODEL.ROI_BOX_HEAD.NAME = ""
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "smooth_l1"
+# The final scaling coefficient on the box regression loss, used to balance the magnitude of its
+# gradients with other losses in the model. See also `MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT`.
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT = 1.0
+# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+# These are empirically chosen to approximately lead to unit variance targets
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0
+_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0
+# Hidden layer dimension for FC layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024
+_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0
+# Channel dimension for Conv layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_BOX_HEAD.NORM = ""
+# Whether to use class-agnostic bbox regression
+_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False
+# If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes.
+_C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False
+
+# Federated loss can be used to improve training on LVIS
+_C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False
+# Sigmoid cross-entropy is used with federated loss
+_C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False
+# The power value applied to image_count when calculating the frequency weight
+_C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER = 0.5
+# Number of classes to keep in total
+_C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES = 50
+
+# ---------------------------------------------------------------------------- #
+# Cascaded Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_CASCADE_HEAD = CN()
+# The number of cascade stages is implicitly defined by the length of the following two configs.
+_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
+ (10.0, 10.0, 5.0, 5.0),
+ (20.0, 20.0, 10.0, 10.0),
+ (30.0, 30.0, 15.0, 15.0),
+)
+_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7)
+
+
+# ---------------------------------------------------------------------------- #
+# Mask Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_MASK_HEAD = CN()
+_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead"
+_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0 # The number of convs in the mask head
+_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_MASK_HEAD.NORM = ""
+# Whether to use class-agnostic mask prediction
+_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+
+# ---------------------------------------------------------------------------- #
+# Keypoint Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_KEYPOINT_HEAD = CN()
+_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead"
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8))
+_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17 # 17 is the number of keypoints in COCO.
+
+# Images with too few (or no) keypoints are excluded from training.
+_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1
+# Normalize by the total number of visible keypoints in the minibatch if True.
+# Otherwise, normalize by the total number of keypoints that could ever exist
+# in the minibatch.
+# The keypoint softmax loss is only calculated on visible keypoints.
+# Since the number of visible keypoints can vary significantly between
+# minibatches, this has the effect of up-weighting the importance of
+# minibatches with few visible keypoints. (Imagine the extreme case of
+# only one visible keypoint versus N: in the case of N, each one
+# contributes 1/N to the gradient compared to the single keypoint
+# determining the gradient direction). Instead, we can normalize the
+# loss by the total number of keypoints, if it were the case that all
+# keypoints were visible in a full minibatch. (Returning to the example,
+# this means that the one visible keypoint contributes as much as each
+# of the N keypoints.)
+_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True
+# Multi-task loss weight to use for keypoints
+# Recommended values:
+# - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True
+# - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False
+_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+# ---------------------------------------------------------------------------- #
+# Semantic Segmentation Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SEM_SEG_HEAD = CN()
+_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead"
+_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"]
+# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for
+# the corresponding pixel.
+_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255
+# Number of classes in the semantic segmentation head
+_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54
+# Number of channels in the 3x3 convs inside semantic-FPN heads.
+_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128
+# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride.
+_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4
+# Normalization method for the convolution layers. Options: "" (no norm), "GN".
+_C.MODEL.SEM_SEG_HEAD.NORM = "GN"
+_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0
+
+_C.MODEL.PANOPTIC_FPN = CN()
+# Scaling of all losses from instance detection / segmentation head.
+_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0
+
+# options when combining instance & semantic segmentation outputs
+_C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True}) # "COMBINE.ENABLED" is deprecated & not used
+_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5
+_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096
+_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5
+
+
+# ---------------------------------------------------------------------------- #
+# RetinaNet Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RETINANET = CN()
+
+# This is the number of foreground classes.
+_C.MODEL.RETINANET.NUM_CLASSES = 80
+
+_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
+
+# Convolutions to use in the cls and bbox tower
+# NOTE: this doesn't include the last conv for logits
+_C.MODEL.RETINANET.NUM_CONVS = 4
+
+# IoU overlap ratio [bg, fg] for labeling anchors.
+# Anchors with < bg are labeled negative (0)
+# Anchors with >= bg and < fg are ignored (-1)
+# Anchors with >= fg are labeled positive (1)
+_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
+_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]
+
+# Prior prob for rare case (i.e. foreground) at the beginning of training.
+# This is used to set the bias for the logits layer of the classifier subnet.
+# This improves training stability in the case of heavy class imbalance.
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01
+
+# Inference cls score threshold: only anchors with a score above this threshold are
+# considered for inference (to improve speed)
+_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
+# Select topk candidates before NMS
+_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
+_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
+
+# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
+_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+
+# Loss parameters
+_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
+_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
+_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1"
+
+# One of BN, SyncBN, FrozenBN, GN
+# Only supports GN until unshared norm is implemented
+_C.MODEL.RETINANET.NORM = ""
+
+
+# ---------------------------------------------------------------------------- #
+# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
+# Note that parts of a resnet may be used for both the backbone and the head
+# These options apply to both
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RESNETS = CN()
+
+_C.MODEL.RESNETS.DEPTH = 50
+_C.MODEL.RESNETS.OUT_FEATURES = ["res4"] # res4 for C4 backbone, res2..5 for FPN backbone
+
+# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+_C.MODEL.RESNETS.NUM_GROUPS = 1
+
+# Options: "FrozenBN", "GN", "SyncBN", "BN"
+_C.MODEL.RESNETS.NORM = "FrozenBN"
+
+# Baseline width of each group.
+# Scaling this parameter will scale the width of all bottleneck layers.
+_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+# Place the stride 2 conv on the 1x1 filter
+# Use True only for the original MSRA ResNet; use False for C2 and Torch models
+_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+# Apply dilation in stage "res5"
+_C.MODEL.RESNETS.RES5_DILATION = 1
+
+# Output width of res2. Scaling this parameter will scale the width of all 1x1 convs in ResNet
+# For R18 and R34, this needs to be set to 64
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+# Apply Deformable Convolution in stages
+# Specify whether to apply deform_conv on Res2, Res3, Res4, Res5
+_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]
+# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168);
+# Use False for DeformableV1.
+_C.MODEL.RESNETS.DEFORM_MODULATED = False
+# Number of groups in deformable conv.
+_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1
+
+
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CN()
+
+# Options: WarmupMultiStepLR, WarmupCosineLR.
+# See detectron2/solver/build.py for definition.
+_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
+
+_C.SOLVER.MAX_ITER = 40000
+
+_C.SOLVER.BASE_LR = 0.001
+# The end lr, only used by WarmupCosineLR
+_C.SOLVER.BASE_LR_END = 0.0
+
+_C.SOLVER.MOMENTUM = 0.9
+
+_C.SOLVER.NESTEROV = False
+
+_C.SOLVER.WEIGHT_DECAY = 0.0001
+# The weight decay that's applied to parameters of normalization layers
+# (typically the affine transformation)
+_C.SOLVER.WEIGHT_DECAY_NORM = 0.0
+
+_C.SOLVER.GAMMA = 0.1
+# The iteration milestones at which to decrease the learning rate by GAMMA.
+_C.SOLVER.STEPS = (30000,)
+# Number of decays in WarmupStepWithFixedGammaLR schedule
+_C.SOLVER.NUM_DECAYS = 3
+
+_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000
+_C.SOLVER.WARMUP_ITERS = 1000
+_C.SOLVER.WARMUP_METHOD = "linear"
+# Whether to rescale the interval for the learning schedule after warmup
+_C.SOLVER.RESCALE_INTERVAL = False
+
+# Save a checkpoint every this many iterations
+_C.SOLVER.CHECKPOINT_PERIOD = 5000
+
+# Number of images per batch across all machines. This is also the number
+# of training images per step (i.e. per iteration). If we use 16 GPUs
+# and IMS_PER_BATCH = 32, each GPU will see 2 images per batch.
+# May be adjusted automatically if REFERENCE_WORLD_SIZE is set.
+_C.SOLVER.IMS_PER_BATCH = 16
+
+# The reference number of workers (GPUs) this config is meant to train with.
+# It takes no effect when set to 0.
+# With a non-zero value, it will be used by DefaultTrainer to compute a desired
+# per-worker batch size, and then scale the other related configs (total batch size,
+# learning rate, etc) to match the per-worker batch size.
+# See documentation of `DefaultTrainer.auto_scale_workers` for details.
+_C.SOLVER.REFERENCE_WORLD_SIZE = 0
+
+# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for
+# biases. This is not useful (at least for recent models). You should avoid
+# changing these and they exist only to reproduce Detectron v1 training if
+# desired.
+_C.SOLVER.BIAS_LR_FACTOR = 1.0
+_C.SOLVER.WEIGHT_DECAY_BIAS = None # None means following WEIGHT_DECAY
+
+# Gradient clipping
+_C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False})
+# Type of gradient clipping, currently 2 values are supported:
+# - "value": the absolute values of elements of each gradients are clipped
+# - "norm": the norm of the gradient for each parameter is clipped thus
+# affecting all elements in the parameter
+_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value"
+# Maximum absolute value used for clipping gradients
+_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
+# Floating point number p for L-p norm to be used with the "norm"
+# gradient clipping type; for L-inf, please specify .inf
+_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+
+# Enable automatic mixed precision for training
+# Note that this does not change the model's inference behavior.
+# To use AMP in inference, run inference under autocast()
+_C.SOLVER.AMP = CN({"ENABLED": False})
+
+# ---------------------------------------------------------------------------- #
+# Specific test options
+# ---------------------------------------------------------------------------- #
+_C.TEST = CN()
+# For end-to-end tests to verify the expected accuracy.
+# Each item is [task, metric, value, tolerance]
+# e.g.: [['bbox', 'AP', 38.5, 0.2]]
+_C.TEST.EXPECTED_RESULTS = []
+# The period (in terms of steps) to evaluate the model during training.
+# Set to 0 to disable.
+_C.TEST.EVAL_PERIOD = 0
+# The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval
+# When empty, it will use the defaults in COCO.
+# Otherwise it should be a list[float] with the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
+_C.TEST.KEYPOINT_OKS_SIGMAS = []
+# Maximum number of detections to return per image during inference (100 is
+# based on the limit established for the COCO dataset).
+_C.TEST.DETECTIONS_PER_IMAGE = 100
+
+_C.TEST.AUG = CN({"ENABLED": False})
+_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
+_C.TEST.AUG.MAX_SIZE = 4000
+_C.TEST.AUG.FLIP = True
+
+_C.TEST.PRECISE_BN = CN({"ENABLED": False})
+_C.TEST.PRECISE_BN.NUM_ITER = 200
+
+# ---------------------------------------------------------------------------- #
+# Misc options
+# ---------------------------------------------------------------------------- #
+# Directory where output files are written
+_C.OUTPUT_DIR = "./output"
+# Set seed to negative to fully randomize everything.
+# Set seed to positive to use a fixed seed. Note that a fixed seed increases
+# reproducibility but does not guarantee fully deterministic behavior.
+# Disabling all parallelism further increases reproducibility.
+_C.SEED = -1
+# Benchmark different cudnn algorithms.
+# If input images have very different sizes, this option will have large overhead
+# for about 10k iterations. It usually hurts total time, but can benefit for certain models.
+# If input images have the same or similar sizes, benchmark is often helpful.
+_C.CUDNN_BENCHMARK = False
+# The period (in terms of steps) for minibatch visualization at train time.
+# Set to 0 to disable.
+_C.VIS_PERIOD = 0
+
+# global config is for quick hack purposes.
+# You can set them in command line or config files,
+# and access it with:
+#
+# from custom_detectron2.config import global_cfg
+# print(global_cfg.HACK)
+#
+# Do not commit any configs into it.
+_C.GLOBAL = CN()
+_C.GLOBAL.HACK = 1.0
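+
+
+# Usage sketch (illustrative): these defaults are normally consumed through get_cfg()
+# and then overridden from a yaml file or a flat list of key/value pairs, e.g.
+#
+#   from custom_detectron2.config import get_cfg
+#   cfg = get_cfg()
+#   cfg.merge_from_list(["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.7,
+#                        "SOLVER.IMS_PER_BATCH", 8])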
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/config/instantiate.py b/comfyui_controlnet_aux/src/custom_detectron2/config/instantiate.py
new file mode 100644
index 0000000000000000000000000000000000000000..1982849052b0fa61c8d906d4c2947268f564660e
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/config/instantiate.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import collections.abc as abc
+import dataclasses
+import logging
+from typing import Any
+
+from custom_detectron2.utils.registry import _convert_target_to_string, locate
+
+__all__ = ["dump_dataclass", "instantiate"]
+
+
+def dump_dataclass(obj: Any):
+ """
+ Dump a dataclass recursively into a dict that can be later instantiated.
+
+ Args:
+ obj: a dataclass object
+
+ Returns:
+ dict
+ """
+ assert dataclasses.is_dataclass(obj) and not isinstance(
+ obj, type
+ ), "dump_dataclass() requires an instance of a dataclass."
+ ret = {"_target_": _convert_target_to_string(type(obj))}
+ for f in dataclasses.fields(obj):
+ v = getattr(obj, f.name)
+ if dataclasses.is_dataclass(v):
+ v = dump_dataclass(v)
+ if isinstance(v, (list, tuple)):
+ v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v]
+ ret[f.name] = v
+ return ret
+
+
+def instantiate(cfg):
+ """
+ Recursively instantiate objects defined in dictionaries by
+ "_target_" and arguments.
+
+ Args:
+ cfg: a dict-like object with "_target_" that defines the caller, and
+ other keys that define the arguments
+
+ Returns:
+ object instantiated by cfg
+ """
+ from omegaconf import ListConfig, DictConfig, OmegaConf
+
+ if isinstance(cfg, ListConfig):
+ lst = [instantiate(x) for x in cfg]
+ return ListConfig(lst, flags={"allow_objects": True})
+ if isinstance(cfg, list):
+ # Specialize for list, because many classes take
+ # list[objects] as arguments, such as ResNet, DatasetMapper
+ return [instantiate(x) for x in cfg]
+
+ # If input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config),
+ # instantiate it to the actual dataclass.
+ if isinstance(cfg, DictConfig) and dataclasses.is_dataclass(cfg._metadata.object_type):
+ return OmegaConf.to_object(cfg)
+
+ if isinstance(cfg, abc.Mapping) and "_target_" in cfg:
+ # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all,
+ # but faster: https://github.com/facebookresearch/hydra/issues/1200
+ cfg = {k: instantiate(v) for k, v in cfg.items()}
+ cls = cfg.pop("_target_")
+ cls = instantiate(cls)
+
+ if isinstance(cls, str):
+ cls_name = cls
+ cls = locate(cls_name)
+ assert cls is not None, cls_name
+ else:
+ try:
+ cls_name = cls.__module__ + "." + cls.__qualname__
+ except Exception:
+ # target could be anything, so the above could fail
+ cls_name = str(cls)
+ assert callable(cls), f"_target_ {cls} does not define a callable object"
+ try:
+ return cls(**cfg)
+ except TypeError:
+ logger = logging.getLogger(__name__)
+ logger.error(f"Error when instantiating {cls_name}!")
+ raise
+    return cfg  # return as-is if we don't know what to do
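+
+
+# Usage sketch (illustrative): any mapping with a "_target_" key can be built, e.g.
+#
+#   conv_cfg = {"_target_": "torch.nn.Conv2d",
+#               "in_channels": 3, "out_channels": 16, "kernel_size": 3}
+#   conv = instantiate(conv_cfg)  # equivalent to torch.nn.Conv2d(3, 16, kernel_size=3)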
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/config/lazy.py b/comfyui_controlnet_aux/src/custom_detectron2/config/lazy.py
new file mode 100644
index 0000000000000000000000000000000000000000..56a7937bb69d9b5c8295c61b1f758781bbe02b67
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/config/lazy.py
@@ -0,0 +1,435 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import ast
+import builtins
+import collections.abc as abc
+import importlib
+import inspect
+import logging
+import os
+import uuid
+from contextlib import contextmanager
+from copy import deepcopy
+from dataclasses import is_dataclass
+from typing import List, Tuple, Union
+import yaml
+from omegaconf import DictConfig, ListConfig, OmegaConf, SCMode
+
+from custom_detectron2.utils.file_io import PathManager
+from custom_detectron2.utils.registry import _convert_target_to_string
+
+__all__ = ["LazyCall", "LazyConfig"]
+
+
+class LazyCall:
+ """
+ Wrap a callable so that when it's called, the call will not be executed,
+ but returns a dict that describes the call.
+
+ LazyCall object has to be called with only keyword arguments. Positional
+ arguments are not yet supported.
+
+ Examples:
+ ::
+ from custom_detectron2.config import instantiate, LazyCall
+
+ layer_cfg = LazyCall(nn.Conv2d)(in_channels=32, out_channels=32)
+ layer_cfg.out_channels = 64 # can edit it afterwards
+ layer = instantiate(layer_cfg)
+ """
+
+ def __init__(self, target):
+ if not (callable(target) or isinstance(target, (str, abc.Mapping))):
+ raise TypeError(
+ f"target of LazyCall must be a callable or defines a callable! Got {target}"
+ )
+ self._target = target
+
+ def __call__(self, **kwargs):
+ if is_dataclass(self._target):
+ # omegaconf object cannot hold dataclass type
+ # https://github.com/omry/omegaconf/issues/784
+ target = _convert_target_to_string(self._target)
+ else:
+ target = self._target
+ kwargs["_target_"] = target
+
+ return DictConfig(content=kwargs, flags={"allow_objects": True})
+
+
+def _visit_dict_config(cfg, func):
+ """
+ Apply func recursively to all DictConfig in cfg.
+ """
+ if isinstance(cfg, DictConfig):
+ func(cfg)
+ for v in cfg.values():
+ _visit_dict_config(v, func)
+ elif isinstance(cfg, ListConfig):
+ for v in cfg:
+ _visit_dict_config(v, func)
+
+
+def _validate_py_syntax(filename):
+ # see also https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
+ with PathManager.open(filename, "r") as f:
+ content = f.read()
+ try:
+ ast.parse(content)
+ except SyntaxError as e:
+ raise SyntaxError(f"Config file {filename} has syntax error!") from e
+
+
+def _cast_to_config(obj):
+ # if given a dict, return DictConfig instead
+ if isinstance(obj, dict):
+ return DictConfig(obj, flags={"allow_objects": True})
+ return obj
+
+
+_CFG_PACKAGE_NAME = "detectron2._cfg_loader"
+"""
+A namespace to put all imported config into.
+"""
+
+
+def _random_package_name(filename):
+ # generate a random package name when loading config files
+ return _CFG_PACKAGE_NAME + str(uuid.uuid4())[:4] + "." + os.path.basename(filename)
+
+
+@contextmanager
+def _patch_import():
+ """
+ Enhance relative import statements in config files, so that they:
+ 1. locate files purely based on relative location, regardless of packages.
+ e.g. you can import file without having __init__
+    2. do not cache modules globally; modifications of module state have no side effect
+    3. support other storage systems through PathManager, so config files can be in the cloud
+    4. imported dicts are turned into omegaconf.DictConfig automatically
+ """
+ old_import = builtins.__import__
+
+ def find_relative_file(original_file, relative_import_path, level):
+ # NOTE: "from . import x" is not handled. Because then it's unclear
+ # if such import should produce `x` as a python module or DictConfig.
+ # This can be discussed further if needed.
+ relative_import_err = """
+Relative import of directories is not allowed within config files.
+Within a config file, relative import can only import other config files.
+""".replace(
+ "\n", " "
+ )
+ if not len(relative_import_path):
+ raise ImportError(relative_import_err)
+
+ cur_file = os.path.dirname(original_file)
+ for _ in range(level - 1):
+ cur_file = os.path.dirname(cur_file)
+ cur_name = relative_import_path.lstrip(".")
+ for part in cur_name.split("."):
+ cur_file = os.path.join(cur_file, part)
+ if not cur_file.endswith(".py"):
+ cur_file += ".py"
+ if not PathManager.isfile(cur_file):
+ cur_file_no_suffix = cur_file[: -len(".py")]
+ if PathManager.isdir(cur_file_no_suffix):
+ raise ImportError(f"Cannot import from {cur_file_no_suffix}." + relative_import_err)
+ else:
+ raise ImportError(
+ f"Cannot import name {relative_import_path} from "
+ f"{original_file}: {cur_file} does not exist."
+ )
+ return cur_file
+
+ def new_import(name, globals=None, locals=None, fromlist=(), level=0):
+ if (
+ # Only deal with relative imports inside config files
+ level != 0
+ and globals is not None
+ and (globals.get("__package__", "") or "").startswith(_CFG_PACKAGE_NAME)
+ ):
+ cur_file = find_relative_file(globals["__file__"], name, level)
+ _validate_py_syntax(cur_file)
+ spec = importlib.machinery.ModuleSpec(
+ _random_package_name(cur_file), None, origin=cur_file
+ )
+ module = importlib.util.module_from_spec(spec)
+ module.__file__ = cur_file
+ with PathManager.open(cur_file) as f:
+ content = f.read()
+ exec(compile(content, cur_file, "exec"), module.__dict__)
+ for name in fromlist: # turn imported dict into DictConfig automatically
+ val = _cast_to_config(module.__dict__[name])
+ module.__dict__[name] = val
+ return module
+ return old_import(name, globals, locals, fromlist=fromlist, level=level)
+
+ builtins.__import__ = new_import
+ yield new_import
+ builtins.__import__ = old_import
+
+
+class LazyConfig:
+ """
+    Provide methods to save, load, and override an omegaconf config object
+ which may contain definition of lazily-constructed objects.
+ """
+
+ @staticmethod
+ def load_rel(filename: str, keys: Union[None, str, Tuple[str, ...]] = None):
+ """
+ Similar to :meth:`load()`, but load path relative to the caller's
+ source file.
+
+ This has the same functionality as a relative import, except that this method
+ accepts filename as a string, so more characters are allowed in the filename.
+ """
+ caller_frame = inspect.stack()[1]
+ caller_fname = caller_frame[0].f_code.co_filename
+        assert caller_fname != "<string>", "load_rel Unable to find caller"
+ caller_dir = os.path.dirname(caller_fname)
+ filename = os.path.join(caller_dir, filename)
+ return LazyConfig.load(filename, keys)
+
+ @staticmethod
+ def load(filename: str, keys: Union[None, str, Tuple[str, ...]] = None):
+ """
+ Load a config file.
+
+ Args:
+ filename: absolute path or relative path w.r.t. the current working directory
+ keys: keys to load and return. If not given, return all keys
+ (whose values are config objects) in a dict.
+ """
+ has_keys = keys is not None
+ filename = filename.replace("/./", "/") # redundant
+ if os.path.splitext(filename)[1] not in [".py", ".yaml", ".yml"]:
+ raise ValueError(f"Config file {filename} has to be a python or yaml file.")
+ if filename.endswith(".py"):
+ _validate_py_syntax(filename)
+
+ with _patch_import():
+ # Record the filename
+ module_namespace = {
+ "__file__": filename,
+ "__package__": _random_package_name(filename),
+ }
+ with PathManager.open(filename) as f:
+ content = f.read()
+ # Compile first with filename to:
+                # 1. make filename appear in stacktrace
+ # 2. make load_rel able to find its parent's (possibly remote) location
+ exec(compile(content, filename, "exec"), module_namespace)
+
+ ret = module_namespace
+ else:
+ with PathManager.open(filename) as f:
+ obj = yaml.unsafe_load(f)
+ ret = OmegaConf.create(obj, flags={"allow_objects": True})
+
+ if has_keys:
+ if isinstance(keys, str):
+ return _cast_to_config(ret[keys])
+ else:
+ return tuple(_cast_to_config(ret[a]) for a in keys)
+ else:
+ if filename.endswith(".py"):
+ # when not specified, only load those that are config objects
+ ret = DictConfig(
+ {
+ name: _cast_to_config(value)
+ for name, value in ret.items()
+ if isinstance(value, (DictConfig, ListConfig, dict))
+ and not name.startswith("_")
+ },
+ flags={"allow_objects": True},
+ )
+ return ret
+
+ @staticmethod
+ def save(cfg, filename: str):
+ """
+ Save a config object to a yaml file.
+ Note that when the config dictionary contains complex objects (e.g. lambda),
+ it can't be saved to yaml. In that case we will print an error and
+ attempt to save to a pkl file instead.
+
+ Args:
+ cfg: an omegaconf config object
+ filename: yaml file name to save the config file
+ """
+ logger = logging.getLogger(__name__)
+ try:
+ cfg = deepcopy(cfg)
+ except Exception:
+ pass
+ else:
+ # if it's deep-copyable, then...
+ def _replace_type_by_name(x):
+ if "_target_" in x and callable(x._target_):
+ try:
+ x._target_ = _convert_target_to_string(x._target_)
+ except AttributeError:
+ pass
+
+            # not necessary, but makes yaml look nicer
+ _visit_dict_config(cfg, _replace_type_by_name)
+
+ save_pkl = False
+ try:
+ dict = OmegaConf.to_container(
+ cfg,
+ # Do not resolve interpolation when saving, i.e. do not turn ${a} into
+ # actual values when saving.
+ resolve=False,
+ # Save structures (dataclasses) in a format that can be instantiated later.
+ # Without this option, the type information of the dataclass will be erased.
+ structured_config_mode=SCMode.INSTANTIATE,
+ )
+ dumped = yaml.dump(dict, default_flow_style=None, allow_unicode=True, width=9999)
+ with PathManager.open(filename, "w") as f:
+ f.write(dumped)
+
+ try:
+ _ = yaml.unsafe_load(dumped) # test that it is loadable
+ except Exception:
+ logger.warning(
+ "The config contains objects that cannot serialize to a valid yaml. "
+ f"{filename} is human-readable but cannot be loaded."
+ )
+ save_pkl = True
+ except Exception:
+ logger.exception("Unable to serialize the config to yaml. Error:")
+ save_pkl = True
+
+ if save_pkl:
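+            # NOTE: the cloudpickle fallback below is commented out in this vendored
+            # copy, so no .pkl file is actually written; the filename is kept for
+            # reference only.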
+ new_filename = filename + ".pkl"
+ # try:
+ # # retry by pickle
+ # with PathManager.open(new_filename, "wb") as f:
+ # cloudpickle.dump(cfg, f)
+ # logger.warning(f"Config is saved using cloudpickle at {new_filename}.")
+ # except Exception:
+ # pass
+
+ @staticmethod
+ def apply_overrides(cfg, overrides: List[str]):
+ """
+ In-place override contents of cfg.
+
+ Args:
+ cfg: an omegaconf config object
+ overrides: list of strings in the format of "a=b" to override configs.
+ See https://hydra.cc/docs/next/advanced/override_grammar/basic/
+ for syntax.
+
+ Returns:
+ the cfg object
+ """
+
+ def safe_update(cfg, key, value):
+ parts = key.split(".")
+ for idx in range(1, len(parts)):
+ prefix = ".".join(parts[:idx])
+ v = OmegaConf.select(cfg, prefix, default=None)
+ if v is None:
+ break
+ if not OmegaConf.is_config(v):
+ raise KeyError(
+ f"Trying to update key {key}, but {prefix} "
+ f"is not a config, but has type {type(v)}."
+ )
+ OmegaConf.update(cfg, key, value, merge=True)
+
+ try:
+ from hydra.core.override_parser.overrides_parser import OverridesParser
+
+ has_hydra = True
+ except ImportError:
+ has_hydra = False
+
+ if has_hydra:
+ parser = OverridesParser.create()
+ overrides = parser.parse_overrides(overrides)
+ for o in overrides:
+ key = o.key_or_group
+ value = o.value()
+ if o.is_delete():
+ # TODO support this
+ raise NotImplementedError("deletion is not yet a supported override")
+ safe_update(cfg, key, value)
+ else:
+ # Fallback. Does not support all the features and error checking like hydra.
+ for o in overrides:
+ key, value = o.split("=")
+ try:
+ value = eval(value, {})
+ except NameError:
+ pass
+ safe_update(cfg, key, value)
+ return cfg
+
+ # @staticmethod
+ # def to_py(cfg, prefix: str = "cfg."):
+ # """
+    # Try to convert a config object into Python-like pseudo code.
+ #
+ # Note that perfect conversion is not always possible. So the returned
+ # results are mainly meant to be human-readable, and not meant to be executed.
+ #
+ # Args:
+ # cfg: an omegaconf config object
+ # prefix: root name for the resulting code (default: "cfg.")
+ #
+ #
+ # Returns:
+ # str of formatted Python code
+ # """
+ # import black
+ #
+ # cfg = OmegaConf.to_container(cfg, resolve=True)
+ #
+ # def _to_str(obj, prefix=None, inside_call=False):
+ # if prefix is None:
+ # prefix = []
+ # if isinstance(obj, abc.Mapping) and "_target_" in obj:
+ # # Dict representing a function call
+ # target = _convert_target_to_string(obj.pop("_target_"))
+ # args = []
+ # for k, v in sorted(obj.items()):
+ # args.append(f"{k}={_to_str(v, inside_call=True)}")
+ # args = ", ".join(args)
+ # call = f"{target}({args})"
+ # return "".join(prefix) + call
+ # elif isinstance(obj, abc.Mapping) and not inside_call:
+ # # Dict that is not inside a call is a list of top-level config objects that we
+ # # render as one object per line with dot separated prefixes
+ # key_list = []
+ # for k, v in sorted(obj.items()):
+ # if isinstance(v, abc.Mapping) and "_target_" not in v:
+ # key_list.append(_to_str(v, prefix=prefix + [k + "."]))
+ # else:
+ # key = "".join(prefix) + k
+ # key_list.append(f"{key}={_to_str(v)}")
+ # return "\n".join(key_list)
+ # elif isinstance(obj, abc.Mapping):
+ # # Dict that is inside a call is rendered as a regular dict
+ # return (
+ # "{"
+ # + ",".join(
+ # f"{repr(k)}: {_to_str(v, inside_call=inside_call)}"
+ # for k, v in sorted(obj.items())
+ # )
+ # + "}"
+ # )
+ # elif isinstance(obj, list):
+ # return "[" + ",".join(_to_str(x, inside_call=inside_call) for x in obj) + "]"
+ # else:
+ # return repr(obj)
+ #
+ # py_str = _to_str(cfg, prefix=[prefix])
+ # try:
+ # return black.format_str(py_str, mode=black.Mode())
+ # except black.InvalidInput:
+ # return py_str
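+
+
+# Usage sketch (illustrative; "my_config.py" and its "model" key are assumptions):
+#
+#   cfg = LazyConfig.load("my_config.py")
+#   cfg = LazyConfig.apply_overrides(cfg, ["model.out_channels=64"])
+#   # objects described via LazyCall can then be built with
+#   # custom_detectron2.config.instantiate(cfg.model)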
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6e59c84f3d5f73c9b1680fc72554730e08f2b04
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from . import transforms # isort:skip
+
+from .build import (
+ build_batch_data_loader,
+ build_detection_test_loader,
+ build_detection_train_loader,
+ get_detection_dataset_dicts,
+ load_proposals_into_dataset,
+ print_instances_class_histogram,
+)
+from .catalog import DatasetCatalog, MetadataCatalog, Metadata
+from .common import DatasetFromList, MapDataset, ToIterableDataset
+from .dataset_mapper import DatasetMapper
+
+# ensure the builtin datasets are registered
+from . import datasets, samplers # isort:skip
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/benchmark.py b/comfyui_controlnet_aux/src/custom_detectron2/data/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..737008498a41e4416e0a56202eafb4905d77f54c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/benchmark.py
@@ -0,0 +1,225 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+from itertools import count
+from typing import List, Tuple
+import torch
+import tqdm
+from fvcore.common.timer import Timer
+
+from custom_detectron2.utils import comm
+
+from .build import build_batch_data_loader
+from .common import DatasetFromList, MapDataset
+from .samplers import TrainingSampler
+
+logger = logging.getLogger(__name__)
+
+
+class _EmptyMapDataset(torch.utils.data.Dataset):
+ """
+ Map anything to emptiness.
+ """
+
+ def __init__(self, dataset):
+ self.ds = dataset
+
+ def __len__(self):
+ return len(self.ds)
+
+ def __getitem__(self, idx):
+ _ = self.ds[idx]
+ return [0]
+
+
+def iter_benchmark(
+ iterator, num_iter: int, warmup: int = 5, max_time_seconds: float = 60
+) -> Tuple[float, List[float]]:
+ """
+ Benchmark an iterator/iterable for `num_iter` iterations with an extra
+ `warmup` iterations of warmup.
+ End early if `max_time_seconds` time is spent on iterations.
+
+ Returns:
+ float: average time (seconds) per iteration
+ list[float]: time spent on each iteration. Sometimes useful for further analysis.
+ """
+ num_iter, warmup = int(num_iter), int(warmup)
+
+ iterator = iter(iterator)
+ for _ in range(warmup):
+ next(iterator)
+ timer = Timer()
+ all_times = []
+ for curr_iter in tqdm.trange(num_iter):
+ start = timer.seconds()
+ if start > max_time_seconds:
+ num_iter = curr_iter
+ break
+ next(iterator)
+ all_times.append(timer.seconds() - start)
+ avg = timer.seconds() / num_iter
+ return avg, all_times
+
+
+class DataLoaderBenchmark:
+ """
+    Some common benchmarks that help understand the performance bottlenecks of a
+    standard dataloader made of a dataset, a mapper and a sampler.
+ """
+
+ def __init__(
+ self,
+ dataset,
+ *,
+ mapper,
+ sampler=None,
+ total_batch_size,
+ num_workers=0,
+ max_time_seconds: int = 90,
+ ):
+ """
+ Args:
+            max_time_seconds (int): maximum time to spend on each benchmark
+ other args: same as in `build.py:build_detection_train_loader`
+ """
+ if isinstance(dataset, list):
+ dataset = DatasetFromList(dataset, copy=False, serialize=True)
+ if sampler is None:
+ sampler = TrainingSampler(len(dataset))
+
+ self.dataset = dataset
+ self.mapper = mapper
+ self.sampler = sampler
+ self.total_batch_size = total_batch_size
+ self.num_workers = num_workers
+ self.per_gpu_batch_size = self.total_batch_size // comm.get_world_size()
+
+ self.max_time_seconds = max_time_seconds
+
+ def _benchmark(self, iterator, num_iter, warmup, msg=None):
+ avg, all_times = iter_benchmark(iterator, num_iter, warmup, self.max_time_seconds)
+ if msg is not None:
+ self._log_time(msg, avg, all_times)
+ return avg, all_times
+
+ def _log_time(self, msg, avg, all_times, distributed=False):
+ percentiles = [np.percentile(all_times, k, interpolation="nearest") for k in [1, 5, 95, 99]]
+ if not distributed:
+ logger.info(
+ f"{msg}: avg={1.0/avg:.1f} it/s, "
+ f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, "
+ f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s."
+ )
+ return
+ avg_per_gpu = comm.all_gather(avg)
+ percentiles_per_gpu = comm.all_gather(percentiles)
+ if comm.get_rank() > 0:
+ return
+ for idx, avg, percentiles in zip(count(), avg_per_gpu, percentiles_per_gpu):
+ logger.info(
+ f"GPU{idx} {msg}: avg={1.0/avg:.1f} it/s, "
+ f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, "
+ f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s."
+ )
+
+ def benchmark_dataset(self, num_iter, warmup=5):
+ """
+ Benchmark the speed of taking raw samples from the dataset.
+ """
+
+ def loader():
+ while True:
+ for k in self.sampler:
+ yield self.dataset[k]
+
+ self._benchmark(loader(), num_iter, warmup, "Dataset Alone")
+
+ def benchmark_mapper(self, num_iter, warmup=5):
+ """
+        Benchmark the speed of taking raw samples from the dataset and mapping
+        them in a single process.
+ """
+
+ def loader():
+ while True:
+ for k in self.sampler:
+ yield self.mapper(self.dataset[k])
+
+ self._benchmark(loader(), num_iter, warmup, "Single Process Mapper (sec/sample)")
+
+ def benchmark_workers(self, num_iter, warmup=10):
+ """
+ Benchmark the dataloader by tuning num_workers to [0, 1, self.num_workers].
+ """
+ candidates = [0, 1]
+ if self.num_workers not in candidates:
+ candidates.append(self.num_workers)
+
+ dataset = MapDataset(self.dataset, self.mapper)
+ for n in candidates:
+ loader = build_batch_data_loader(
+ dataset,
+ self.sampler,
+ self.total_batch_size,
+ num_workers=n,
+ )
+ self._benchmark(
+ iter(loader),
+ num_iter * max(n, 1),
+ warmup * max(n, 1),
+ f"DataLoader ({n} workers, bs={self.per_gpu_batch_size})",
+ )
+ del loader
+
+ def benchmark_IPC(self, num_iter, warmup=10):
+ """
+ Benchmark the dataloader where each worker outputs nothing. This
+ eliminates the IPC overhead compared to the regular dataloader.
+
+ PyTorch multiprocessing's IPC only optimizes for torch tensors.
+        Large numpy arrays or other data structures may incur large IPC overhead.
+ """
+ n = self.num_workers
+ dataset = _EmptyMapDataset(MapDataset(self.dataset, self.mapper))
+ loader = build_batch_data_loader(
+ dataset, self.sampler, self.total_batch_size, num_workers=n
+ )
+ self._benchmark(
+ iter(loader),
+ num_iter * max(n, 1),
+ warmup * max(n, 1),
+ f"DataLoader ({n} workers, bs={self.per_gpu_batch_size}) w/o comm",
+ )
+
+ def benchmark_distributed(self, num_iter, warmup=10):
+ """
+ Benchmark the dataloader in each distributed worker, and log results of
+ all workers. This helps understand the final performance as well as
+ the variances among workers.
+
+ It also prints startup time (first iter) of the dataloader.
+ """
+ gpu = comm.get_world_size()
+ dataset = MapDataset(self.dataset, self.mapper)
+ n = self.num_workers
+ loader = build_batch_data_loader(
+ dataset, self.sampler, self.total_batch_size, num_workers=n
+ )
+
+ timer = Timer()
+ loader = iter(loader)
+ next(loader)
+ startup_time = timer.seconds()
+ logger.info("Dataloader startup time: {:.2f} seconds".format(startup_time))
+
+ comm.synchronize()
+
+ avg, all_times = self._benchmark(loader, num_iter * max(n, 1), warmup * max(n, 1))
+ del loader
+ self._log_time(
+ f"DataLoader ({gpu} GPUs x {n} workers, total bs={self.total_batch_size})",
+ avg,
+ all_times,
+ True,
+ )
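+
+
+if __name__ == "__main__":
+    # Editor's illustration (not part of upstream detectron2): time a trivial
+    # in-memory generator with `iter_benchmark`. Real callers would pass an
+    # iterator over a dataloader instead; everything below is made up.
+    def cheap_source():
+        while True:
+            yield sum(range(1000))  # stand-in for per-sample work
+
+    avg, all_times = iter_benchmark(cheap_source(), num_iter=500, warmup=20)
+    print("cheap_source: {:.2e} s/iter over {} timed iterations".format(avg, len(all_times)))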
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/build.py b/comfyui_controlnet_aux/src/custom_detectron2/data/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..274e778d7201ccba4f4953697d3c8937dfd8b0d3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/build.py
@@ -0,0 +1,556 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import numpy as np
+import operator
+import pickle
+from typing import Any, Callable, Dict, List, Optional, Union
+import torch
+import torch.utils.data as torchdata
+from tabulate import tabulate
+from termcolor import colored
+
+from custom_detectron2.config import configurable
+from custom_detectron2.structures import BoxMode
+from custom_detectron2.utils.comm import get_world_size
+from custom_detectron2.utils.env import seed_all_rng
+from custom_detectron2.utils.file_io import PathManager
+from custom_detectron2.utils.logger import _log_api_usage, log_first_n
+
+from .catalog import DatasetCatalog, MetadataCatalog
+from .common import AspectRatioGroupedDataset, DatasetFromList, MapDataset, ToIterableDataset
+from .dataset_mapper import DatasetMapper
+from .detection_utils import check_metadata_consistency
+from .samplers import (
+ InferenceSampler,
+ RandomSubsetTrainingSampler,
+ RepeatFactorTrainingSampler,
+ TrainingSampler,
+)
+
+"""
+This file contains the default logic to build a dataloader for training or testing.
+"""
+
+__all__ = [
+ "build_batch_data_loader",
+ "build_detection_train_loader",
+ "build_detection_test_loader",
+ "get_detection_dataset_dicts",
+ "load_proposals_into_dataset",
+ "print_instances_class_histogram",
+]
+
+
+def filter_images_with_only_crowd_annotations(dataset_dicts):
+ """
+    Filter out images with no annotations or with only crowd annotations
+    (i.e., images without non-crowd annotations).
+ A common training-time preprocessing on COCO dataset.
+
+ Args:
+ dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+
+ Returns:
+ list[dict]: the same format, but filtered.
+ """
+ num_before = len(dataset_dicts)
+
+ def valid(anns):
+ for ann in anns:
+ if ann.get("iscrowd", 0) == 0:
+ return True
+ return False
+
+ dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
+ num_after = len(dataset_dicts)
+ logger = logging.getLogger(__name__)
+ logger.info(
+ "Removed {} images with no usable annotations. {} images left.".format(
+ num_before - num_after, num_after
+ )
+ )
+ return dataset_dicts
+
+
+def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image):
+ """
+    Filter out images with too few keypoints.
+
+ Args:
+ dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+
+ Returns:
+ list[dict]: the same format as dataset_dicts, but filtered.
+ """
+ num_before = len(dataset_dicts)
+
+ def visible_keypoints_in_image(dic):
+ # Each keypoints field has the format [x1, y1, v1, ...], where v is visibility
+ annotations = dic["annotations"]
+ return sum(
+ (np.array(ann["keypoints"][2::3]) > 0).sum()
+ for ann in annotations
+ if "keypoints" in ann
+ )
+
+ dataset_dicts = [
+ x for x in dataset_dicts if visible_keypoints_in_image(x) >= min_keypoints_per_image
+ ]
+ num_after = len(dataset_dicts)
+ logger = logging.getLogger(__name__)
+ logger.info(
+ "Removed {} images with fewer than {} keypoints.".format(
+ num_before - num_after, min_keypoints_per_image
+ )
+ )
+ return dataset_dicts
+
+
+def load_proposals_into_dataset(dataset_dicts, proposal_file):
+ """
+ Load precomputed object proposals into the dataset.
+
+ The proposal file should be a pickled dict with the following keys:
+
+ - "ids": list[int] or list[str], the image ids
+ - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id
+ - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores
+ corresponding to the boxes.
+ - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``.
+
+ Args:
+ dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+ proposal_file (str): file path of pre-computed proposals, in pkl format.
+
+ Returns:
+ list[dict]: the same format as dataset_dicts, but added proposal field.
+ """
+ logger = logging.getLogger(__name__)
+ logger.info("Loading proposals from: {}".format(proposal_file))
+
+ with PathManager.open(proposal_file, "rb") as f:
+ proposals = pickle.load(f, encoding="latin1")
+
+ # Rename the key names in D1 proposal files
+ rename_keys = {"indexes": "ids", "scores": "objectness_logits"}
+ for key in rename_keys:
+ if key in proposals:
+ proposals[rename_keys[key]] = proposals.pop(key)
+
+ # Fetch the indexes of all proposals that are in the dataset
+ # Convert image_id to str since they could be int.
+ img_ids = set({str(record["image_id"]) for record in dataset_dicts})
+ id_to_index = {str(id): i for i, id in enumerate(proposals["ids"]) if str(id) in img_ids}
+
+    # Assuming the default bbox_mode of precomputed proposals is 'XYXY_ABS'
+ bbox_mode = BoxMode(proposals["bbox_mode"]) if "bbox_mode" in proposals else BoxMode.XYXY_ABS
+
+ for record in dataset_dicts:
+ # Get the index of the proposal
+ i = id_to_index[str(record["image_id"])]
+
+ boxes = proposals["boxes"][i]
+ objectness_logits = proposals["objectness_logits"][i]
+ # Sort the proposals in descending order of the scores
+ inds = objectness_logits.argsort()[::-1]
+ record["proposal_boxes"] = boxes[inds]
+ record["proposal_objectness_logits"] = objectness_logits[inds]
+ record["proposal_bbox_mode"] = bbox_mode
+
+ return dataset_dicts
+
+
+def print_instances_class_histogram(dataset_dicts, class_names):
+ """
+ Args:
+ dataset_dicts (list[dict]): list of dataset dicts.
+ class_names (list[str]): list of class names (zero-indexed).
+ """
+ num_classes = len(class_names)
+ hist_bins = np.arange(num_classes + 1)
+    histogram = np.zeros((num_classes,), dtype=int)  # np.int was removed in NumPy 1.24
+ for entry in dataset_dicts:
+ annos = entry["annotations"]
+ classes = np.asarray(
+ [x["category_id"] for x in annos if not x.get("iscrowd", 0)], dtype=np.int
+ )
+ if len(classes):
+ assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}"
+ assert (
+ classes.max() < num_classes
+ ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes"
+ histogram += np.histogram(classes, bins=hist_bins)[0]
+
+ N_COLS = min(6, len(class_names) * 2)
+
+ def short_name(x):
+ # make long class names shorter. useful for lvis
+ if len(x) > 13:
+ return x[:11] + ".."
+ return x
+
+ data = list(
+ itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])
+ )
+ total_num_instances = sum(data[1::2])
+ data.extend([None] * (N_COLS - (len(data) % N_COLS)))
+ if num_classes > 1:
+ data.extend(["total", total_num_instances])
+ data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
+ table = tabulate(
+ data,
+ headers=["category", "#instances"] * (N_COLS // 2),
+ tablefmt="pipe",
+ numalign="left",
+ stralign="center",
+ )
+ log_first_n(
+ logging.INFO,
+ "Distribution of instances among all {} categories:\n".format(num_classes)
+ + colored(table, "cyan"),
+ key="message",
+ )
+
+
+def get_detection_dataset_dicts(
+ names,
+ filter_empty=True,
+ min_keypoints=0,
+ proposal_files=None,
+ check_consistency=True,
+):
+ """
+ Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.
+
+ Args:
+ names (str or list[str]): a dataset name or a list of dataset names
+ filter_empty (bool): whether to filter out images without instance annotations
+ min_keypoints (int): filter out images with fewer keypoints than
+ `min_keypoints`. Set to 0 to do nothing.
+ proposal_files (list[str]): if given, a list of object proposal files
+ that match each dataset in `names`.
+ check_consistency (bool): whether to check if datasets have consistent metadata.
+
+ Returns:
+ list[dict]: a list of dicts following the standard dataset dict format.
+ """
+ if isinstance(names, str):
+ names = [names]
+ assert len(names), names
+ dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names]
+
+ if isinstance(dataset_dicts[0], torchdata.Dataset):
+ if len(dataset_dicts) > 1:
+ # ConcatDataset does not work for iterable style dataset.
+ # We could support concat for iterable as well, but it's often
+ # not a good idea to concat iterables anyway.
+ return torchdata.ConcatDataset(dataset_dicts)
+ return dataset_dicts[0]
+
+ for dataset_name, dicts in zip(names, dataset_dicts):
+ assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
+
+ if proposal_files is not None:
+ assert len(names) == len(proposal_files)
+ # load precomputed proposals from proposal files
+ dataset_dicts = [
+ load_proposals_into_dataset(dataset_i_dicts, proposal_file)
+ for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
+ ]
+
+ dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
+
+ has_instances = "annotations" in dataset_dicts[0]
+ if filter_empty and has_instances:
+ dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
+ if min_keypoints > 0 and has_instances:
+ dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
+
+ if check_consistency and has_instances:
+ try:
+ class_names = MetadataCatalog.get(names[0]).thing_classes
+ check_metadata_consistency("thing_classes", names)
+ print_instances_class_histogram(dataset_dicts, class_names)
+ except AttributeError: # class names are not available for this dataset
+ pass
+
+ assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
+ return dataset_dicts
+
+
+def build_batch_data_loader(
+ dataset,
+ sampler,
+ total_batch_size,
+ *,
+ aspect_ratio_grouping=False,
+ num_workers=0,
+ collate_fn=None,
+):
+ """
+ Build a batched dataloader. The main differences from `torch.utils.data.DataLoader` are:
+ 1. support aspect ratio grouping options
+ 2. use no "batch collation", because this is common for detection training
+
+ Args:
+ dataset (torch.utils.data.Dataset): a pytorch map-style or iterable dataset.
+ sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices.
+ Must be provided iff. ``dataset`` is a map-style dataset.
+ total_batch_size, aspect_ratio_grouping, num_workers, collate_fn: see
+ :func:`build_detection_train_loader`.
+
+ Returns:
+ iterable[list]. Length of each list is the batch size of the current
+ GPU. Each element in the list comes from the dataset.
+ """
+ world_size = get_world_size()
+ assert (
+ total_batch_size > 0 and total_batch_size % world_size == 0
+ ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
+ total_batch_size, world_size
+ )
+ batch_size = total_batch_size // world_size
+
+ if isinstance(dataset, torchdata.IterableDataset):
+ assert sampler is None, "sampler must be None if dataset is IterableDataset"
+ else:
+ dataset = ToIterableDataset(dataset, sampler)
+
+ if aspect_ratio_grouping:
+ data_loader = torchdata.DataLoader(
+ dataset,
+ num_workers=num_workers,
+ collate_fn=operator.itemgetter(0), # don't batch, but yield individual elements
+ worker_init_fn=worker_init_reset_seed,
+ ) # yield individual mapped dict
+ data_loader = AspectRatioGroupedDataset(data_loader, batch_size)
+ if collate_fn is None:
+ return data_loader
+ return MapDataset(data_loader, collate_fn)
+ else:
+ return torchdata.DataLoader(
+ dataset,
+ batch_size=batch_size,
+ drop_last=True,
+ num_workers=num_workers,
+ collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
+ worker_init_fn=worker_init_reset_seed,
+ )
+
+
+def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
+ if dataset is None:
+ dataset = get_detection_dataset_dicts(
+ cfg.DATASETS.TRAIN,
+ filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
+ min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+ if cfg.MODEL.KEYPOINT_ON
+ else 0,
+ proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
+ )
+ _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
+
+ if mapper is None:
+ mapper = DatasetMapper(cfg, True)
+
+ if sampler is None:
+ sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
+ logger = logging.getLogger(__name__)
+ if isinstance(dataset, torchdata.IterableDataset):
+ logger.info("Not using any sampler since the dataset is IterableDataset.")
+ sampler = None
+ else:
+ logger.info("Using training sampler {}".format(sampler_name))
+ if sampler_name == "TrainingSampler":
+ sampler = TrainingSampler(len(dataset))
+ elif sampler_name == "RepeatFactorTrainingSampler":
+ repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
+ dataset, cfg.DATALOADER.REPEAT_THRESHOLD
+ )
+ sampler = RepeatFactorTrainingSampler(repeat_factors)
+ elif sampler_name == "RandomSubsetTrainingSampler":
+ sampler = RandomSubsetTrainingSampler(
+ len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO
+ )
+ else:
+ raise ValueError("Unknown training sampler: {}".format(sampler_name))
+
+ return {
+ "dataset": dataset,
+ "sampler": sampler,
+ "mapper": mapper,
+ "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
+ "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
+ "num_workers": cfg.DATALOADER.NUM_WORKERS,
+ }
+
+
+@configurable(from_config=_train_loader_from_config)
+def build_detection_train_loader(
+ dataset,
+ *,
+ mapper,
+ sampler=None,
+ total_batch_size,
+ aspect_ratio_grouping=True,
+ num_workers=0,
+ collate_fn=None,
+):
+ """
+ Build a dataloader for object detection with some default features.
+
+ Args:
+ dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
+ or a pytorch dataset (either map-style or iterable). It can be obtained
+ by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+ mapper (callable): a callable which takes a sample (dict) from dataset and
+ returns the format to be consumed by the model.
+ When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
+ sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
+ indices to be applied on ``dataset``.
+ If ``dataset`` is map-style, the default sampler is a :class:`TrainingSampler`,
+ which coordinates an infinite random shuffle sequence across all workers.
+ Sampler must be None if ``dataset`` is iterable.
+ total_batch_size (int): total batch size across all workers.
+ aspect_ratio_grouping (bool): whether to group images with similar
+ aspect ratio for efficiency. When enabled, it requires each
+ element in dataset be a dict with keys "width" and "height".
+ num_workers (int): number of parallel data loading workers
+ collate_fn: a function that determines how to do batching, same as the argument of
+ `torch.utils.data.DataLoader`. Defaults to do no collation and return a list of
+ data. No collation is OK for small batch size and simple data structures.
+ If your batch size is large and each sample contains too many small tensors,
+ it's more efficient to collate them in data loader.
+
+ Returns:
+ torch.utils.data.DataLoader:
+            a dataloader. Each output from it is a ``list[mapped_element]`` of length
+            ``total_batch_size // world_size`` (the per-GPU batch size), where
+            ``mapped_element`` is produced by the ``mapper``.
+ """
+ if isinstance(dataset, list):
+ dataset = DatasetFromList(dataset, copy=False)
+ if mapper is not None:
+ dataset = MapDataset(dataset, mapper)
+
+ if isinstance(dataset, torchdata.IterableDataset):
+ assert sampler is None, "sampler must be None if dataset is IterableDataset"
+ else:
+ if sampler is None:
+ sampler = TrainingSampler(len(dataset))
+ assert isinstance(sampler, torchdata.Sampler), f"Expect a Sampler but got {type(sampler)}"
+ return build_batch_data_loader(
+ dataset,
+ sampler,
+ total_batch_size,
+ aspect_ratio_grouping=aspect_ratio_grouping,
+ num_workers=num_workers,
+ collate_fn=collate_fn,
+ )
+
+
+def _test_loader_from_config(cfg, dataset_name, mapper=None):
+ """
+ Uses the given `dataset_name` argument (instead of the names in cfg), because the
+ standard practice is to evaluate each test set individually (not combining them).
+ """
+ if isinstance(dataset_name, str):
+ dataset_name = [dataset_name]
+
+ dataset = get_detection_dataset_dicts(
+ dataset_name,
+ filter_empty=False,
+ proposal_files=[
+ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name
+ ]
+ if cfg.MODEL.LOAD_PROPOSALS
+ else None,
+ )
+ if mapper is None:
+ mapper = DatasetMapper(cfg, False)
+ return {
+ "dataset": dataset,
+ "mapper": mapper,
+ "num_workers": cfg.DATALOADER.NUM_WORKERS,
+ "sampler": InferenceSampler(len(dataset))
+ if not isinstance(dataset, torchdata.IterableDataset)
+ else None,
+ }
+
+
+@configurable(from_config=_test_loader_from_config)
+def build_detection_test_loader(
+ dataset: Union[List[Any], torchdata.Dataset],
+ *,
+ mapper: Callable[[Dict[str, Any]], Any],
+ sampler: Optional[torchdata.Sampler] = None,
+ batch_size: int = 1,
+ num_workers: int = 0,
+ collate_fn: Optional[Callable[[List[Any]], Any]] = None,
+) -> torchdata.DataLoader:
+ """
+ Similar to `build_detection_train_loader`, with default batch size = 1,
+ and sampler = :class:`InferenceSampler`. This sampler coordinates all workers
+ to produce the exact set of all samples.
+
+ Args:
+ dataset: a list of dataset dicts,
+ or a pytorch dataset (either map-style or iterable). They can be obtained
+ by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+ mapper: a callable which takes a sample (dict) from dataset
+ and returns the format to be consumed by the model.
+ When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
+ sampler: a sampler that produces
+ indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
+ which splits the dataset across all workers. Sampler must be None
+ if `dataset` is iterable.
+ batch_size: the batch size of the data loader to be created.
+ Default to 1 image per worker since this is the standard when reporting
+ inference time in papers.
+ num_workers: number of parallel data loading workers
+ collate_fn: same as the argument of `torch.utils.data.DataLoader`.
+ Defaults to do no collation and return a list of data.
+
+ Returns:
+ DataLoader: a torch DataLoader, that loads the given detection
+ dataset, with test-time transformation and batching.
+
+ Examples:
+ ::
+ data_loader = build_detection_test_loader(
+ DatasetRegistry.get("my_test"),
+ mapper=DatasetMapper(...))
+
+ # or, instantiate with a CfgNode:
+ data_loader = build_detection_test_loader(cfg, "my_test")
+ """
+ if isinstance(dataset, list):
+ dataset = DatasetFromList(dataset, copy=False)
+ if mapper is not None:
+ dataset = MapDataset(dataset, mapper)
+ if isinstance(dataset, torchdata.IterableDataset):
+ assert sampler is None, "sampler must be None if dataset is IterableDataset"
+ else:
+ if sampler is None:
+ sampler = InferenceSampler(len(dataset))
+ return torchdata.DataLoader(
+ dataset,
+ batch_size=batch_size,
+ sampler=sampler,
+ drop_last=False,
+ num_workers=num_workers,
+ collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
+ )
+
+
+def trivial_batch_collator(batch):
+ """
+ A batch collator that does nothing.
+ """
+ return batch
+
+
+def worker_init_reset_seed(worker_id):
+ initial_seed = torch.initial_seed() % 2**31
+ seed_all_rng(initial_seed + worker_id)
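+
+
+if __name__ == "__main__":
+    # Editor's sketch (not part of upstream detectron2): build a training loader
+    # from an in-memory list of dicts with a pass-through mapper, so the batching
+    # logic can be observed without any real dataset. All names are made up.
+    fake_dicts = [{"file_name": "img_{}.jpg".format(i), "width": 640, "height": 480} for i in range(8)]
+
+    def passthrough_mapper(d):
+        # A real mapper (e.g. DatasetMapper) would read the image and build
+        # model inputs; here we simply forward the dict.
+        return d
+
+    loader = build_detection_train_loader(
+        fake_dicts,
+        mapper=passthrough_mapper,
+        total_batch_size=2,
+        aspect_ratio_grouping=False,
+        num_workers=0,
+    )
+    batch = next(iter(loader))  # list of 2 mapped dicts (the stream is infinite)
+    print(len(batch), batch[0]["file_name"])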
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/catalog.py b/comfyui_controlnet_aux/src/custom_detectron2/data/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b773678b724af89e15cafec5617f600c7a780d2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/catalog.py
@@ -0,0 +1,236 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import types
+from collections import UserDict
+from typing import List
+
+from custom_detectron2.utils.logger import log_first_n
+
+__all__ = ["DatasetCatalog", "MetadataCatalog", "Metadata"]
+
+
+class _DatasetCatalog(UserDict):
+ """
+ A global dictionary that stores information about the datasets and how to obtain them.
+
+ It contains a mapping from strings
+ (which are names that identify a dataset, e.g. "coco_2014_train")
+ to a function which parses the dataset and returns the samples in the
+ format of `list[dict]`.
+
+ The returned dicts should be in Detectron2 Dataset format (See DATASETS.md for details)
+    if used with the data loader functionalities in `data/build.py`, `data/detection_transform.py`.
+
+ The purpose of having this catalog is to make it easy to choose
+ different datasets, by just using the strings in the config.
+ """
+
+ def register(self, name, func):
+ """
+ Args:
+ name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+ func (callable): a callable which takes no arguments and returns a list of dicts.
+ It must return the same results if called multiple times.
+ """
+ assert callable(func), "You must register a function with `DatasetCatalog.register`!"
+ assert name not in self, "Dataset '{}' is already registered!".format(name)
+ self[name] = func
+
+ def get(self, name):
+ """
+ Call the registered function and return its results.
+
+ Args:
+ name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+
+ Returns:
+ list[dict]: dataset annotations.
+ """
+ try:
+ f = self[name]
+ except KeyError as e:
+ raise KeyError(
+ "Dataset '{}' is not registered! Available datasets are: {}".format(
+ name, ", ".join(list(self.keys()))
+ )
+ ) from e
+ return f()
+
+ def list(self) -> List[str]:
+ """
+ List all registered datasets.
+
+ Returns:
+ list[str]
+ """
+ return list(self.keys())
+
+ def remove(self, name):
+ """
+ Alias of ``pop``.
+ """
+ self.pop(name)
+
+ def __str__(self):
+ return "DatasetCatalog(registered datasets: {})".format(", ".join(self.keys()))
+
+ __repr__ = __str__
+
+
+DatasetCatalog = _DatasetCatalog()
+DatasetCatalog.__doc__ = (
+ _DatasetCatalog.__doc__
+ + """
+ .. automethod:: detectron2.data.catalog.DatasetCatalog.register
+ .. automethod:: detectron2.data.catalog.DatasetCatalog.get
+"""
+)
+
+
+class Metadata(types.SimpleNamespace):
+ """
+ A class that supports simple attribute setter/getter.
+    It is intended for storing metadata of a dataset and making it accessible globally.
+
+ Examples:
+ ::
+ # somewhere when you load the data:
+ MetadataCatalog.get("mydataset").thing_classes = ["person", "dog"]
+
+ # somewhere when you print statistics or visualize:
+ classes = MetadataCatalog.get("mydataset").thing_classes
+ """
+
+ # the name of the dataset
+ # set default to N/A so that `self.name` in the errors will not trigger getattr again
+ name: str = "N/A"
+
+ _RENAMED = {
+ "class_names": "thing_classes",
+ "dataset_id_to_contiguous_id": "thing_dataset_id_to_contiguous_id",
+ "stuff_class_names": "stuff_classes",
+ }
+
+ def __getattr__(self, key):
+ if key in self._RENAMED:
+ log_first_n(
+ logging.WARNING,
+ "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+ n=10,
+ )
+ return getattr(self, self._RENAMED[key])
+
+ # "name" exists in every metadata
+ if len(self.__dict__) > 1:
+ raise AttributeError(
+ "Attribute '{}' does not exist in the metadata of dataset '{}'. Available "
+ "keys are {}.".format(key, self.name, str(self.__dict__.keys()))
+ )
+ else:
+ raise AttributeError(
+ f"Attribute '{key}' does not exist in the metadata of dataset '{self.name}': "
+ "metadata is empty."
+ )
+
+ def __setattr__(self, key, val):
+ if key in self._RENAMED:
+ log_first_n(
+ logging.WARNING,
+ "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+ n=10,
+ )
+ setattr(self, self._RENAMED[key], val)
+
+ # Ensure that metadata of the same name stays consistent
+ try:
+ oldval = getattr(self, key)
+ assert oldval == val, (
+ "Attribute '{}' in the metadata of '{}' cannot be set "
+ "to a different value!\n{} != {}".format(key, self.name, oldval, val)
+ )
+ except AttributeError:
+ super().__setattr__(key, val)
+
+ def as_dict(self):
+ """
+ Returns all the metadata as a dict.
+ Note that modifications to the returned dict will not reflect on the Metadata object.
+ """
+ return copy.copy(self.__dict__)
+
+ def set(self, **kwargs):
+ """
+ Set multiple metadata with kwargs.
+ """
+ for k, v in kwargs.items():
+ setattr(self, k, v)
+ return self
+
+ def get(self, key, default=None):
+ """
+ Access an attribute and return its value if exists.
+ Otherwise return default.
+ """
+ try:
+ return getattr(self, key)
+ except AttributeError:
+ return default
+
+
+class _MetadataCatalog(UserDict):
+ """
+ MetadataCatalog is a global dictionary that provides access to
+ :class:`Metadata` of a given dataset.
+
+ The metadata associated with a certain name is a singleton: once created, the
+ metadata will stay alive and will be returned by future calls to ``get(name)``.
+
+ It's like global variables, so don't abuse it.
+ It's meant for storing knowledge that's constant and shared across the execution
+ of the program, e.g.: the class names in COCO.
+ """
+
+ def get(self, name):
+ """
+ Args:
+ name (str): name of a dataset (e.g. coco_2014_train).
+
+ Returns:
+ Metadata: The :class:`Metadata` instance associated with this name,
+ or create an empty one if none is available.
+ """
+ assert len(name)
+ r = super().get(name, None)
+ if r is None:
+ r = self[name] = Metadata(name=name)
+ return r
+
+ def list(self):
+ """
+ List all registered metadata.
+
+ Returns:
+ list[str]: keys (names of datasets) of all registered metadata
+ """
+ return list(self.keys())
+
+ def remove(self, name):
+ """
+ Alias of ``pop``.
+ """
+ self.pop(name)
+
+ def __str__(self):
+ return "MetadataCatalog(registered metadata: {})".format(", ".join(self.keys()))
+
+ __repr__ = __str__
+
+
+MetadataCatalog = _MetadataCatalog()
+MetadataCatalog.__doc__ = (
+ _MetadataCatalog.__doc__
+ + """
+ .. automethod:: detectron2.data.catalog.MetadataCatalog.get
+"""
+)
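+
+
+if __name__ == "__main__":
+    # Editor's illustration (not upstream code): register a toy dataset and
+    # attach metadata to it. The dataset name and its contents are made up.
+    def _load_toy_set():
+        # The loader is only called lazily, when DatasetCatalog.get() is used.
+        return [{"file_name": "toy_0.jpg", "image_id": 0, "annotations": []}]
+
+    DatasetCatalog.register("toy_set", _load_toy_set)
+    MetadataCatalog.get("toy_set").set(thing_classes=["widget"], evaluator_type="coco")
+
+    print(DatasetCatalog.list())                          # [..., 'toy_set']
+    print(len(DatasetCatalog.get("toy_set")))             # 1
+    print(MetadataCatalog.get("toy_set").thing_classes)   # ['widget']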
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/common.py b/comfyui_controlnet_aux/src/custom_detectron2/data/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f2338f7efa73d05ad6afe96b094c0ac224103fa
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/common.py
@@ -0,0 +1,301 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import copy
+import itertools
+import logging
+import numpy as np
+import pickle
+import random
+from typing import Callable, Union
+import torch
+import torch.utils.data as data
+from torch.utils.data.sampler import Sampler
+
+from custom_detectron2.utils.serialize import PicklableWrapper
+
+__all__ = ["MapDataset", "DatasetFromList", "AspectRatioGroupedDataset", "ToIterableDataset"]
+
+logger = logging.getLogger(__name__)
+
+
+def _shard_iterator_dataloader_worker(iterable):
+ # Shard the iterable if we're currently inside pytorch dataloader worker.
+ worker_info = data.get_worker_info()
+ if worker_info is None or worker_info.num_workers == 1:
+ # do nothing
+ yield from iterable
+ else:
+ yield from itertools.islice(iterable, worker_info.id, None, worker_info.num_workers)
+
+
+class _MapIterableDataset(data.IterableDataset):
+ """
+ Map a function over elements in an IterableDataset.
+
+    Similar to pytorch's MapIterDataPipe, but supports filtering when map_func
+ returns None.
+
+ This class is not public-facing. Will be called by `MapDataset`.
+ """
+
+ def __init__(self, dataset, map_func):
+ self._dataset = dataset
+ self._map_func = PicklableWrapper(map_func) # wrap so that a lambda will work
+
+ def __len__(self):
+ return len(self._dataset)
+
+ def __iter__(self):
+ for x in map(self._map_func, self._dataset):
+ if x is not None:
+ yield x
+
+
+class MapDataset(data.Dataset):
+ """
+ Map a function over the elements in a dataset.
+ """
+
+ def __init__(self, dataset, map_func):
+ """
+ Args:
+ dataset: a dataset where map function is applied. Can be either
+ map-style or iterable dataset. When given an iterable dataset,
+ the returned object will also be an iterable dataset.
+ map_func: a callable which maps the element in dataset. map_func can
+ return None to skip the data (e.g. in case of errors).
+ How None is handled depends on the style of `dataset`.
+ If `dataset` is map-style, it randomly tries other elements.
+ If `dataset` is iterable, it skips the data and tries the next.
+ """
+ self._dataset = dataset
+ self._map_func = PicklableWrapper(map_func) # wrap so that a lambda will work
+
+ self._rng = random.Random(42)
+ self._fallback_candidates = set(range(len(dataset)))
+
+ def __new__(cls, dataset, map_func):
+ is_iterable = isinstance(dataset, data.IterableDataset)
+ if is_iterable:
+ return _MapIterableDataset(dataset, map_func)
+ else:
+ return super().__new__(cls)
+
+ def __getnewargs__(self):
+ return self._dataset, self._map_func
+
+ def __len__(self):
+ return len(self._dataset)
+
+ def __getitem__(self, idx):
+ retry_count = 0
+ cur_idx = int(idx)
+
+ while True:
+ data = self._map_func(self._dataset[cur_idx])
+ if data is not None:
+ self._fallback_candidates.add(cur_idx)
+ return data
+
+ # _map_func fails for this idx, use a random new index from the pool
+ retry_count += 1
+ self._fallback_candidates.discard(cur_idx)
+            # random.sample() requires a sequence (not a set) on Python >= 3.11
+            cur_idx = self._rng.sample(list(self._fallback_candidates), k=1)[0]
+
+ if retry_count >= 3:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ "Failed to apply `_map_func` for idx: {}, retry count: {}".format(
+ idx, retry_count
+ )
+ )
+
+
+class _TorchSerializedList(object):
+ """
+ A list-like object whose items are serialized and stored in a torch tensor. When
+ launching a process that uses TorchSerializedList with "fork" start method,
+ the subprocess can read the same buffer without triggering copy-on-access. When
+ launching a process that uses TorchSerializedList with "spawn/forkserver" start
+ method, the list will be pickled by a special ForkingPickler registered by PyTorch
+ that moves data to shared memory. In both cases, this allows parent and child
+    processes to share RAM for the list data, hence avoiding the issue in
+ https://github.com/pytorch/pytorch/issues/13246.
+
+ See also https://ppwwyyxx.com/blog/2022/Demystify-RAM-Usage-in-Multiprocess-DataLoader/
+ on how it works.
+ """
+
+ def __init__(self, lst: list):
+ self._lst = lst
+
+ def _serialize(data):
+ buffer = pickle.dumps(data, protocol=-1)
+ return np.frombuffer(buffer, dtype=np.uint8)
+
+ logger.info(
+ "Serializing {} elements to byte tensors and concatenating them all ...".format(
+ len(self._lst)
+ )
+ )
+ self._lst = [_serialize(x) for x in self._lst]
+ self._addr = np.asarray([len(x) for x in self._lst], dtype=np.int64)
+ self._addr = torch.from_numpy(np.cumsum(self._addr))
+ self._lst = torch.from_numpy(np.concatenate(self._lst))
+ logger.info("Serialized dataset takes {:.2f} MiB".format(len(self._lst) / 1024**2))
+
+ def __len__(self):
+ return len(self._addr)
+
+ def __getitem__(self, idx):
+ start_addr = 0 if idx == 0 else self._addr[idx - 1].item()
+ end_addr = self._addr[idx].item()
+ bytes = memoryview(self._lst[start_addr:end_addr].numpy())
+
+ # @lint-ignore PYTHONPICKLEISBAD
+ return pickle.loads(bytes)
+
+
+_DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = _TorchSerializedList
+
+
+@contextlib.contextmanager
+def set_default_dataset_from_list_serialize_method(new):
+ """
+    Context manager for using a custom serialize function when creating DatasetFromList.
+ """
+
+ global _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD
+ orig = _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD
+ _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = new
+ yield
+ _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = orig
+
+
+class DatasetFromList(data.Dataset):
+ """
+    Wrap a list into a torch Dataset. It produces elements of the list as data.
+ """
+
+ def __init__(
+ self,
+ lst: list,
+ copy: bool = True,
+ serialize: Union[bool, Callable] = True,
+ ):
+ """
+ Args:
+ lst (list): a list which contains elements to produce.
+ copy (bool): whether to deepcopy the element when producing it,
+ so that the result can be modified in place without affecting the
+ source in the list.
+            serialize (bool or callable): whether to serialize the storage to another
+                backend. If `True`, the default serialize method will be used; if a
+                callable is given, it will be used as the serialize method.
+ """
+ self._lst = lst
+ self._copy = copy
+ if not isinstance(serialize, (bool, Callable)):
+            raise TypeError(f"Unsupported type for argument `serialize`: {serialize}")
+ self._serialize = serialize is not False
+
+ if self._serialize:
+ serialize_method = (
+ serialize
+ if isinstance(serialize, Callable)
+ else _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD
+ )
+ logger.info(f"Serializing the dataset using: {serialize_method}")
+ self._lst = serialize_method(self._lst)
+
+ def __len__(self):
+ return len(self._lst)
+
+ def __getitem__(self, idx):
+ if self._copy and not self._serialize:
+ return copy.deepcopy(self._lst[idx])
+ else:
+ return self._lst[idx]
+
+
+class ToIterableDataset(data.IterableDataset):
+ """
+ Convert an old indices-based (also called map-style) dataset
+ to an iterable-style dataset.
+ """
+
+ def __init__(self, dataset: data.Dataset, sampler: Sampler, shard_sampler: bool = True):
+ """
+ Args:
+ dataset: an old-style dataset with ``__getitem__``
+ sampler: a cheap iterable that produces indices to be applied on ``dataset``.
+ shard_sampler: whether to shard the sampler based on the current pytorch data loader
+ worker id. When an IterableDataset is forked by pytorch's DataLoader into multiple
+ workers, it is responsible for sharding its data based on worker id so that workers
+ don't produce identical data.
+
+ Most samplers (like our TrainingSampler) do not shard based on dataloader worker id
+ and this argument should be set to True. But certain samplers may be already
+ sharded, in that case this argument should be set to False.
+ """
+ assert not isinstance(dataset, data.IterableDataset), dataset
+ assert isinstance(sampler, Sampler), sampler
+ self.dataset = dataset
+ self.sampler = sampler
+ self.shard_sampler = shard_sampler
+
+ def __iter__(self):
+ if not self.shard_sampler:
+ sampler = self.sampler
+ else:
+ # With map-style dataset, `DataLoader(dataset, sampler)` runs the
+ # sampler in main process only. But `DataLoader(ToIterableDataset(dataset, sampler))`
+            # will run the sampler in every one of the N workers. So we should only keep 1/N of the ids on
+ # each worker. The assumption is that sampler is cheap to iterate so it's fine to
+ # discard ids in workers.
+ sampler = _shard_iterator_dataloader_worker(self.sampler)
+ for idx in sampler:
+ yield self.dataset[idx]
+
+ def __len__(self):
+ return len(self.sampler)
+
+
+class AspectRatioGroupedDataset(data.IterableDataset):
+ """
+ Batch data that have similar aspect ratio together.
+ In this implementation, images whose aspect ratio < (or >) 1 will
+ be batched together.
+ This improves training speed because the images then need less padding
+ to form a batch.
+
+ It assumes the underlying dataset produces dicts with "width" and "height" keys.
+ It will then produce a list of original dicts with length = batch_size,
+ all with similar aspect ratios.
+ """
+
+ def __init__(self, dataset, batch_size):
+ """
+ Args:
+ dataset: an iterable. Each element must be a dict with keys
+ "width" and "height", which will be used to batch data.
+ batch_size (int):
+ """
+ self.dataset = dataset
+ self.batch_size = batch_size
+ self._buckets = [[] for _ in range(2)]
+ # Hard-coded two aspect ratio groups: w > h and w < h.
+ # Can add support for more aspect ratio groups, but doesn't seem useful
+
+ def __iter__(self):
+ for d in self.dataset:
+ w, h = d["width"], d["height"]
+ bucket_id = 0 if w > h else 1
+ bucket = self._buckets[bucket_id]
+ bucket.append(d)
+ if len(bucket) == self.batch_size:
+ data = bucket[:]
+ # Clear bucket first, because code after yield is not
+ # guaranteed to execute
+ del bucket[:]
+ yield data
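+
+
+if __name__ == "__main__":
+    # Editor's illustration (not upstream code): wrap an in-memory list and
+    # group synthetic "images" by orientation. All dicts below are made up.
+    dicts = [
+        {"width": 640, "height": 480},  # landscape
+        {"width": 480, "height": 640},  # portrait
+        {"width": 800, "height": 600},  # landscape
+        {"width": 600, "height": 800},  # portrait
+    ]
+    ds = DatasetFromList(dicts, copy=False, serialize=True)  # stored as one byte tensor
+    mapped = MapDataset(ds, lambda d: dict(d, aspect=d["width"] / d["height"]))
+    print(mapped[0]["aspect"])  # 1.333...
+
+    # Landscape and portrait items end up in separate batches of size 2.
+    for batch in AspectRatioGroupedDataset(iter(dicts), batch_size=2):
+        print([(d["width"], d["height"]) for d in batch])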
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/dataset_mapper.py b/comfyui_controlnet_aux/src/custom_detectron2/data/dataset_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..962ab660a293c26b5dd99df9b14445f926fbd616
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/dataset_mapper.py
@@ -0,0 +1,191 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import numpy as np
+from typing import List, Optional, Union
+import torch
+
+from custom_detectron2.config import configurable
+
+from . import detection_utils as utils
+from . import transforms as T
+
+"""
+This file contains the default mapping that's applied to "dataset dicts".
+"""
+
+__all__ = ["DatasetMapper"]
+
+
+class DatasetMapper:
+ """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and maps it into a format used by the model.
+
+    This is the default callable used to map your dataset dict into training data.
+    You may want to use it as a reference when implementing your own mapper for customized logic,
+ such as a different way to read or transform images.
+ See :doc:`/tutorials/data_loading` for details.
+
+ The callable currently does the following:
+
+    1. Reads the image from "file_name"
+    2. Applies cropping/geometric transforms to the image and annotations
+    3. Prepares data and annotations into Tensor and :class:`Instances`
+ """
+
+ @configurable
+ def __init__(
+ self,
+ is_train: bool,
+ *,
+ augmentations: List[Union[T.Augmentation, T.Transform]],
+ image_format: str,
+ use_instance_mask: bool = False,
+ use_keypoint: bool = False,
+ instance_mask_format: str = "polygon",
+ keypoint_hflip_indices: Optional[np.ndarray] = None,
+ precomputed_proposal_topk: Optional[int] = None,
+ recompute_boxes: bool = False,
+ ):
+ """
+ NOTE: this interface is experimental.
+
+ Args:
+ is_train: whether it's used in training or inference
+ augmentations: a list of augmentations or deterministic transforms to apply
+ image_format: an image format supported by :func:`detection_utils.read_image`.
+ use_instance_mask: whether to process instance segmentation annotations, if available
+ use_keypoint: whether to process keypoint annotations if available
+ instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
+ masks into this format.
+ keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
+ precomputed_proposal_topk: if given, will load pre-computed
+ proposals from dataset_dict and keep the top k proposals for each image.
+ recompute_boxes: whether to overwrite bounding box annotations
+ by computing tight bounding boxes from instance mask annotations.
+ """
+ if recompute_boxes:
+ assert use_instance_mask, "recompute_boxes requires instance masks"
+ # fmt: off
+ self.is_train = is_train
+ self.augmentations = T.AugmentationList(augmentations)
+ self.image_format = image_format
+ self.use_instance_mask = use_instance_mask
+ self.instance_mask_format = instance_mask_format
+ self.use_keypoint = use_keypoint
+ self.keypoint_hflip_indices = keypoint_hflip_indices
+ self.proposal_topk = precomputed_proposal_topk
+ self.recompute_boxes = recompute_boxes
+ # fmt: on
+ logger = logging.getLogger(__name__)
+ mode = "training" if is_train else "inference"
+ logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")
+
+ @classmethod
+ def from_config(cls, cfg, is_train: bool = True):
+ augs = utils.build_augmentation(cfg, is_train)
+ if cfg.INPUT.CROP.ENABLED and is_train:
+ augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
+ recompute_boxes = cfg.MODEL.MASK_ON
+ else:
+ recompute_boxes = False
+
+ ret = {
+ "is_train": is_train,
+ "augmentations": augs,
+ "image_format": cfg.INPUT.FORMAT,
+ "use_instance_mask": cfg.MODEL.MASK_ON,
+ "instance_mask_format": cfg.INPUT.MASK_FORMAT,
+ "use_keypoint": cfg.MODEL.KEYPOINT_ON,
+ "recompute_boxes": recompute_boxes,
+ }
+
+ if cfg.MODEL.KEYPOINT_ON:
+ ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+
+ if cfg.MODEL.LOAD_PROPOSALS:
+ ret["precomputed_proposal_topk"] = (
+ cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
+ if is_train
+ else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
+ )
+ return ret
+
+ def _transform_annotations(self, dataset_dict, transforms, image_shape):
+ # USER: Modify this if you want to keep them for some reason.
+ for anno in dataset_dict["annotations"]:
+ if not self.use_instance_mask:
+ anno.pop("segmentation", None)
+ if not self.use_keypoint:
+ anno.pop("keypoints", None)
+
+ # USER: Implement additional transformations if you have other types of data
+ annos = [
+ utils.transform_instance_annotations(
+ obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+ )
+ for obj in dataset_dict.pop("annotations")
+ if obj.get("iscrowd", 0) == 0
+ ]
+ instances = utils.annotations_to_instances(
+ annos, image_shape, mask_format=self.instance_mask_format
+ )
+
+ # After transforms such as cropping are applied, the bounding box may no longer
+ # tightly bound the object. As an example, imagine a triangle object
+ # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
+ # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
+ # the intersection of original bounding box and the cropping box.
+ if self.recompute_boxes:
+ instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
+ dataset_dict["instances"] = utils.filter_empty_instances(instances)
+
+ def __call__(self, dataset_dict):
+ """
+ Args:
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+ Returns:
+ dict: a format that builtin models in detectron2 accept
+ """
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
+ # USER: Write your own image loading if it's not from a file
+ image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
+ utils.check_image_size(dataset_dict, image)
+
+ # USER: Remove if you don't do semantic/panoptic segmentation.
+ if "sem_seg_file_name" in dataset_dict:
+ sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
+ else:
+ sem_seg_gt = None
+
+ aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
+ transforms = self.augmentations(aug_input)
+ image, sem_seg_gt = aug_input.image, aug_input.sem_seg
+
+ image_shape = image.shape[:2] # h, w
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+ # Therefore it's important to use torch.Tensor.
+ dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
+ if sem_seg_gt is not None:
+ dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
+
+ # USER: Remove if you don't use pre-computed proposals.
+ # Most users would not need this feature.
+ if self.proposal_topk is not None:
+ utils.transform_proposals(
+ dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk
+ )
+
+ if not self.is_train:
+ # USER: Modify this if you want to keep them for some reason.
+ dataset_dict.pop("annotations", None)
+ dataset_dict.pop("sem_seg_file_name", None)
+ return dataset_dict
+
+ if "annotations" in dataset_dict:
+ self._transform_annotations(dataset_dict, transforms, image_shape)
+
+ return dataset_dict
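+
+
+if __name__ == "__main__":
+    # Editor's sketch (not upstream code): run the mapper on a synthetic image
+    # written to a temporary file, using explicit augmentations instead of a cfg.
+    # The file name and sizes are arbitrary.
+    import os
+    import tempfile
+
+    from PIL import Image
+
+    tmpdir = tempfile.mkdtemp()
+    path = os.path.join(tmpdir, "synthetic.png")
+    Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8)).save(path)
+
+    mapper = DatasetMapper(
+        is_train=False,
+        augmentations=[T.ResizeShortestEdge(256, max_size=512)],
+        image_format="BGR",
+    )
+    out = mapper({"file_name": path, "image_id": 0, "width": 640, "height": 480})
+    print(out["image"].shape)  # CHW tensor, shortest edge resized to 256
+
+    os.remove(path)
+    os.rmdir(tmpdir)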
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/README.md b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f1ffebffd820647186ade90e71fb154e18161ee3
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/README.md
@@ -0,0 +1,9 @@
+
+
+### Common Datasets
+
+The datasets implemented here do not need to load the data into the final format.
+They should provide the minimal data structure needed to use the dataset, so loading can be very efficient.
+
+For example, for an image dataset, just provide the file names and labels, but don't read the images.
+Let the downstream code decide how to read them, as in the sketch below.
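+
+A minimal sketch of this pattern (the dataset name and paths are illustrative):
+
+```python
+from custom_detectron2.data import DatasetCatalog
+
+
+def load_my_tiny_set():
+    # Return lightweight dicts only; images are read later by the dataset mapper.
+    return [
+        {"file_name": "images/{:04d}.jpg".format(i), "image_id": i, "width": 640, "height": 480}
+        for i in range(100)
+    ]
+
+
+DatasetCatalog.register("my_tiny_set", load_my_tiny_set)
+```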
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..72e9c9aeb5f0bd13df86db0f50e4712a4a6dbf74
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .coco import load_coco_json, load_sem_seg, register_coco_instances, convert_to_coco_json
+from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated
+from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta
+from .pascal_voc import load_voc_instances, register_pascal_voc
+from . import builtin as _builtin # ensure the builtin datasets are registered
+
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/builtin.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ca297ce1ef84bd9c53aec67d726e8e3fc06734
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/builtin.py
@@ -0,0 +1,259 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+
+"""
+This file registers pre-defined datasets at hard-coded paths, and their metadata.
+
+We hard-code metadata for common datasets. This enables:
+1. Consistency checks when loading the datasets
+2. Using models on these standard datasets directly and running demos,
+   without having to download the dataset annotations
+
+We hard-code some paths to the dataset that's assumed to
+exist in "./datasets/".
+
+Users SHOULD NOT use this file to register new datasets or metadata.
+To add a new dataset, refer to the tutorial "docs/DATASETS.md".
+"""
+
+import os
+
+from custom_detectron2.data import DatasetCatalog, MetadataCatalog
+
+from .builtin_meta import ADE20K_SEM_SEG_CATEGORIES, _get_builtin_metadata
+from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic
+from .cityscapes_panoptic import register_all_cityscapes_panoptic
+from .coco import load_sem_seg, register_coco_instances
+from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated
+from .lvis import get_lvis_instances_meta, register_lvis_instances
+from .pascal_voc import register_pascal_voc
+
+# ==== Predefined datasets and splits for COCO ==========
+
+_PREDEFINED_SPLITS_COCO = {}
+_PREDEFINED_SPLITS_COCO["coco"] = {
+ "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"),
+ "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"),
+ "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"),
+ "coco_2014_valminusminival": (
+ "coco/val2014",
+ "coco/annotations/instances_valminusminival2014.json",
+ ),
+ "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"),
+ "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"),
+ "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"),
+ "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"),
+ "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"),
+}
+
+_PREDEFINED_SPLITS_COCO["coco_person"] = {
+ "keypoints_coco_2014_train": (
+ "coco/train2014",
+ "coco/annotations/person_keypoints_train2014.json",
+ ),
+ "keypoints_coco_2014_val": ("coco/val2014", "coco/annotations/person_keypoints_val2014.json"),
+ "keypoints_coco_2014_minival": (
+ "coco/val2014",
+ "coco/annotations/person_keypoints_minival2014.json",
+ ),
+ "keypoints_coco_2014_valminusminival": (
+ "coco/val2014",
+ "coco/annotations/person_keypoints_valminusminival2014.json",
+ ),
+ "keypoints_coco_2017_train": (
+ "coco/train2017",
+ "coco/annotations/person_keypoints_train2017.json",
+ ),
+ "keypoints_coco_2017_val": ("coco/val2017", "coco/annotations/person_keypoints_val2017.json"),
+ "keypoints_coco_2017_val_100": (
+ "coco/val2017",
+ "coco/annotations/person_keypoints_val2017_100.json",
+ ),
+}
+
+
+_PREDEFINED_SPLITS_COCO_PANOPTIC = {
+ "coco_2017_train_panoptic": (
+ # This is the original panoptic annotation directory
+ "coco/panoptic_train2017",
+ "coco/annotations/panoptic_train2017.json",
+ # This directory contains semantic annotations that are
+ # converted from panoptic annotations.
+ # It is used by PanopticFPN.
+ # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
+ # to create these directories.
+ "coco/panoptic_stuff_train2017",
+ ),
+ "coco_2017_val_panoptic": (
+ "coco/panoptic_val2017",
+ "coco/annotations/panoptic_val2017.json",
+ "coco/panoptic_stuff_val2017",
+ ),
+ "coco_2017_val_100_panoptic": (
+ "coco/panoptic_val2017_100",
+ "coco/annotations/panoptic_val2017_100.json",
+ "coco/panoptic_stuff_val2017_100",
+ ),
+}
+
+
+def register_all_coco(root):
+ for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items():
+ for key, (image_root, json_file) in splits_per_dataset.items():
+ # Assume pre-defined datasets live in `./datasets`.
+ register_coco_instances(
+ key,
+ _get_builtin_metadata(dataset_name),
+ os.path.join(root, json_file) if "://" not in json_file else json_file,
+ os.path.join(root, image_root),
+ )
+
+ for (
+ prefix,
+ (panoptic_root, panoptic_json, semantic_root),
+ ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
+ prefix_instances = prefix[: -len("_panoptic")]
+ instances_meta = MetadataCatalog.get(prefix_instances)
+ image_root, instances_json = instances_meta.image_root, instances_meta.json_file
+ # The "separated" version of COCO panoptic segmentation dataset,
+ # e.g. used by Panoptic FPN
+ register_coco_panoptic_separated(
+ prefix,
+ _get_builtin_metadata("coco_panoptic_separated"),
+ image_root,
+ os.path.join(root, panoptic_root),
+ os.path.join(root, panoptic_json),
+ os.path.join(root, semantic_root),
+ instances_json,
+ )
+ # The "standard" version of COCO panoptic segmentation dataset,
+ # e.g. used by Panoptic-DeepLab
+ register_coco_panoptic(
+ prefix,
+ _get_builtin_metadata("coco_panoptic_standard"),
+ image_root,
+ os.path.join(root, panoptic_root),
+ os.path.join(root, panoptic_json),
+ instances_json,
+ )
+
+
+# ==== Predefined datasets and splits for LVIS ==========
+
+
+_PREDEFINED_SPLITS_LVIS = {
+ "lvis_v1": {
+ "lvis_v1_train": ("coco/", "lvis/lvis_v1_train.json"),
+ "lvis_v1_val": ("coco/", "lvis/lvis_v1_val.json"),
+ "lvis_v1_test_dev": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"),
+ "lvis_v1_test_challenge": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"),
+ },
+ "lvis_v0.5": {
+ "lvis_v0.5_train": ("coco/", "lvis/lvis_v0.5_train.json"),
+ "lvis_v0.5_val": ("coco/", "lvis/lvis_v0.5_val.json"),
+ "lvis_v0.5_val_rand_100": ("coco/", "lvis/lvis_v0.5_val_rand_100.json"),
+ "lvis_v0.5_test": ("coco/", "lvis/lvis_v0.5_image_info_test.json"),
+ },
+ "lvis_v0.5_cocofied": {
+ "lvis_v0.5_train_cocofied": ("coco/", "lvis/lvis_v0.5_train_cocofied.json"),
+ "lvis_v0.5_val_cocofied": ("coco/", "lvis/lvis_v0.5_val_cocofied.json"),
+ },
+}
+
+
+def register_all_lvis(root):
+ for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
+ for key, (image_root, json_file) in splits_per_dataset.items():
+ register_lvis_instances(
+ key,
+ get_lvis_instances_meta(dataset_name),
+ os.path.join(root, json_file) if "://" not in json_file else json_file,
+ os.path.join(root, image_root),
+ )
+
+
+# ==== Predefined splits for raw cityscapes images ===========
+_RAW_CITYSCAPES_SPLITS = {
+ "cityscapes_fine_{task}_train": ("cityscapes/leftImg8bit/train/", "cityscapes/gtFine/train/"),
+ "cityscapes_fine_{task}_val": ("cityscapes/leftImg8bit/val/", "cityscapes/gtFine/val/"),
+ "cityscapes_fine_{task}_test": ("cityscapes/leftImg8bit/test/", "cityscapes/gtFine/test/"),
+}
+
+
+def register_all_cityscapes(root):
+ for key, (image_dir, gt_dir) in _RAW_CITYSCAPES_SPLITS.items():
+ meta = _get_builtin_metadata("cityscapes")
+ image_dir = os.path.join(root, image_dir)
+ gt_dir = os.path.join(root, gt_dir)
+
+ inst_key = key.format(task="instance_seg")
+ DatasetCatalog.register(
+ inst_key,
+ lambda x=image_dir, y=gt_dir: load_cityscapes_instances(
+ x, y, from_json=True, to_polygons=True
+ ),
+ )
+ MetadataCatalog.get(inst_key).set(
+ image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_instance", **meta
+ )
+
+ sem_key = key.format(task="sem_seg")
+ DatasetCatalog.register(
+ sem_key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y)
+ )
+ MetadataCatalog.get(sem_key).set(
+ image_dir=image_dir,
+ gt_dir=gt_dir,
+ evaluator_type="cityscapes_sem_seg",
+ ignore_label=255,
+ **meta,
+ )
+
+
+# ==== Predefined splits for PASCAL VOC ===========
+def register_all_pascal_voc(root):
+ SPLITS = [
+ ("voc_2007_trainval", "VOC2007", "trainval"),
+ ("voc_2007_train", "VOC2007", "train"),
+ ("voc_2007_val", "VOC2007", "val"),
+ ("voc_2007_test", "VOC2007", "test"),
+ ("voc_2012_trainval", "VOC2012", "trainval"),
+ ("voc_2012_train", "VOC2012", "train"),
+ ("voc_2012_val", "VOC2012", "val"),
+ ]
+ for name, dirname, split in SPLITS:
+ year = 2007 if "2007" in name else 2012
+ register_pascal_voc(name, os.path.join(root, dirname), split, year)
+ MetadataCatalog.get(name).evaluator_type = "pascal_voc"
+
+
+def register_all_ade20k(root):
+ root = os.path.join(root, "ADEChallengeData2016")
+ for name, dirname in [("train", "training"), ("val", "validation")]:
+ image_dir = os.path.join(root, "images", dirname)
+ gt_dir = os.path.join(root, "annotations_detectron2", dirname)
+ name = f"ade20k_sem_seg_{name}"
+ DatasetCatalog.register(
+ name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg")
+ )
+ MetadataCatalog.get(name).set(
+ stuff_classes=ADE20K_SEM_SEG_CATEGORIES[:],
+ image_root=image_dir,
+ sem_seg_root=gt_dir,
+ evaluator_type="sem_seg",
+ ignore_label=255,
+ )
+
+
+# True for open source;
+# Internally at fb, we register them elsewhere
+if __name__.endswith(".builtin"):
+ # Assume pre-defined datasets live in `./datasets`.
+ _root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets"))
+ register_all_coco(_root)
+ register_all_lvis(_root)
+ register_all_cityscapes(_root)
+ register_all_cityscapes_panoptic(_root)
+ register_all_pascal_voc(_root)
+ register_all_ade20k(_root)
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/builtin_meta.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/builtin_meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf2218907e3f285b8fc305cd9c96cdff94f76778
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/builtin_meta.py
@@ -0,0 +1,350 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+Note:
+For your custom dataset, there is no need to hard-code metadata anywhere in the code.
+For example, for a COCO-format dataset, metadata will be obtained automatically
+when calling `load_coco_json`. For other datasets, metadata may also be obtained in other ways
+during loading.
+
+However, we hard-code metadata for a few common datasets here.
+The only goal is to allow users who don't have these datasets to use pre-trained models.
+Users don't have to download a COCO json (which contains metadata) in order to visualize a
+COCO model (with correct class names and colors).
+"""
+
+
+# All coco categories, together with their nice-looking visualization colors
+# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
+COCO_CATEGORIES = [
+ {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
+ {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
+ {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
+ {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
+ {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
+ {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
+ {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
+ {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
+ {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
+ {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
+ {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
+ {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
+ {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
+ {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
+ {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
+ {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
+ {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
+ {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
+ {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
+ {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
+ {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
+ {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
+ {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
+ {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
+ {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
+ {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
+ {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
+ {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
+ {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
+ {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
+ {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
+ {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
+ {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
+ {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
+ {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
+ {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
+ {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
+ {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
+ {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
+ {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
+ {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
+ {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
+ {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
+ {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
+ {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
+ {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
+ {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
+ {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
+ {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
+ {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
+ {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
+ {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
+ {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
+ {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
+ {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
+ {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
+ {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
+ {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
+ {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
+ {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
+ {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
+ {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
+ {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
+ {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
+ {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
+ {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
+ {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
+ {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
+ {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
+ {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
+ {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
+ {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
+ {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
+ {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
+ {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
+ {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
+ {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
+ {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
+ {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
+ {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
+ {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"},
+ {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"},
+ {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"},
+ {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"},
+ {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"},
+ {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"},
+ {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"},
+ {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"},
+ {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"},
+ {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"},
+ {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"},
+ {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"},
+ {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"},
+ {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"},
+ {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"},
+ {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"},
+ {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"},
+ {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"},
+ {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"},
+ {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"},
+ {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"},
+ {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"},
+ {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"},
+ {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"},
+ {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"},
+ {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"},
+ {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"},
+ {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"},
+ {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"},
+ {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"},
+ {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"},
+ {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"},
+ {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"},
+ {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"},
+ {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"},
+ {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"},
+ {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"},
+ {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"},
+ {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"},
+ {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"},
+ {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"},
+ {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"},
+ {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"},
+ {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"},
+ {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"},
+ {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"},
+ {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"},
+ {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"},
+ {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"},
+ {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"},
+ {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"},
+ {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"},
+ {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},
+]
+
+# fmt: off
+COCO_PERSON_KEYPOINT_NAMES = (
+ "nose",
+ "left_eye", "right_eye",
+ "left_ear", "right_ear",
+ "left_shoulder", "right_shoulder",
+ "left_elbow", "right_elbow",
+ "left_wrist", "right_wrist",
+ "left_hip", "right_hip",
+ "left_knee", "right_knee",
+ "left_ankle", "right_ankle",
+)
+# fmt: on
+
+# Pairs of keypoints that should be exchanged under horizontal flipping
+COCO_PERSON_KEYPOINT_FLIP_MAP = (
+ ("left_eye", "right_eye"),
+ ("left_ear", "right_ear"),
+ ("left_shoulder", "right_shoulder"),
+ ("left_elbow", "right_elbow"),
+ ("left_wrist", "right_wrist"),
+ ("left_hip", "right_hip"),
+ ("left_knee", "right_knee"),
+ ("left_ankle", "right_ankle"),
+)
+
+# rules for pairs of keypoints to draw a line between, and the line color to use.
+KEYPOINT_CONNECTION_RULES = [
+ # face
+ ("left_ear", "left_eye", (102, 204, 255)),
+ ("right_ear", "right_eye", (51, 153, 255)),
+ ("left_eye", "nose", (102, 0, 204)),
+ ("nose", "right_eye", (51, 102, 255)),
+ # upper-body
+ ("left_shoulder", "right_shoulder", (255, 128, 0)),
+ ("left_shoulder", "left_elbow", (153, 255, 204)),
+ ("right_shoulder", "right_elbow", (128, 229, 255)),
+ ("left_elbow", "left_wrist", (153, 255, 153)),
+ ("right_elbow", "right_wrist", (102, 255, 224)),
+ # lower-body
+ ("left_hip", "right_hip", (255, 102, 0)),
+ ("left_hip", "left_knee", (255, 255, 77)),
+ ("right_hip", "right_knee", (153, 255, 204)),
+ ("left_knee", "left_ankle", (191, 255, 128)),
+ ("right_knee", "right_ankle", (255, 195, 77)),
+]
+
+# All Cityscapes categories, together with their nice-looking visualization colors
+# It's from https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py # noqa
+CITYSCAPES_CATEGORIES = [
+ {"color": (128, 64, 128), "isthing": 0, "id": 7, "trainId": 0, "name": "road"},
+ {"color": (244, 35, 232), "isthing": 0, "id": 8, "trainId": 1, "name": "sidewalk"},
+ {"color": (70, 70, 70), "isthing": 0, "id": 11, "trainId": 2, "name": "building"},
+ {"color": (102, 102, 156), "isthing": 0, "id": 12, "trainId": 3, "name": "wall"},
+ {"color": (190, 153, 153), "isthing": 0, "id": 13, "trainId": 4, "name": "fence"},
+ {"color": (153, 153, 153), "isthing": 0, "id": 17, "trainId": 5, "name": "pole"},
+ {"color": (250, 170, 30), "isthing": 0, "id": 19, "trainId": 6, "name": "traffic light"},
+ {"color": (220, 220, 0), "isthing": 0, "id": 20, "trainId": 7, "name": "traffic sign"},
+ {"color": (107, 142, 35), "isthing": 0, "id": 21, "trainId": 8, "name": "vegetation"},
+ {"color": (152, 251, 152), "isthing": 0, "id": 22, "trainId": 9, "name": "terrain"},
+ {"color": (70, 130, 180), "isthing": 0, "id": 23, "trainId": 10, "name": "sky"},
+ {"color": (220, 20, 60), "isthing": 1, "id": 24, "trainId": 11, "name": "person"},
+ {"color": (255, 0, 0), "isthing": 1, "id": 25, "trainId": 12, "name": "rider"},
+ {"color": (0, 0, 142), "isthing": 1, "id": 26, "trainId": 13, "name": "car"},
+ {"color": (0, 0, 70), "isthing": 1, "id": 27, "trainId": 14, "name": "truck"},
+ {"color": (0, 60, 100), "isthing": 1, "id": 28, "trainId": 15, "name": "bus"},
+ {"color": (0, 80, 100), "isthing": 1, "id": 31, "trainId": 16, "name": "train"},
+ {"color": (0, 0, 230), "isthing": 1, "id": 32, "trainId": 17, "name": "motorcycle"},
+ {"color": (119, 11, 32), "isthing": 1, "id": 33, "trainId": 18, "name": "bicycle"},
+]
+
+# fmt: off
+ADE20K_SEM_SEG_CATEGORIES = [
+ "wall", "building", "sky", "floor", "tree", "ceiling", "road, route", "bed", "window ", "grass", "cabinet", "sidewalk, pavement", "person", "earth, ground", "door", "table", "mountain, mount", "plant", "curtain", "chair", "car", "water", "painting, picture", "sofa", "shelf", "house", "sea", "mirror", "rug", "field", "armchair", "seat", "fence", "desk", "rock, stone", "wardrobe, closet, press", "lamp", "tub", "rail", "cushion", "base, pedestal, stand", "box", "column, pillar", "signboard, sign", "chest of drawers, chest, bureau, dresser", "counter", "sand", "sink", "skyscraper", "fireplace", "refrigerator, icebox", "grandstand, covered stand", "path", "stairs", "runway", "case, display case, showcase, vitrine", "pool table, billiard table, snooker table", "pillow", "screen door, screen", "stairway, staircase", "river", "bridge, span", "bookcase", "blind, screen", "coffee table", "toilet, can, commode, crapper, pot, potty, stool, throne", "flower", "book", "hill", "bench", "countertop", "stove", "palm, palm tree", "kitchen island", "computer", "swivel chair", "boat", "bar", "arcade machine", "hovel, hut, hutch, shack, shanty", "bus", "towel", "light", "truck", "tower", "chandelier", "awning, sunshade, sunblind", "street lamp", "booth", "tv", "plane", "dirt track", "clothes", "pole", "land, ground, soil", "bannister, banister, balustrade, balusters, handrail", "escalator, moving staircase, moving stairway", "ottoman, pouf, pouffe, puff, hassock", "bottle", "buffet, counter, sideboard", "poster, posting, placard, notice, bill, card", "stage", "van", "ship", "fountain", "conveyer belt, conveyor belt, conveyer, conveyor, transporter", "canopy", "washer, automatic washer, washing machine", "plaything, toy", "pool", "stool", "barrel, cask", "basket, handbasket", "falls", "tent", "bag", "minibike, motorbike", "cradle", "oven", "ball", "food, solid food", "step, stair", "tank, storage tank", "trade name", "microwave", "pot", "animal", "bicycle", "lake", "dishwasher", "screen", "blanket, cover", "sculpture", "hood, exhaust hood", "sconce", "vase", "traffic light", "tray", "trash can", "fan", "pier", "crt screen", "plate", "monitor", "bulletin board", "shower", "radiator", "glass, drinking glass", "clock", "flag", # noqa
+]
+# After processed by `prepare_ade20k_sem_seg.py`, id 255 means ignore
+# fmt: on
+
+
+def _get_coco_instances_meta():
+ thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+ thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+ assert len(thing_ids) == 80, len(thing_ids)
+ # Mapping from the non-contiguous COCO category id to an id in [0, 79]
+ thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
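+ # For example, thing id 1 ("person") maps to contiguous id 0 and the last
+ # thing id 90 ("toothbrush") maps to contiguous id 79.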
+ thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+ ret = {
+ "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
+ "thing_classes": thing_classes,
+ "thing_colors": thing_colors,
+ }
+ return ret
+
+
+def _get_coco_panoptic_separated_meta():
+ """
+ Returns metadata for "separated" version of the panoptic segmentation dataset.
+ """
+ stuff_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 0]
+ assert len(stuff_ids) == 53, len(stuff_ids)
+
+ # For semantic segmentation, this maps dataset category ids (used when
+ # processing results) to contiguous stuff ids in [0, 53] (used in models).
+ # The contiguous id 0 is reserved for an extra "thing" category.
+ stuff_dataset_id_to_contiguous_id = {k: i + 1 for i, k in enumerate(stuff_ids)}
+ # When converting COCO panoptic annotations to semantic annotations,
+ # we label the "thing" category as 0.
+ stuff_dataset_id_to_contiguous_id[0] = 0
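+ # e.g. the first stuff id 92 ("banner") becomes contiguous id 1, because 0 is
+ # reserved for the merged "things" category.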
+
+ # 54 names for COCO stuff categories (including "things")
+ stuff_classes = ["things"] + [
+ k["name"].replace("-other", "").replace("-merged", "")
+ for k in COCO_CATEGORIES
+ if k["isthing"] == 0
+ ]
+
+ # NOTE: I randomly picked a color for things
+ stuff_colors = [[82, 18, 128]] + [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 0]
+ ret = {
+ "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
+ "stuff_classes": stuff_classes,
+ "stuff_colors": stuff_colors,
+ }
+ ret.update(_get_coco_instances_meta())
+ return ret
+
+
+def _get_builtin_metadata(dataset_name):
+ if dataset_name == "coco":
+ return _get_coco_instances_meta()
+ if dataset_name == "coco_panoptic_separated":
+ return _get_coco_panoptic_separated_meta()
+ elif dataset_name == "coco_panoptic_standard":
+ meta = {}
+ # The following metadata maps contiguous id from [0, #thing categories +
+ # #stuff categories) to their names and colors. We have two copies of the
+ # same name and color under "thing_*" and "stuff_*" because the current
+ # visualization function in D2 handles thing and stuff classes differently
+ # due to some heuristic used in Panoptic FPN. We keep the same naming to
+ # enable reusing existing visualization functions.
+ thing_classes = [k["name"] for k in COCO_CATEGORIES]
+ thing_colors = [k["color"] for k in COCO_CATEGORIES]
+ stuff_classes = [k["name"] for k in COCO_CATEGORIES]
+ stuff_colors = [k["color"] for k in COCO_CATEGORIES]
+
+ meta["thing_classes"] = thing_classes
+ meta["thing_colors"] = thing_colors
+ meta["stuff_classes"] = stuff_classes
+ meta["stuff_colors"] = stuff_colors
+
+ # Convert category id for training:
+ # category id: like semantic segmentation, it is the class id for each
+ # pixel. Since there are some classes not used in evaluation, the category
+ # id is not always contiguous and thus we have two sets of category ids:
+ # - original category id: category id in the original dataset, mainly
+ # used for evaluation.
+ # - contiguous category id: [0, #classes), in order to train the linear
+ # softmax classifier.
+ thing_dataset_id_to_contiguous_id = {}
+ stuff_dataset_id_to_contiguous_id = {}
+
+ for i, cat in enumerate(COCO_CATEGORIES):
+ if cat["isthing"]:
+ thing_dataset_id_to_contiguous_id[cat["id"]] = i
+ else:
+ stuff_dataset_id_to_contiguous_id[cat["id"]] = i
+
+ meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
+ meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
+
+ return meta
+ elif dataset_name == "coco_person":
+ return {
+ "thing_classes": ["person"],
+ "keypoint_names": COCO_PERSON_KEYPOINT_NAMES,
+ "keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP,
+ "keypoint_connection_rules": KEYPOINT_CONNECTION_RULES,
+ }
+ elif dataset_name == "cityscapes":
+ # fmt: off
+ CITYSCAPES_THING_CLASSES = [
+ "person", "rider", "car", "truck",
+ "bus", "train", "motorcycle", "bicycle",
+ ]
+ CITYSCAPES_STUFF_CLASSES = [
+ "road", "sidewalk", "building", "wall", "fence", "pole", "traffic light",
+ "traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car",
+ "truck", "bus", "train", "motorcycle", "bicycle",
+ ]
+ # fmt: on
+ return {
+ "thing_classes": CITYSCAPES_THING_CLASSES,
+ "stuff_classes": CITYSCAPES_STUFF_CLASSES,
+ }
+ raise KeyError("No built-in metadata for dataset {}".format(dataset_name))
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/cityscapes.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f0d0a88fa1e6aa7bf1ef3c63be89756d4fddc2c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/cityscapes.py
@@ -0,0 +1,329 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import functools
+import json
+import logging
+import multiprocessing as mp
+import numpy as np
+import os
+from itertools import chain
+import custom_pycocotools.mask as mask_util
+from PIL import Image
+
+from custom_detectron2.structures import BoxMode
+from custom_detectron2.utils.comm import get_world_size
+from custom_detectron2.utils.file_io import PathManager
+from custom_detectron2.utils.logger import setup_logger
+
+try:
+ import cv2 # noqa
+except ImportError:
+ # OpenCV is an optional dependency at the moment
+ pass
+
+
+logger = logging.getLogger(__name__)
+
+
+def _get_cityscapes_files(image_dir, gt_dir):
+ files = []
+ # scan through the directory
+ cities = PathManager.ls(image_dir)
+ logger.info(f"{len(cities)} cities found in '{image_dir}'.")
+ for city in cities:
+ city_img_dir = os.path.join(image_dir, city)
+ city_gt_dir = os.path.join(gt_dir, city)
+ for basename in PathManager.ls(city_img_dir):
+ image_file = os.path.join(city_img_dir, basename)
+
+ suffix = "leftImg8bit.png"
+ assert basename.endswith(suffix), basename
+ basename = basename[: -len(suffix)]
+
+ instance_file = os.path.join(city_gt_dir, basename + "gtFine_instanceIds.png")
+ label_file = os.path.join(city_gt_dir, basename + "gtFine_labelIds.png")
+ json_file = os.path.join(city_gt_dir, basename + "gtFine_polygons.json")
+
+ files.append((image_file, instance_file, label_file, json_file))
+ assert len(files), "No images found in {}".format(image_dir)
+ for f in files[0]:
+ assert PathManager.isfile(f), f
+ return files
+
+
+def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True):
+ """
+ Args:
+ image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+ gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+ from_json (bool): whether to read annotations from the raw json file or the png files.
+ to_polygons (bool): whether to represent the segmentation as polygons
+ (COCO's format) instead of masks (cityscapes's format).
+
+ Returns:
+ list[dict]: a list of dicts in Detectron2 standard format. (See
+ `Using Custom Datasets <https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html>`_ )
+ """
+ if from_json:
+ assert to_polygons, (
+ "Cityscapes's json annotations are in polygon format. "
+ "Converting to mask format is not supported now."
+ )
+ files = _get_cityscapes_files(image_dir, gt_dir)
+
+ logger.info("Preprocessing cityscapes annotations ...")
+ # This is still not fast: all workers will execute duplicate work and will
+ # take up to 10 minutes on an 8-GPU server.
+ pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4))
+
+ ret = pool.map(
+ functools.partial(_cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons),
+ files,
+ )
+ logger.info("Loaded {} images from {}".format(len(ret), image_dir))
+
+ # Map cityscape ids to contiguous ids
+ from cityscapesscripts.helpers.labels import labels
+
+ labels = [l for l in labels if l.hasInstances and not l.ignoreInEval]
+ dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(labels)}
+ for dict_per_image in ret:
+ for anno in dict_per_image["annotations"]:
+ anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]]
+ return ret
+
+
+def load_cityscapes_semantic(image_dir, gt_dir):
+ """
+ Args:
+ image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+ gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+
+ Returns:
+ list[dict]: a list of dict, each has "file_name" and
+ "sem_seg_file_name".
+ """
+ ret = []
+ # gt_dir is small and contains many small files; it makes sense to fetch it to a local path first
+ gt_dir = PathManager.get_local_path(gt_dir)
+ for image_file, _, label_file, json_file in _get_cityscapes_files(image_dir, gt_dir):
+ label_file = label_file.replace("labelIds", "labelTrainIds")
+
+ with PathManager.open(json_file, "r") as f:
+ jsonobj = json.load(f)
+ ret.append(
+ {
+ "file_name": image_file,
+ "sem_seg_file_name": label_file,
+ "height": jsonobj["imgHeight"],
+ "width": jsonobj["imgWidth"],
+ }
+ )
+ assert len(ret), f"No images found in {image_dir}!"
+ assert PathManager.isfile(
+ ret[0]["sem_seg_file_name"]
+ ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa
+ return ret
+
+
+def _cityscapes_files_to_dict(files, from_json, to_polygons):
+ """
+ Parse cityscapes annotation files to an instance segmentation dataset dict.
+
+ Args:
+ files (tuple): consists of (image_file, instance_id_file, label_id_file, json_file)
+ from_json (bool): whether to read annotations from the raw json file or the png files.
+ to_polygons (bool): whether to represent the segmentation as polygons
+ (COCO's format) instead of masks (cityscapes's format).
+
+ Returns:
+ A dict in Detectron2 Dataset format.
+ """
+ from cityscapesscripts.helpers.labels import id2label, name2label
+
+ image_file, instance_id_file, _, json_file = files
+
+ annos = []
+
+ if from_json:
+ from shapely.geometry import MultiPolygon, Polygon
+
+ with PathManager.open(json_file, "r") as f:
+ jsonobj = json.load(f)
+ ret = {
+ "file_name": image_file,
+ "image_id": os.path.basename(image_file),
+ "height": jsonobj["imgHeight"],
+ "width": jsonobj["imgWidth"],
+ }
+
+ # `polygons_union` contains the union of all valid polygons.
+ polygons_union = Polygon()
+
+ # CityscapesScripts draws the polygons in sequential order
+ # and each polygon *overwrites* existing ones. See
+ # (https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/json2instanceImg.py) # noqa
+ # We use reverse order, and each polygon *avoids* early ones.
+ # This will resolve the polygon overlaps in the same way as CityscapesScripts.
+ for obj in jsonobj["objects"][::-1]:
+ if "deleted" in obj: # cityscapes data format specific
+ continue
+ label_name = obj["label"]
+
+ try:
+ label = name2label[label_name]
+ except KeyError:
+ if label_name.endswith("group"): # crowd area
+ label = name2label[label_name[: -len("group")]]
+ else:
+ raise
+ if label.id < 0: # cityscapes data format
+ continue
+
+ # Cityscapes' raw annotations use integer coordinates
+ # Therefore +0.5 here
+ poly_coord = np.asarray(obj["polygon"], dtype="f4") + 0.5
+ # CityscapesScript uses PIL.ImageDraw.polygon to rasterize
+ # polygons for evaluation. This function operates in integer space
+ # and draws each pixel whose center falls into the polygon.
+ # Therefore it draws a polygon which is 0.5 "fatter" in expectation.
+ # We therefore dilate the input polygon by 0.5 as our input.
+ poly = Polygon(poly_coord).buffer(0.5, resolution=4)
+
+ if not label.hasInstances or label.ignoreInEval:
+ # even if we won't store the polygon it still contributes to overlaps resolution
+ polygons_union = polygons_union.union(poly)
+ continue
+
+ # Take non-overlapping part of the polygon
+ poly_wo_overlaps = poly.difference(polygons_union)
+ if poly_wo_overlaps.is_empty:
+ continue
+ polygons_union = polygons_union.union(poly)
+
+ anno = {}
+ anno["iscrowd"] = label_name.endswith("group")
+ anno["category_id"] = label.id
+
+ if isinstance(poly_wo_overlaps, Polygon):
+ poly_list = [poly_wo_overlaps]
+ elif isinstance(poly_wo_overlaps, MultiPolygon):
+ poly_list = poly_wo_overlaps.geoms
+ else:
+ raise NotImplementedError("Unknown geometric structure {}".format(poly_wo_overlaps))
+
+ poly_coord = []
+ for poly_el in poly_list:
+ # COCO API can work only with exterior boundaries now, hence we store only them.
+ # TODO: store both exterior and interior boundaries once other parts of the
+ # codebase support holes in polygons.
+ poly_coord.append(list(chain(*poly_el.exterior.coords)))
+ anno["segmentation"] = poly_coord
+ (xmin, ymin, xmax, ymax) = poly_wo_overlaps.bounds
+
+ anno["bbox"] = (xmin, ymin, xmax, ymax)
+ anno["bbox_mode"] = BoxMode.XYXY_ABS
+
+ annos.append(anno)
+ else:
+ # See also the official annotation parsing scripts at
+ # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/instances2dict.py # noqa
+ with PathManager.open(instance_id_file, "rb") as f:
+ inst_image = np.asarray(Image.open(f), order="F")
+ # ids < 24 are stuff labels (filtering them first is about 5% faster)
+ flattened_ids = np.unique(inst_image[inst_image >= 24])
+
+ ret = {
+ "file_name": image_file,
+ "image_id": os.path.basename(image_file),
+ "height": inst_image.shape[0],
+ "width": inst_image.shape[1],
+ }
+
+ for instance_id in flattened_ids:
+ # For non-crowd annotations, instance_id // 1000 is the label_id
+ # Crowd annotations have <1000 instance ids
+ label_id = instance_id // 1000 if instance_id >= 1000 else instance_id
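+ # e.g. a pixel value of 26003 denotes instance 3 of label 26 ("car"), while a
+ # plain value of 26 marks a crowd ("cargroup") region of the same label.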
+ label = id2label[label_id]
+ if not label.hasInstances or label.ignoreInEval:
+ continue
+
+ anno = {}
+ anno["iscrowd"] = instance_id < 1000
+ anno["category_id"] = label.id
+
+ mask = np.asarray(inst_image == instance_id, dtype=np.uint8, order="F")
+
+ inds = np.nonzero(mask)
+ ymin, ymax = inds[0].min(), inds[0].max()
+ xmin, xmax = inds[1].min(), inds[1].max()
+ anno["bbox"] = (xmin, ymin, xmax, ymax)
+ if xmax <= xmin or ymax <= ymin:
+ continue
+ anno["bbox_mode"] = BoxMode.XYXY_ABS
+ if to_polygons:
+ # This conversion comes from D4809743 and D5171122,
+ # when Mask-RCNN was first developed.
+ contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[
+ -2
+ ]
+ polygons = [c.reshape(-1).tolist() for c in contours if len(c) >= 3]
+ # opencv can produce invalid polygons
+ if len(polygons) == 0:
+ continue
+ anno["segmentation"] = polygons
+ else:
+ anno["segmentation"] = mask_util.encode(mask[:, :, None])[0]
+ annos.append(anno)
+ ret["annotations"] = annos
+ return ret
+
+
+if __name__ == "__main__":
+ """
+ Test the cityscapes dataset loader.
+
+ Usage:
+ python -m detectron2.data.datasets.cityscapes \
+ cityscapes/leftImg8bit/train cityscapes/gtFine/train
+ """
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("image_dir")
+ parser.add_argument("gt_dir")
+ parser.add_argument("--type", choices=["instance", "semantic"], default="instance")
+ args = parser.parse_args()
+ from custom_detectron2.data.catalog import Metadata
+ from custom_detectron2.utils.visualizer import Visualizer
+ from cityscapesscripts.helpers.labels import labels
+
+ logger = setup_logger(name=__name__)
+
+ dirname = "cityscapes-data-vis"
+ os.makedirs(dirname, exist_ok=True)
+
+ if args.type == "instance":
+ dicts = load_cityscapes_instances(
+ args.image_dir, args.gt_dir, from_json=True, to_polygons=True
+ )
+ logger.info("Done loading {} samples.".format(len(dicts)))
+
+ thing_classes = [k.name for k in labels if k.hasInstances and not k.ignoreInEval]
+ meta = Metadata().set(thing_classes=thing_classes)
+
+ else:
+ dicts = load_cityscapes_semantic(args.image_dir, args.gt_dir)
+ logger.info("Done loading {} samples.".format(len(dicts)))
+
+ stuff_classes = [k.name for k in labels if k.trainId != 255]
+ stuff_colors = [k.color for k in labels if k.trainId != 255]
+ meta = Metadata().set(stuff_classes=stuff_classes, stuff_colors=stuff_colors)
+
+ for d in dicts:
+ img = np.array(Image.open(PathManager.open(d["file_name"], "rb")))
+ visualizer = Visualizer(img, metadata=meta)
+ vis = visualizer.draw_dataset_dict(d)
+ # cv2.imshow("a", vis.get_image()[:, :, ::-1])
+ # cv2.waitKey()
+ fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+ vis.save(fpath)
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/cityscapes_panoptic.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/cityscapes_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb6035b0afaa8357a410d1a2cdeb34796dafecb6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/cityscapes_panoptic.py
@@ -0,0 +1,187 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import json
+import logging
+import os
+
+from custom_detectron2.data import DatasetCatalog, MetadataCatalog
+from custom_detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES
+from custom_detectron2.utils.file_io import PathManager
+
+"""
+This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog.
+"""
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info):
+ files = []
+ # scan through the directory
+ cities = PathManager.ls(image_dir)
+ logger.info(f"{len(cities)} cities found in '{image_dir}'.")
+ image_dict = {}
+ for city in cities:
+ city_img_dir = os.path.join(image_dir, city)
+ for basename in PathManager.ls(city_img_dir):
+ image_file = os.path.join(city_img_dir, basename)
+
+ suffix = "_leftImg8bit.png"
+ assert basename.endswith(suffix), basename
+ basename = os.path.basename(basename)[: -len(suffix)]
+
+ image_dict[basename] = image_file
+
+ for ann in json_info["annotations"]:
+ image_file = image_dict.get(ann["image_id"], None)
+ assert image_file is not None, "No image {} found for annotation {}".format(
+ ann["image_id"], ann["file_name"]
+ )
+ label_file = os.path.join(gt_dir, ann["file_name"])
+ segments_info = ann["segments_info"]
+
+ files.append((image_file, label_file, segments_info))
+
+ assert len(files), "No images found in {}".format(image_dir)
+ assert PathManager.isfile(files[0][0]), files[0][0]
+ assert PathManager.isfile(files[0][1]), files[0][1]
+ return files
+
+
+def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta):
+ """
+ Args:
+ image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+ gt_dir (str): path to the raw annotations. e.g.,
+ "~/cityscapes/gtFine/cityscapes_panoptic_train".
+ gt_json (str): path to the json file. e.g.,
+ "~/cityscapes/gtFine/cityscapes_panoptic_train.json".
+ meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id"
+ and "stuff_dataset_id_to_contiguous_id" to map category ids to
+ contiguous ids for training.
+
+ Returns:
+ list[dict]: a list of dicts in Detectron2 standard format. (See
+ `Using Custom Datasets <https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html>`_ )
+ """
+
+ def _convert_category_id(segment_info, meta):
+ if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
+ segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
+ segment_info["category_id"]
+ ]
+ else:
+ segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
+ segment_info["category_id"]
+ ]
+ return segment_info
+
+ assert os.path.exists(
+ gt_json
+ ), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files." # noqa
+ with open(gt_json) as f:
+ json_info = json.load(f)
+ files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info)
+ ret = []
+ for image_file, label_file, segments_info in files:
+ sem_label_file = (
+ image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png"
+ )
+ segments_info = [_convert_category_id(x, meta) for x in segments_info]
+ ret.append(
+ {
+ "file_name": image_file,
+ "image_id": "_".join(
+ os.path.splitext(os.path.basename(image_file))[0].split("_")[:3]
+ ),
+ "sem_seg_file_name": sem_label_file,
+ "pan_seg_file_name": label_file,
+ "segments_info": segments_info,
+ }
+ )
+ assert len(ret), f"No images found in {image_dir}!"
+ assert PathManager.isfile(
+ ret[0]["sem_seg_file_name"]
+ ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa
+ assert PathManager.isfile(
+ ret[0]["pan_seg_file_name"]
+ ), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py" # noqa
+ return ret
+
+
+_RAW_CITYSCAPES_PANOPTIC_SPLITS = {
+ "cityscapes_fine_panoptic_train": (
+ "cityscapes/leftImg8bit/train",
+ "cityscapes/gtFine/cityscapes_panoptic_train",
+ "cityscapes/gtFine/cityscapes_panoptic_train.json",
+ ),
+ "cityscapes_fine_panoptic_val": (
+ "cityscapes/leftImg8bit/val",
+ "cityscapes/gtFine/cityscapes_panoptic_val",
+ "cityscapes/gtFine/cityscapes_panoptic_val.json",
+ ),
+ # "cityscapes_fine_panoptic_test": not supported yet
+}
+
+
+def register_all_cityscapes_panoptic(root):
+ meta = {}
+ # The following metadata maps contiguous id from [0, #thing categories +
+ # #stuff categories) to their names and colors. We have two copies of the
+ # same name and color under "thing_*" and "stuff_*" because the current
+ # visualization function in D2 handles thing and stuff classes differently
+ # due to some heuristic used in Panoptic FPN. We keep the same naming to
+ # enable reusing existing visualization functions.
+ thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
+ thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
+ stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
+ stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
+
+ meta["thing_classes"] = thing_classes
+ meta["thing_colors"] = thing_colors
+ meta["stuff_classes"] = stuff_classes
+ meta["stuff_colors"] = stuff_colors
+
+ # There are three types of ids in cityscapes panoptic segmentation:
+ # (1) category id: like semantic segmentation, it is the class id for each
+ # pixel. Since there are some classes not used in evaluation, the category
+ # id is not always contiguous and thus we have two sets of category ids:
+ # - original category id: category id in the original dataset, mainly
+ # used for evaluation.
+ # - contiguous category id: [0, #classes), in order to train the classifier
+ # (2) instance id: this id is used to differentiate different instances from
+ # the same category. For "stuff" classes, the instance id is always 0; for
+ # "thing" classes, the instance id starts from 1 and 0 is reserved for
+ # ignored instances (e.g. crowd annotation).
+ # (3) panoptic id: this is the compact id that encodes both category and
+ # instance id by: category_id * 1000 + instance_id.
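+ # e.g. panoptic id 26003 encodes category id 26 ("car") with instance id 3,
+ # and 23000 encodes the stuff category 23 ("sky") with instance id 0.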
+ thing_dataset_id_to_contiguous_id = {}
+ stuff_dataset_id_to_contiguous_id = {}
+
+ for k in CITYSCAPES_CATEGORIES:
+ if k["isthing"] == 1:
+ thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
+ else:
+ stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
+
+ meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
+ meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
+
+ for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items():
+ image_dir = os.path.join(root, image_dir)
+ gt_dir = os.path.join(root, gt_dir)
+ gt_json = os.path.join(root, gt_json)
+
+ DatasetCatalog.register(
+ key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta)
+ )
+ MetadataCatalog.get(key).set(
+ panoptic_root=gt_dir,
+ image_root=image_dir,
+ panoptic_json=gt_json,
+ gt_dir=gt_dir.replace("cityscapes_panoptic_", ""),
+ evaluator_type="cityscapes_panoptic_seg",
+ ignore_label=255,
+ label_divisor=1000,
+ **meta,
+ )
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/coco.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a769ba88f8aa422a012c1317ad3a61647b0e0cca
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/coco.py
@@ -0,0 +1,539 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import datetime
+import io
+import json
+import logging
+import numpy as np
+import os
+import shutil
+import custom_pycocotools.mask as mask_util
+from fvcore.common.timer import Timer
+from iopath.common.file_io import file_lock
+from PIL import Image
+
+from custom_detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
+from custom_detectron2.utils.file_io import PathManager
+
+from .. import DatasetCatalog, MetadataCatalog
+
+"""
+This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format".
+"""
+
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_coco_json", "load_sem_seg", "convert_to_coco_json", "register_coco_instances"]
+
+
+def load_coco_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
+ """
+ Load a json file with COCO's instances annotation format.
+ Currently supports instance detection, instance segmentation,
+ and person keypoints annotations.
+
+ Args:
+ json_file (str): full path to the json file in COCO instances annotation format.
+ image_root (str or path-like): the directory where the images in this json file exists.
+ dataset_name (str or None): the name of the dataset (e.g., coco_2017_train).
+ When provided, this function will also do the following:
+
+ * Put "thing_classes" into the metadata associated with this dataset.
+ * Map the category ids into a contiguous range (needed by standard dataset format),
+ and add "thing_dataset_id_to_contiguous_id" to the metadata associated
+ with this dataset.
+
+ This option should usually be provided, unless users need to load
+ the original json content and apply more processing manually.
+ extra_annotation_keys (list[str]): list of per-annotation keys that should also be
+ loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints",
+ "category_id", "segmentation"). The values for these keys will be returned as-is.
+ For example, the densepose annotations are loaded in this way.
+
+ Returns:
+ list[dict]: a list of dicts in Detectron2 standard dataset dicts format (See
+ `Using Custom Datasets `_ ) when `dataset_name` is not None.
+ If `dataset_name` is None, the returned `category_ids` may be
+ incontiguous and may not conform to the Detectron2 standard format.
+
+ Notes:
+ 1. This function does not read the image files.
+ The results do not have the "image" field.
+ """
+ from custom_pycocotools.coco import COCO
+
+ timer = Timer()
+ json_file = PathManager.get_local_path(json_file)
+ with contextlib.redirect_stdout(io.StringIO()):
+ coco_api = COCO(json_file)
+ if timer.seconds() > 1:
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+
+ id_map = None
+ if dataset_name is not None:
+ meta = MetadataCatalog.get(dataset_name)
+ cat_ids = sorted(coco_api.getCatIds())
+ cats = coco_api.loadCats(cat_ids)
+ # The categories in a custom json file may not be sorted.
+ thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
+ meta.thing_classes = thing_classes
+
+ # In COCO, certain category ids are artificially removed,
+ # and by convention they are always ignored.
+ # We deal with COCO's id issue and translate
+ # the category ids to contiguous ids in [0, 80).
+
+ # It works by looking at the "categories" field in the json, therefore
+ # if a user's own json also has non-contiguous ids, we'll
+ # apply this mapping as well but print a warning.
+ if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
+ if "coco" not in dataset_name:
+ logger.warning(
+ """
+Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
+"""
+ )
+ id_map = {v: i for i, v in enumerate(cat_ids)}
+ meta.thing_dataset_id_to_contiguous_id = id_map
+
+ # sort indices for reproducible results
+ img_ids = sorted(coco_api.imgs.keys())
+ # imgs is a list of dicts, each looks something like:
+ # {'license': 4,
+ # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+ # 'file_name': 'COCO_val2014_000000001268.jpg',
+ # 'height': 427,
+ # 'width': 640,
+ # 'date_captured': '2013-11-17 05:57:24',
+ # 'id': 1268}
+ imgs = coco_api.loadImgs(img_ids)
+ # anns is a list[list[dict]], where each dict is an annotation
+ # record for an object. The inner list enumerates the objects in an image
+ # and the outer list enumerates over images. Example of anns[0]:
+ # [{'segmentation': [[192.81,
+ # 247.09,
+ # ...
+ # 219.03,
+ # 249.06]],
+ # 'area': 1035.749,
+ # 'iscrowd': 0,
+ # 'image_id': 1268,
+ # 'bbox': [192.81, 224.8, 74.73, 33.43],
+ # 'category_id': 16,
+ # 'id': 42986},
+ # ...]
+ anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+ total_num_valid_anns = sum([len(x) for x in anns])
+ total_num_anns = len(coco_api.anns)
+ if total_num_valid_anns < total_num_anns:
+ logger.warning(
+ f"{json_file} contains {total_num_anns} annotations, but only "
+ f"{total_num_valid_anns} of them match to images in the file."
+ )
+
+ if "minival" not in json_file:
+ # The popular valminusminival & minival annotations for COCO2014 contain this bug.
+ # However the ratio of buggy annotations there is tiny and does not affect accuracy.
+ # Therefore we explicitly white-list them.
+ ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+ assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+ json_file
+ )
+
+ imgs_anns = list(zip(imgs, anns))
+ logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file))
+
+ dataset_dicts = []
+
+ ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or [])
+
+ num_instances_without_valid_segmentation = 0
+
+ for (img_dict, anno_dict_list) in imgs_anns:
+ record = {}
+ record["file_name"] = os.path.join(image_root, img_dict["file_name"])
+ record["height"] = img_dict["height"]
+ record["width"] = img_dict["width"]
+ image_id = record["image_id"] = img_dict["id"]
+
+ objs = []
+ for anno in anno_dict_list:
+ # Check that the image_id in this annotation is the same as
+ # the image_id we're looking at.
+ # This fails only when the data parsing logic or the annotation file is buggy.
+
+ # The original COCO valminusminival2014 & minival2014 annotation files
+ # actually contains bugs that, together with certain ways of using COCO API,
+ # can trigger this assertion.
+ assert anno["image_id"] == image_id
+
+ assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.'
+
+ obj = {key: anno[key] for key in ann_keys if key in anno}
+ if "bbox" in obj and len(obj["bbox"]) == 0:
+ raise ValueError(
+ f"One annotation of image {image_id} contains empty 'bbox' value! "
+ "This json does not have valid COCO format."
+ )
+
+ segm = anno.get("segmentation", None)
+ if segm: # either list[list[float]] or dict(RLE)
+ if isinstance(segm, dict):
+ if isinstance(segm["counts"], list):
+ # convert to compressed RLE
+ segm = mask_util.frPyObjects(segm, *segm["size"])
+ else:
+ # filter out invalid polygons (< 3 points)
+ segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+ if len(segm) == 0:
+ num_instances_without_valid_segmentation += 1
+ continue # ignore this instance
+ obj["segmentation"] = segm
+
+ keypts = anno.get("keypoints", None)
+ if keypts: # list[int]
+ for idx, v in enumerate(keypts):
+ if idx % 3 != 2:
+ # COCO's segmentation coordinates are floating points in [0, H or W],
+ # but keypoint coordinates are integers in [0, H-1 or W-1]
+ # Therefore we assume the coordinates are "pixel indices" and
+ # add 0.5 to convert to floating point coordinates.
+ keypts[idx] = v + 0.5
+ obj["keypoints"] = keypts
+
+ obj["bbox_mode"] = BoxMode.XYWH_ABS
+ if id_map:
+ annotation_category_id = obj["category_id"]
+ try:
+ obj["category_id"] = id_map[annotation_category_id]
+ except KeyError as e:
+ raise KeyError(
+ f"Encountered category_id={annotation_category_id} "
+ "but this id does not exist in 'categories' of the json file."
+ ) from e
+ objs.append(obj)
+ record["annotations"] = objs
+ dataset_dicts.append(record)
+
+ if num_instances_without_valid_segmentation > 0:
+ logger.warning(
+ "Filtered out {} instances without valid segmentation. ".format(
+ num_instances_without_valid_segmentation
+ )
+ + "There might be issues in your dataset generation process. Please "
+ "check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully"
+ )
+ return dataset_dicts
+
+
+def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
+ """
+ Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are
+ treated as ground truth annotations and all files under "image_root" with "image_ext" extension
+ as input images. Ground truth and input images are matched using file paths relative to
+ "gt_root" and "image_root" respectively without taking into account file extensions.
+ This works for COCO as well as some other datasets.
+
+ Args:
+ gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation
+ annotations are stored as images with integer values in pixels that represent
+ corresponding semantic labels.
+ image_root (str): the directory where the input images are.
+ gt_ext (str): file extension for ground truth annotations.
+ image_ext (str): file extension for input images.
+
+ Returns:
+ list[dict]:
+ a list of dicts in detectron2 standard format without instance-level
+ annotation.
+
+ Notes:
+ 1. This function does not read the image and ground truth files.
+ The results do not have the "image" and "sem_seg" fields.
+ """
+
+ # We match input images with ground truth based on their relative filepaths (without file
+ # extensions) starting from 'image_root' and 'gt_root' respectively.
+ def file2id(folder_path, file_path):
+ # extract relative path starting from `folder_path`
+ image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
+ # remove file extension
+ image_id = os.path.splitext(image_id)[0]
+ return image_id
+
+ input_files = sorted(
+ (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
+ key=lambda file_path: file2id(image_root, file_path),
+ )
+ gt_files = sorted(
+ (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
+ key=lambda file_path: file2id(gt_root, file_path),
+ )
+
+ assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
+
+ # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
+ if len(input_files) != len(gt_files):
+ logger.warning(
+ "Directories {} and {} have {} and {} files, respectively.".format(
+ image_root, gt_root, len(input_files), len(gt_files)
+ )
+ )
+ input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
+ gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
+ intersect = list(set(input_basenames) & set(gt_basenames))
+ # sort, otherwise each worker may obtain a list[dict] in different order
+ intersect = sorted(intersect)
+ logger.warn("Will use their intersection of {} files.".format(len(intersect)))
+ input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
+ gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
+
+ logger.info(
+ "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root)
+ )
+
+ dataset_dicts = []
+ for (img_path, gt_path) in zip(input_files, gt_files):
+ record = {}
+ record["file_name"] = img_path
+ record["sem_seg_file_name"] = gt_path
+ dataset_dicts.append(record)
+
+ return dataset_dicts
+
+
+def convert_to_coco_dict(dataset_name):
+ """
+ Convert an instance detection/segmentation or keypoint detection dataset
+ in detectron2's standard format into COCO json format.
+
+ Generic dataset description can be found here:
+ https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset
+
+ COCO data format description can be found here:
+ http://cocodataset.org/#format-data
+
+ Args:
+ dataset_name (str):
+ name of the source dataset
+ Must be registered in DatasetCatalog and in detectron2's standard format.
+ Must have corresponding metadata "thing_classes"
+ Returns:
+ coco_dict: serializable dict in COCO json format
+ """
+
+ dataset_dicts = DatasetCatalog.get(dataset_name)
+ metadata = MetadataCatalog.get(dataset_name)
+
+ # unmap the category mapping ids for COCO
+ if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
+ reverse_id_mapping = {v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()}
+ reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id] # noqa
+ else:
+ reverse_id_mapper = lambda contiguous_id: contiguous_id # noqa
+
+ categories = [
+ {"id": reverse_id_mapper(id), "name": name}
+ for id, name in enumerate(metadata.thing_classes)
+ ]
+
+ logger.info("Converting dataset dicts into COCO format")
+ coco_images = []
+ coco_annotations = []
+
+ for image_id, image_dict in enumerate(dataset_dicts):
+ coco_image = {
+ "id": image_dict.get("image_id", image_id),
+ "width": int(image_dict["width"]),
+ "height": int(image_dict["height"]),
+ "file_name": str(image_dict["file_name"]),
+ }
+ coco_images.append(coco_image)
+
+ anns_per_image = image_dict.get("annotations", [])
+ for annotation in anns_per_image:
+ # create a new dict with only COCO fields
+ coco_annotation = {}
+
+ # COCO requirement: XYWH box format for axis-aligned boxes and XYWHA for rotated ones
+ bbox = annotation["bbox"]
+ if isinstance(bbox, np.ndarray):
+ if bbox.ndim != 1:
+ raise ValueError(f"bbox has to be 1-dimensional. Got shape={bbox.shape}.")
+ bbox = bbox.tolist()
+ if len(bbox) not in [4, 5]:
+ raise ValueError(f"bbox has to has length 4 or 5. Got {bbox}.")
+ from_bbox_mode = annotation["bbox_mode"]
+ to_bbox_mode = BoxMode.XYWH_ABS if len(bbox) == 4 else BoxMode.XYWHA_ABS
+ bbox = BoxMode.convert(bbox, from_bbox_mode, to_bbox_mode)
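+ # e.g. a box given in XYXY_ABS as [10.0, 20.0, 30.0, 60.0] becomes
+ # [10.0, 20.0, 20.0, 40.0] (x, y, width, height) in XYWH_ABS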
+
+ # COCO requirement: instance area
+ if "segmentation" in annotation:
+ # Computing areas for instances by counting the pixels
+ segmentation = annotation["segmentation"]
+ # TODO: check segmentation type: RLE, BinaryMask or Polygon
+ if isinstance(segmentation, list):
+ polygons = PolygonMasks([segmentation])
+ area = polygons.area()[0].item()
+ elif isinstance(segmentation, dict): # RLE
+ area = mask_util.area(segmentation).item()
+ else:
+ raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
+ else:
+ # Computing areas using bounding boxes
+ if to_bbox_mode == BoxMode.XYWH_ABS:
+ bbox_xy = BoxMode.convert(bbox, to_bbox_mode, BoxMode.XYXY_ABS)
+ area = Boxes([bbox_xy]).area()[0].item()
+ else:
+ area = RotatedBoxes([bbox]).area()[0].item()
+
+ if "keypoints" in annotation:
+ keypoints = annotation["keypoints"] # list[int]
+ for idx, v in enumerate(keypoints):
+ if idx % 3 != 2:
+ # COCO's segmentation coordinates are floating points in [0, H or W],
+ # but keypoint coordinates are integers in [0, H-1 or W-1]
+ # For COCO format consistency we subtract 0.5
+ # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
+ keypoints[idx] = v - 0.5
+ if "num_keypoints" in annotation:
+ num_keypoints = annotation["num_keypoints"]
+ else:
+ num_keypoints = sum(kp > 0 for kp in keypoints[2::3])
+
+ # COCO requirement:
+ # linking annotations to images
+ # "id" field must start with 1
+ coco_annotation["id"] = len(coco_annotations) + 1
+ coco_annotation["image_id"] = coco_image["id"]
+ coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
+ coco_annotation["area"] = float(area)
+ coco_annotation["iscrowd"] = int(annotation.get("iscrowd", 0))
+ coco_annotation["category_id"] = int(reverse_id_mapper(annotation["category_id"]))
+
+ # Add optional fields
+ if "keypoints" in annotation:
+ coco_annotation["keypoints"] = keypoints
+ coco_annotation["num_keypoints"] = num_keypoints
+
+ if "segmentation" in annotation:
+ seg = coco_annotation["segmentation"] = annotation["segmentation"]
+ if isinstance(seg, dict): # RLE
+ counts = seg["counts"]
+ if not isinstance(counts, str):
+ # make it json-serializable
+ seg["counts"] = counts.decode("ascii")
+
+ coco_annotations.append(coco_annotation)
+
+ logger.info(
+ "Conversion finished, "
+ f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}"
+ )
+
+ info = {
+ "date_created": str(datetime.datetime.now()),
+ "description": "Automatically generated COCO json file for Detectron2.",
+ }
+ coco_dict = {"info": info, "images": coco_images, "categories": categories, "licenses": None}
+ if len(coco_annotations) > 0:
+ coco_dict["annotations"] = coco_annotations
+ return coco_dict
+
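+# A hedged usage sketch for convert_to_coco_dict (the dataset name "my_dataset_train"
+# is hypothetical; it must already be registered in DatasetCatalog and carry
+# "thing_classes" metadata; this helper is illustrative and never called):
+def _example_convert_to_coco_dict():
+    coco_dict = convert_to_coco_dict("my_dataset_train")
+    # The result is JSON-serializable, with "info", "images", "categories", "licenses"
+    # keys and, when instance annotations exist, an "annotations" key.
+    return coco_dict
+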
+
+def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
+ """
+ Converts dataset into COCO format and saves it to a json file.
+ dataset_name must be registered in DatasetCatalog and in detectron2's standard format.
+
+ Args:
+ dataset_name:
+ reference from the config file to the catalogs
+ must be registered in DatasetCatalog and in detectron2's standard format
+ output_file: path of json file that will be saved to
+ allow_cached: if json file is already present then skip conversion
+ """
+
+ # TODO: The dataset or the conversion script *may* change,
+ # a checksum would be useful for validating the cached data
+
+ PathManager.mkdirs(os.path.dirname(output_file))
+ with file_lock(output_file):
+ if PathManager.exists(output_file) and allow_cached:
+ logger.warning(
+ f"Using previously cached COCO format annotations at '{output_file}'. "
+ "You need to clear the cache file if your dataset has been modified."
+ )
+ else:
+ logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)")
+ coco_dict = convert_to_coco_dict(dataset_name)
+
+ logger.info(f"Caching COCO format annotations at '{output_file}' ...")
+ tmp_file = output_file + ".tmp"
+ with PathManager.open(tmp_file, "w") as f:
+ json.dump(coco_dict, f)
+ shutil.move(tmp_file, output_file)
+
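+# A hedged usage sketch for convert_to_coco_json (hypothetical name and output path;
+# with allow_cached=True an existing output file is reused instead of re-converting):
+def _example_convert_to_coco_json():
+    convert_to_coco_json("my_dataset_train", "output/my_dataset_train_coco.json", allow_cached=True)
+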
+
+def register_coco_instances(name, metadata, json_file, image_root):
+ """
+ Register a dataset in COCO's json annotation format for
+ instance detection, instance segmentation and keypoint detection.
+ (i.e., Type 1 and 2 in http://cocodataset.org/#format-data.
+ `instances*.json` and `person_keypoints*.json` in the dataset).
+
+ This is an example of how to register a new dataset.
+ You can do something similar to this function, to register new datasets.
+
+ Args:
+ name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+ metadata (dict): extra metadata associated with this dataset. You can
+ leave it as an empty dict.
+ json_file (str): path to the json instance annotation file.
+ image_root (str or path-like): directory which contains all the images.
+ """
+ assert isinstance(name, str), name
+ assert isinstance(json_file, (str, os.PathLike)), json_file
+ assert isinstance(image_root, (str, os.PathLike)), image_root
+ # 1. register a function which returns dicts
+ DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name))
+
+ # 2. Optionally, add metadata about this dataset,
+ # since they might be useful in evaluation, visualization or logging
+ MetadataCatalog.get(name).set(
+ json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata
+ )
+
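+# A minimal usage sketch for register_coco_instances; the dataset name and paths are
+# hypothetical placeholders and this helper is never called by the module:
+def _example_register_coco_instances():
+    register_coco_instances(
+        "my_dataset_train",  # hypothetical dataset name
+        {},  # extra metadata; may be empty
+        "datasets/my_dataset/annotations/train.json",  # hypothetical COCO json file
+        "datasets/my_dataset/images/train",  # hypothetical image directory
+    )
+    # The dicts are then produced lazily via DatasetCatalog.get("my_dataset_train").
+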
+
+if __name__ == "__main__":
+ """
+ Test the COCO json dataset loader.
+
+ Usage:
+ python -m detectron2.data.datasets.coco \
+ path/to/json path/to/image_root dataset_name
+
+ "dataset_name" can be "coco_2014_minival_100", or other
+ pre-registered ones
+ """
+ from custom_detectron2.utils.logger import setup_logger
+ from custom_detectron2.utils.visualizer import Visualizer
+ import custom_detectron2.data.datasets # noqa # add pre-defined metadata
+ import sys
+
+ logger = setup_logger(name=__name__)
+ assert sys.argv[3] in DatasetCatalog.list()
+ meta = MetadataCatalog.get(sys.argv[3])
+
+ dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3])
+ logger.info("Done loading {} samples.".format(len(dicts)))
+
+ dirname = "coco-data-vis"
+ os.makedirs(dirname, exist_ok=True)
+ for d in dicts:
+ img = np.array(Image.open(d["file_name"]))
+ visualizer = Visualizer(img, metadata=meta)
+ vis = visualizer.draw_dataset_dict(d)
+ fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+ vis.save(fpath)
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/coco_panoptic.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/coco_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..713633b1719738d72f3a3dc63c0d51fbee6110f8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/coco_panoptic.py
@@ -0,0 +1,228 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import json
+import os
+
+from custom_detectron2.data import DatasetCatalog, MetadataCatalog
+from custom_detectron2.utils.file_io import PathManager
+
+from .coco import load_coco_json, load_sem_seg
+
+__all__ = ["register_coco_panoptic", "register_coco_panoptic_separated"]
+
+
+def load_coco_panoptic_json(json_file, image_dir, gt_dir, meta):
+ """
+ Args:
+ image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
+ gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
+ json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
+
+ Returns:
+ list[dict]: a list of dicts in Detectron2 standard format. (See
+ `Using Custom Datasets <https://detectron2.readthedocs.io/tutorials/datasets.html>`_ )
+ """
+
+ def _convert_category_id(segment_info, meta):
+ if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
+ segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
+ segment_info["category_id"]
+ ]
+ segment_info["isthing"] = True
+ else:
+ segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
+ segment_info["category_id"]
+ ]
+ segment_info["isthing"] = False
+ return segment_info
+
+ with PathManager.open(json_file) as f:
+ json_info = json.load(f)
+
+ ret = []
+ for ann in json_info["annotations"]:
+ image_id = int(ann["image_id"])
+ # TODO: currently we assume image and label have the same filename but
+ # different extension, and images have extension ".jpg" for COCO. Need
+ # to make image extension a user-provided argument if we extend this
+ # function to support other COCO-like datasets.
+ image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
+ label_file = os.path.join(gt_dir, ann["file_name"])
+ segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
+ ret.append(
+ {
+ "file_name": image_file,
+ "image_id": image_id,
+ "pan_seg_file_name": label_file,
+ "segments_info": segments_info,
+ }
+ )
+ assert len(ret), f"No images found in {image_dir}!"
+ assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
+ assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
+ return ret
+
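+# A hedged usage sketch for load_coco_panoptic_json (paths follow the docstring
+# example; the metadata mappings below are hypothetical and must match the dataset's
+# thing/stuff category ids, as required by _convert_category_id above):
+def _example_load_coco_panoptic_json():
+    meta = {
+        "thing_dataset_id_to_contiguous_id": {1: 0},  # hypothetical mapping
+        "stuff_dataset_id_to_contiguous_id": {92: 1},  # hypothetical mapping
+    }
+    return load_coco_panoptic_json(
+        "~/coco/annotations/panoptic_train2017.json",
+        "~/coco/train2017",
+        "~/coco/panoptic_train2017",
+        meta,
+    )
+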
+
+def register_coco_panoptic(
+ name, metadata, image_root, panoptic_root, panoptic_json, instances_json=None
+):
+ """
+ Register a "standard" version of COCO panoptic segmentation dataset named `name`.
+ The dictionaries in this registered dataset follow detectron2's standard format.
+ Hence it's called "standard".
+
+ Args:
+ name (str): the name that identifies a dataset,
+ e.g. "coco_2017_train_panoptic"
+ metadata (dict): extra metadata associated with this dataset.
+ image_root (str): directory which contains all the images
+ panoptic_root (str): directory which contains panoptic annotation images in COCO format
+ panoptic_json (str): path to the json panoptic annotation file in COCO format
+ sem_seg_root (none): not used, to be consistent with
+ `register_coco_panoptic_separated`.
+ instances_json (str): path to the json instance annotation file
+ """
+ panoptic_name = name
+ DatasetCatalog.register(
+ panoptic_name,
+ lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, metadata),
+ )
+ MetadataCatalog.get(panoptic_name).set(
+ panoptic_root=panoptic_root,
+ image_root=image_root,
+ panoptic_json=panoptic_json,
+ json_file=instances_json,
+ evaluator_type="coco_panoptic_seg",
+ ignore_label=255,
+ label_divisor=1000,
+ **metadata,
+ )
+
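+# A minimal usage sketch for register_coco_panoptic (all names and paths are
+# hypothetical; for the lazy loader to succeed, `metadata` should contain the
+# thing/stuff dataset-id-to-contiguous-id maps used by load_coco_panoptic_json):
+def _example_register_coco_panoptic(metadata):
+    register_coco_panoptic(
+        "my_coco_2017_train_panoptic",  # hypothetical dataset name
+        metadata,
+        "datasets/coco/train2017",  # image_root
+        "datasets/coco/panoptic_train2017",  # panoptic_root
+        "datasets/coco/annotations/panoptic_train2017.json",  # panoptic_json
+        instances_json="datasets/coco/annotations/instances_train2017.json",
+    )
+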
+
+def register_coco_panoptic_separated(
+ name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
+):
+ """
+ Register a "separated" version of COCO panoptic segmentation dataset named `name`.
+ The annotations in this registered dataset will contain both instance annotations and
+ semantic annotations, each with its own contiguous ids. Hence it's called "separated".
+
+ It follows the setting used by the PanopticFPN paper:
+
+ 1. The instance annotations directly come from polygons in the COCO
+ instances annotation task, rather than from the masks in the COCO panoptic annotations.
+
+ The two formats have small differences:
+ Polygons in the instance annotations may have overlaps.
+ The mask annotations are produced by labeling the overlapped polygons
+ with depth ordering.
+
+ 2. The semantic annotations are converted from panoptic annotations, where
+ all "things" are assigned a semantic id of 0.
+ All semantic categories will therefore have ids in contiguous
+ range [1, #stuff_categories].
+
+ This function will also register a pure semantic segmentation dataset
+ named ``name + '_stuffonly'``.
+
+ Args:
+ name (str): the name that identifies a dataset,
+ e.g. "coco_2017_train_panoptic"
+ metadata (dict): extra metadata associated with this dataset.
+ image_root (str): directory which contains all the images
+ panoptic_root (str): directory which contains panoptic annotation images
+ panoptic_json (str): path to the json panoptic annotation file
+ sem_seg_root (str): directory which contains all the ground truth segmentation annotations.
+ instances_json (str): path to the json instance annotation file
+ """
+ panoptic_name = name + "_separated"
+ DatasetCatalog.register(
+ panoptic_name,
+ lambda: merge_to_panoptic(
+ load_coco_json(instances_json, image_root, panoptic_name),
+ load_sem_seg(sem_seg_root, image_root),
+ ),
+ )
+ MetadataCatalog.get(panoptic_name).set(
+ panoptic_root=panoptic_root,
+ image_root=image_root,
+ panoptic_json=panoptic_json,
+ sem_seg_root=sem_seg_root,
+ json_file=instances_json, # TODO rename
+ evaluator_type="coco_panoptic_seg",
+ ignore_label=255,
+ **metadata,
+ )
+
+ semantic_name = name + "_stuffonly"
+ DatasetCatalog.register(semantic_name, lambda: load_sem_seg(sem_seg_root, image_root))
+ MetadataCatalog.get(semantic_name).set(
+ sem_seg_root=sem_seg_root,
+ image_root=image_root,
+ evaluator_type="sem_seg",
+ ignore_label=255,
+ **metadata,
+ )
+
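+# A minimal usage sketch for register_coco_panoptic_separated (hypothetical names and
+# paths); one call registers both "my_coco_train_separated" and "my_coco_train_stuffonly":
+def _example_register_coco_panoptic_separated(metadata):
+    register_coco_panoptic_separated(
+        "my_coco_train",  # base name
+        metadata,  # extra metadata
+        "datasets/coco/train2017",  # image_root
+        "datasets/coco/panoptic_train2017",  # panoptic_root
+        "datasets/coco/annotations/panoptic_train2017.json",  # panoptic_json
+        "datasets/coco/panoptic_stuff_train2017",  # sem_seg_root
+        "datasets/coco/annotations/instances_train2017.json",  # instances_json
+    )
+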
+
+def merge_to_panoptic(detection_dicts, sem_seg_dicts):
+ """
+ Create dataset dicts for panoptic segmentation, by
+ merging two dicts using "file_name" field to match their entries.
+
+ Args:
+ detection_dicts (list[dict]): lists of dicts for object detection or instance segmentation.
+ sem_seg_dicts (list[dict]): lists of dicts for semantic segmentation.
+
+ Returns:
+ list[dict] (one per input image): Each dict contains all (key, value) pairs from dicts in
+ both detection_dicts and sem_seg_dicts that correspond to the same image.
+ The function assumes that the same key in different dicts has the same value.
+ """
+ results = []
+ sem_seg_file_to_entry = {x["file_name"]: x for x in sem_seg_dicts}
+ assert len(sem_seg_file_to_entry) > 0
+
+ for det_dict in detection_dicts:
+ dic = copy.copy(det_dict)
+ dic.update(sem_seg_file_to_entry[dic["file_name"]])
+ results.append(dic)
+ return results
+
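+# A tiny illustrative sketch of merge_to_panoptic with hypothetical dicts: entries
+# sharing the same "file_name" are merged into a single record.
+def _example_merge_to_panoptic():
+    det = [{"file_name": "a.jpg", "annotations": []}]
+    sem = [{"file_name": "a.jpg", "sem_seg_file_name": "a.png"}]
+    merged = merge_to_panoptic(det, sem)
+    assert merged[0] == {"file_name": "a.jpg", "annotations": [], "sem_seg_file_name": "a.png"}
+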
+
+if __name__ == "__main__":
+ """
+ Test the COCO panoptic dataset loader.
+
+ Usage:
+ python -m detectron2.data.datasets.coco_panoptic \
+ path/to/image_root path/to/panoptic_root path/to/panoptic_json dataset_name 10
+
+ "dataset_name" can be "coco_2017_train_panoptic", or other
+ pre-registered ones
+ """
+ from custom_detectron2.utils.logger import setup_logger
+ from custom_detectron2.utils.visualizer import Visualizer
+ import custom_detectron2.data.datasets # noqa # add pre-defined metadata
+ import sys
+ from PIL import Image
+ import numpy as np
+
+ logger = setup_logger(name=__name__)
+ assert sys.argv[4] in DatasetCatalog.list()
+ meta = MetadataCatalog.get(sys.argv[4])
+
+ dicts = load_coco_panoptic_json(sys.argv[3], sys.argv[1], sys.argv[2], meta.as_dict())
+ logger.info("Done loading {} samples.".format(len(dicts)))
+
+ dirname = "coco-data-vis"
+ os.makedirs(dirname, exist_ok=True)
+ num_imgs_to_vis = int(sys.argv[5])
+ for i, d in enumerate(dicts):
+ img = np.array(Image.open(d["file_name"]))
+ visualizer = Visualizer(img, metadata=meta)
+ vis = visualizer.draw_dataset_dict(d)
+ fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+ vis.save(fpath)
+ if i + 1 >= num_imgs_to_vis:
+ break
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..824dbf4c7dba1ce96ca456cc0b666ee8190c15e4
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis.py
@@ -0,0 +1,241 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import os
+from fvcore.common.timer import Timer
+
+from custom_detectron2.data import DatasetCatalog, MetadataCatalog
+from custom_detectron2.structures import BoxMode
+from custom_detectron2.utils.file_io import PathManager
+
+from .builtin_meta import _get_coco_instances_meta
+from .lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES
+from .lvis_v1_categories import LVIS_CATEGORIES as LVIS_V1_CATEGORIES
+from .lvis_v1_category_image_count import LVIS_CATEGORY_IMAGE_COUNT as LVIS_V1_CATEGORY_IMAGE_COUNT
+
+"""
+This file contains functions to parse LVIS-format annotations into dicts in the
+"Detectron2 format".
+"""
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"]
+
+
+def register_lvis_instances(name, metadata, json_file, image_root):
+ """
+ Register a dataset in LVIS's json annotation format for instance detection and segmentation.
+
+ Args:
+ name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train".
+ metadata (dict): extra metadata associated with this dataset. It can be an empty dict.
+ json_file (str): path to the json instance annotation file.
+ image_root (str or path-like): directory which contains all the images.
+ """
+ DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, name))
+ MetadataCatalog.get(name).set(
+ json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata
+ )
+
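+# A minimal usage sketch for register_lvis_instances (hypothetical name and paths;
+# note that load_lvis_json joins image_root directly with the split folder, so a
+# trailing slash on image_root matters):
+def _example_register_lvis_instances():
+    register_lvis_instances(
+        "lvis_v1_train_custom",  # hypothetical dataset name
+        get_lvis_instances_meta("lvis_v1"),  # built-in LVIS v1 metadata
+        "datasets/lvis/lvis_v1_train.json",  # hypothetical annotation json
+        "datasets/lvis/",  # hypothetical image root
+    )
+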
+
+def load_lvis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
+ """
+ Load a json file in LVIS's annotation format.
+
+ Args:
+ json_file (str): full path to the LVIS json annotation file.
+ image_root (str): the directory where the images in this json file exist.
+ dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train").
+ If provided, this function will put "thing_classes" into the metadata
+ associated with this dataset.
+ extra_annotation_keys (list[str]): list of per-annotation keys that should also be
+ loaded into the dataset dict (besides "bbox", "bbox_mode", "category_id",
+ "segmentation"). The values for these keys will be returned as-is.
+
+ Returns:
+ list[dict]: a list of dicts in Detectron2 standard format. (See
+ `Using Custom Datasets <https://detectron2.readthedocs.io/tutorials/datasets.html>`_ )
+
+ Notes:
+ 1. This function does not read the image files.
+ The results do not have the "image" field.
+ """
+ from lvis import LVIS
+
+ json_file = PathManager.get_local_path(json_file)
+
+ timer = Timer()
+ lvis_api = LVIS(json_file)
+ if timer.seconds() > 1:
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+
+ if dataset_name is not None:
+ meta = get_lvis_instances_meta(dataset_name)
+ MetadataCatalog.get(dataset_name).set(**meta)
+
+ # sort indices for reproducible results
+ img_ids = sorted(lvis_api.imgs.keys())
+ # imgs is a list of dicts, each looks something like:
+ # {'license': 4,
+ # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+ # 'file_name': 'COCO_val2014_000000001268.jpg',
+ # 'height': 427,
+ # 'width': 640,
+ # 'date_captured': '2013-11-17 05:57:24',
+ # 'id': 1268}
+ imgs = lvis_api.load_imgs(img_ids)
+ # anns is a list[list[dict]], where each dict is an annotation
+ # record for an object. The inner list enumerates the objects in an image
+ # and the outer list enumerates over images. Example of anns[0]:
+ # [{'segmentation': [[192.81,
+ # 247.09,
+ # ...
+ # 219.03,
+ # 249.06]],
+ # 'area': 1035.749,
+ # 'image_id': 1268,
+ # 'bbox': [192.81, 224.8, 74.73, 33.43],
+ # 'category_id': 16,
+ # 'id': 42986},
+ # ...]
+ anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
+
+ # Sanity check that each annotation has a unique id
+ ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+ assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format(
+ json_file
+ )
+
+ imgs_anns = list(zip(imgs, anns))
+
+ logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file))
+
+ if extra_annotation_keys:
+ logger.info(
+ "The following extra annotation keys will be loaded: {} ".format(extra_annotation_keys)
+ )
+ else:
+ extra_annotation_keys = []
+
+ def get_file_name(img_root, img_dict):
+ # Determine the path including the split folder ("train2017", "val2017", "test2017") from
+ # the coco_url field. Example:
+ # 'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg'
+ split_folder, file_name = img_dict["coco_url"].split("/")[-2:]
+ return os.path.join(img_root + split_folder, file_name)
+
+ dataset_dicts = []
+
+ for (img_dict, anno_dict_list) in imgs_anns:
+ record = {}
+ record["file_name"] = get_file_name(image_root, img_dict)
+ record["height"] = img_dict["height"]
+ record["width"] = img_dict["width"]
+ record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", [])
+ record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
+ image_id = record["image_id"] = img_dict["id"]
+
+ objs = []
+ for anno in anno_dict_list:
+ # Check that the image_id in this annotation is the same as
+ # the image_id we're looking at.
+ # This fails only when the data parsing logic or the annotation file is buggy.
+ assert anno["image_id"] == image_id
+ obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
+ # LVIS data loader can be used to load COCO dataset categories. In this case `meta`
+ # variable will have a field with COCO-specific category mapping.
+ if dataset_name is not None and "thing_dataset_id_to_contiguous_id" in meta:
+ obj["category_id"] = meta["thing_dataset_id_to_contiguous_id"][anno["category_id"]]
+ else:
+ obj["category_id"] = anno["category_id"] - 1 # Convert 1-indexed to 0-indexed
+ segm = anno["segmentation"] # list[list[float]]
+ # filter out invalid polygons (< 3 points)
+ valid_segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+ assert len(segm) == len(
+ valid_segm
+ ), "Annotation contains an invalid polygon with < 3 points"
+ assert len(segm) > 0
+ obj["segmentation"] = segm
+ for extra_ann_key in extra_annotation_keys:
+ obj[extra_ann_key] = anno[extra_ann_key]
+ objs.append(obj)
+ record["annotations"] = objs
+ dataset_dicts.append(record)
+
+ return dataset_dicts
+
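+# A hedged usage sketch for load_lvis_json (hypothetical paths and name;
+# extra_annotation_keys copies the named per-annotation fields verbatim into each
+# object dict, e.g. the raw LVIS "area" field):
+def _example_load_lvis_json():
+    return load_lvis_json(
+        "datasets/lvis/lvis_v1_val.json",
+        "datasets/lvis/",
+        dataset_name="lvis_v1_val_custom",
+        extra_annotation_keys=["area"],
+    )
+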
+
+def get_lvis_instances_meta(dataset_name):
+ """
+ Load LVIS metadata.
+
+ Args:
+ dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5").
+
+ Returns:
+ dict: LVIS metadata with keys: thing_classes
+ """
+ if "cocofied" in dataset_name:
+ return _get_coco_instances_meta()
+ if "v0.5" in dataset_name:
+ return _get_lvis_instances_meta_v0_5()
+ elif "v1" in dataset_name:
+ return _get_lvis_instances_meta_v1()
+ raise ValueError("No built-in metadata for dataset {}".format(dataset_name))
+
+
+def _get_lvis_instances_meta_v0_5():
+ assert len(LVIS_V0_5_CATEGORIES) == 1230
+ cat_ids = [k["id"] for k in LVIS_V0_5_CATEGORIES]
+ assert min(cat_ids) == 1 and max(cat_ids) == len(
+ cat_ids
+ ), "Category ids are not in [1, #categories], as expected"
+ # Ensure that the category list is sorted by id
+ lvis_categories = sorted(LVIS_V0_5_CATEGORIES, key=lambda x: x["id"])
+ thing_classes = [k["synonyms"][0] for k in lvis_categories]
+ meta = {"thing_classes": thing_classes}
+ return meta
+
+
+def _get_lvis_instances_meta_v1():
+ assert len(LVIS_V1_CATEGORIES) == 1203
+ cat_ids = [k["id"] for k in LVIS_V1_CATEGORIES]
+ assert min(cat_ids) == 1 and max(cat_ids) == len(
+ cat_ids
+ ), "Category ids are not in [1, #categories], as expected"
+ # Ensure that the category list is sorted by id
+ lvis_categories = sorted(LVIS_V1_CATEGORIES, key=lambda x: x["id"])
+ thing_classes = [k["synonyms"][0] for k in lvis_categories]
+ meta = {"thing_classes": thing_classes, "class_image_count": LVIS_V1_CATEGORY_IMAGE_COUNT}
+ return meta
+
+
+if __name__ == "__main__":
+ """
+ Test the LVIS json dataset loader.
+
+ Usage:
+ python -m detectron2.data.datasets.lvis \
+ path/to/json path/to/image_root dataset_name vis_limit
+ """
+ import sys
+ import numpy as np
+ from custom_detectron2.utils.logger import setup_logger
+ from PIL import Image
+ import custom_detectron2.data.datasets # noqa # add pre-defined metadata
+ from custom_detectron2.utils.visualizer import Visualizer
+
+ logger = setup_logger(name=__name__)
+ meta = MetadataCatalog.get(sys.argv[3])
+
+ dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3])
+ logger.info("Done loading {} samples.".format(len(dicts)))
+
+ dirname = "lvis-data-vis"
+ os.makedirs(dirname, exist_ok=True)
+ for d in dicts[: int(sys.argv[4])]:
+ img = np.array(Image.open(d["file_name"]))
+ visualizer = Visualizer(img, metadata=meta)
+ vis = visualizer.draw_dataset_dict(d)
+ fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+ vis.save(fpath)
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v0_5_categories.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v0_5_categories.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ef6043b9f81d8bf1d13e6b18224d36de5fe7a74
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v0_5_categories.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Autogen with
+# with open("lvis_v0.5_val.json", "r") as f:
+# a = json.load(f)
+# c = a["categories"]
+# for x in c:
+# del x["image_count"]
+# del x["instance_count"]
+# LVIS_CATEGORIES = repr(c) + " # noqa"
+
+# fmt: off
+LVIS_CATEGORIES = [{'frequency': 'r', 'id': 1, 'synset': 'acorn.n.01', 'synonyms': ['acorn'], 'def': 'nut from an oak tree', 'name': 'acorn'}, {'frequency': 'c', 'id': 2, 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'id': 3, 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'id': 4, 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'c', 'id': 5, 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'id': 6, 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'r', 'id': 7, 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'id': 8, 'synset': 'almond.n.02', 'synonyms': ['almond'], 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'id': 9, 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'r', 'id': 10, 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'id': 11, 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'id': 12, 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'id': 13, 'synset': 'apple.n.01', 'synonyms': ['apple'], 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'id': 14, 'synset': 'apple_juice.n.01', 'synonyms': ['apple_juice'], 'def': 'the juice of apples', 'name': 'apple_juice'}, {'frequency': 'r', 'id': 15, 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'id': 16, 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'id': 17, 'synset': 'apron.n.01', 'synonyms': ['apron'], 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'id': 18, 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'c', 'id': 19, 'synset': 'armband.n.02', 'synonyms': ['armband'], 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'id': 20, 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'id': 21, 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'def': 'a large 
wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'id': 22, 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'id': 23, 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'id': 24, 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'id': 25, 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'id': 26, 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'id': 27, 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'c', 'id': 28, 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'id': 29, 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'id': 30, 'synset': 'awning.n.01', 'synonyms': ['awning'], 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'id': 31, 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'f', 'id': 32, 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'id': 33, 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'id': 34, 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'id': 35, 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'id': 36, 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'id': 37, 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'id': 38, 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'id': 39, 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'id': 40, 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'def': 'something 
used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'id': 41, 'synset': 'ball.n.06', 'synonyms': ['ball'], 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'id': 42, 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'id': 43, 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'id': 44, 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'id': 45, 'synset': 'banana.n.02', 'synonyms': ['banana'], 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'r', 'id': 46, 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'id': 47, 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'c', 'id': 48, 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'id': 49, 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'id': 50, 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'id': 51, 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'id': 52, 'synset': 'barge.n.01', 'synonyms': ['barge'], 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'id': 53, 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'id': 54, 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'id': 55, 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'id': 56, 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'id': 57, 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'id': 58, 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'id': 59, 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'id': 60, 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, 
{'frequency': 'f', 'id': 61, 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'id': 62, 'synset': 'basket.n.03', 'synonyms': ['basketball_hoop'], 'def': 'metal hoop supporting a net through which players try to throw the basketball', 'name': 'basketball_hoop'}, {'frequency': 'c', 'id': 63, 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'id': 64, 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'r', 'id': 65, 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'id': 66, 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'id': 67, 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'id': 68, 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'id': 69, 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'id': 70, 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'id': 71, 'synset': 'battery.n.02', 'synonyms': ['battery'], 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'id': 72, 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'id': 73, 'synset': 'bead.n.01', 'synonyms': ['bead'], 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'r', 'id': 74, 'synset': 'beaker.n.01', 'synonyms': ['beaker'], 'def': 'a flatbottomed jar made of glass or plastic; used for chemistry', 'name': 'beaker'}, {'frequency': 'c', 'id': 75, 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'id': 76, 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'id': 77, 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'id': 78, 'synset': 'bear.n.01', 'synonyms': ['bear'], 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'id': 79, 'synset': 'bed.n.01', 'synonyms': ['bed'], 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'c', 'id': 80, 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 
'f', 'id': 81, 'synset': 'beef.n.01', 'synonyms': ['cow'], 'def': 'cattle that are reared for their meat', 'name': 'cow'}, {'frequency': 'c', 'id': 82, 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'id': 83, 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'id': 84, 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'id': 85, 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'id': 86, 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'id': 87, 'synset': 'bell.n.01', 'synonyms': ['bell'], 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'id': 88, 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'id': 89, 'synset': 'belt.n.02', 'synonyms': ['belt'], 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'id': 90, 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'id': 91, 'synset': 'bench.n.01', 'synonyms': ['bench'], 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'id': 92, 'synset': 'beret.n.01', 'synonyms': ['beret'], 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'id': 93, 'synset': 'bib.n.02', 'synonyms': ['bib'], 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'id': 94, 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'id': 95, 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'id': 96, 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'c', 'id': 97, 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'id': 98, 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'id': 99, 'synset': 'bird.n.01', 'synonyms': ['bird'], 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'r', 'id': 100, 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'r', 'id': 101, 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'id': 102, 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'def': 'a cage in which a bird can be kept', 'name': 
'birdcage'}, {'frequency': 'c', 'id': 103, 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'id': 104, 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'id': 105, 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'id': 106, 'synset': 'biscuit.n.01', 'synonyms': ['biscuit_(bread)'], 'def': 'small round bread leavened with baking-powder or soda', 'name': 'biscuit_(bread)'}, {'frequency': 'r', 'id': 107, 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'id': 108, 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'id': 109, 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'id': 110, 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'id': 111, 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'id': 112, 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'id': 113, 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'c', 'id': 114, 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'c', 'id': 115, 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'id': 116, 'synset': 'boar.n.02', 'synonyms': ['boar'], 'def': 'an uncastrated male hog', 'name': 'boar'}, {'frequency': 'r', 'id': 117, 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'id': 118, 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'c', 'id': 119, 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'r', 'id': 120, 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'id': 121, 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'id': 122, 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 
'bolo_tie'}, {'frequency': 'c', 'id': 123, 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'id': 124, 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'id': 125, 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'id': 126, 'synset': 'book.n.01', 'synonyms': ['book'], 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'r', 'id': 127, 'synset': 'book_bag.n.01', 'synonyms': ['book_bag'], 'def': 'a bag in which students carry their books', 'name': 'book_bag'}, {'frequency': 'c', 'id': 128, 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'id': 129, 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'id': 130, 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'id': 131, 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'id': 132, 'synset': 'boot.n.01', 'synonyms': ['boot'], 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'id': 133, 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'id': 134, 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'id': 135, 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'id': 136, 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'id': 137, 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'id': 138, 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'id': 139, 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'id': 140, 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'id': 141, 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'id': 142, 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'r', 'id': 143, 'synset': 'bowling_pin.n.01', 'synonyms': ['bowling_pin'], 'def': 'a 
club-shaped wooden object used in bowling', 'name': 'bowling_pin'}, {'frequency': 'r', 'id': 144, 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'id': 145, 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'id': 146, 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'id': 147, 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'id': 148, 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'id': 149, 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'r', 'id': 150, 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'c', 'id': 151, 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'id': 152, 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'c', 'id': 153, 'synset': 'bristle_brush.n.01', 'synonyms': ['bristle_brush'], 'def': 'a brush that is made with the short stiff hairs of an animal or plant', 'name': 'bristle_brush'}, {'frequency': 'f', 'id': 154, 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'id': 155, 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'id': 156, 'synset': 'broom.n.01', 'synonyms': ['broom'], 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'id': 157, 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'id': 158, 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'id': 159, 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'id': 160, 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'id': 161, 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'id': 162, 'synset': 'bull.n.11', 'synonyms': ['bull'], 'def': 'mature male cow', 'name': 'bull'}, {'frequency': 'r', 'id': 163, 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 
'bulldog'}, {'frequency': 'r', 'id': 164, 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'id': 165, 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'id': 166, 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'id': 167, 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'id': 168, 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'r', 'id': 169, 'synset': 'bully_beef.n.01', 'synonyms': ['corned_beef', 'corn_beef'], 'def': 'beef cured or pickled in brine', 'name': 'corned_beef'}, {'frequency': 'f', 'id': 170, 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'id': 171, 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'id': 172, 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'id': 173, 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'id': 174, 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'id': 175, 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'c', 'id': 176, 'synset': 'butcher_knife.n.01', 'synonyms': ['butcher_knife'], 'def': 'a large sharp knife for cutting or trimming meat', 'name': 'butcher_knife'}, {'frequency': 'c', 'id': 177, 'synset': 'butter.n.01', 'synonyms': ['butter'], 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'id': 178, 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'id': 179, 'synset': 'button.n.01', 'synonyms': ['button'], 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'id': 180, 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'id': 181, 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'r', 'id': 182, 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 
'cabin_car'}, {'frequency': 'f', 'id': 183, 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'id': 184, 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'id': 185, 'synset': 'cake.n.03', 'synonyms': ['cake'], 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'id': 186, 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'id': 187, 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'id': 188, 'synset': 'calf.n.01', 'synonyms': ['calf'], 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'id': 189, 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'id': 190, 'synset': 'camel.n.01', 'synonyms': ['camel'], 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'id': 191, 'synset': 'camera.n.01', 'synonyms': ['camera'], 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'id': 192, 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'id': 193, 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'id': 194, 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'id': 195, 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'r', 'id': 196, 'synset': 'candelabrum.n.01', 'synonyms': ['candelabrum', 'candelabra'], 'def': 'branched candlestick; ornamental; has several lights', 'name': 'candelabrum'}, {'frequency': 'f', 'id': 197, 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'id': 198, 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'id': 199, 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'id': 200, 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'id': 201, 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'id': 202, 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'r', 'id': 203, 'synset': 'cannon.n.02', 'synonyms': ['cannon'], 
'def': 'heavy gun fired from a tank', 'name': 'cannon'}, {'frequency': 'c', 'id': 204, 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'r', 'id': 205, 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'id': 206, 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'c', 'id': 207, 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'id': 208, 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'r', 'id': 209, 'synset': 'cape.n.02', 'synonyms': ['cape'], 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'id': 210, 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'id': 211, 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'id': 212, 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'def': 'a wheeled vehicle adapted to the rails of railroad', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'id': 213, 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'id': 214, 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'id': 215, 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'id': 216, 'synset': 'card.n.03', 'synonyms': ['card'], 'def': 'a rectangular piece of paper used to send messages (e.g. 
greetings or pictures)', 'name': 'card'}, {'frequency': 'r', 'id': 217, 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'id': 218, 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'id': 219, 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'id': 220, 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'id': 221, 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'c', 'id': 222, 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'id': 223, 'synset': 'cart.n.01', 'synonyms': ['cart'], 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'id': 224, 'synset': 'carton.n.02', 'synonyms': ['carton'], 'def': 'a box made of cardboard; opens by flaps on top', 'name': 'carton'}, {'frequency': 'c', 'id': 225, 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'id': 226, 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'id': 227, 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'id': 228, 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'id': 229, 'synset': 'cat.n.01', 'synonyms': ['cat'], 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'c', 'id': 230, 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'r', 'id': 231, 'synset': 'caviar.n.01', 'synonyms': ['caviar', 'caviare'], 'def': "salted roe of sturgeon or other large fish; usually served as an hors d'oeuvre", 'name': 'caviar'}, {'frequency': 'c', 'id': 232, 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'id': 233, 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'c', 'id': 234, 'synset': 'celery.n.01', 'synonyms': ['celery'], 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'id': 235, 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'id': 236, 'synset': 
'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'id': 237, 'synset': 'chair.n.01', 'synonyms': ['chair'], 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'id': 238, 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'id': 239, 'synset': 'champagne.n.01', 'synonyms': ['champagne'], 'def': 'a white sparkling wine produced in Champagne or resembling that produced there', 'name': 'champagne'}, {'frequency': 'f', 'id': 240, 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'id': 241, 'synset': 'chap.n.04', 'synonyms': ['chap'], 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'id': 242, 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'id': 243, 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'id': 244, 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'id': 245, 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'r', 'id': 246, 'synset': 'chest_of_drawers.n.01', 'synonyms': ['chest_of_drawers_(furniture)', 'bureau_(furniture)', 'chest_(furniture)'], 'def': 'furniture with drawers for keeping clothes', 'name': 'chest_of_drawers_(furniture)'}, {'frequency': 'c', 'id': 247, 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'id': 248, 'synset': 'chicken_wire.n.01', 'synonyms': ['chicken_wire'], 'def': 'a galvanized wire network with a hexagonal mesh; used to build fences', 'name': 'chicken_wire'}, {'frequency': 'r', 'id': 249, 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'r', 'id': 250, 'synset': 'chihuahua.n.03', 'synonyms': ['Chihuahua'], 'def': 'an old breed of tiny short-haired dog with protruding eyes from Mexico', 'name': 'Chihuahua'}, {'frequency': 'r', 'id': 251, 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'id': 252, 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'id': 253, 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'id': 254, 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, 
{'frequency': 'r', 'id': 255, 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'id': 256, 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'id': 257, 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'id': 258, 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'id': 259, 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'id': 260, 'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'def': 'necklace that fits tightly around the neck', 'name': 'choker'}, {'frequency': 'f', 'id': 261, 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'c', 'id': 262, 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'id': 263, 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'id': 264, 'synset': 'chute.n.02', 'synonyms': ['slide'], 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'id': 265, 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'id': 266, 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'c', 'id': 267, 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'id': 268, 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'id': 269, 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'id': 270, 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'r', 'id': 271, 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'id': 272, 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'id': 273, 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'id': 274, 'synset': 'clip.n.03', 'synonyms': ['clip'], 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'id': 275, 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'def': 'a small 
writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'f', 'id': 276, 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'id': 277, 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'id': 278, 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'id': 279, 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'id': 280, 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'id': 281, 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'id': 282, 'synset': 'coat.n.01', 'synonyms': ['coat'], 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'id': 283, 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'r', 'id': 284, 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'id': 285, 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'c', 'id': 286, 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'r', 'id': 287, 'synset': 'coffee_filter.n.01', 'synonyms': ['coffee_filter'], 'def': 'filter (usually of paper) that passes the coffee and retains the coffee grounds', 'name': 'coffee_filter'}, {'frequency': 'f', 'id': 288, 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'id': 289, 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'id': 290, 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'id': 291, 'synset': 'coil.n.05', 'synonyms': ['coil'], 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'id': 292, 'synset': 'coin.n.01', 'synonyms': ['coin'], 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'r', 'id': 293, 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'def': 'bowl-shaped strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'id': 294, 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'id': 295, 'synset': 'coloring_material.n.01', 
'synonyms': ['coloring_material', 'colouring_material'], 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'id': 296, 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'id': 297, 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'id': 298, 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'f', 'id': 299, 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'r', 'id': 300, 'synset': 'concrete_mixer.n.01', 'synonyms': ['concrete_mixer', 'cement_mixer'], 'def': 'a machine with a large revolving drum in which cement/concrete is mixed', 'name': 'concrete_mixer'}, {'frequency': 'f', 'id': 301, 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'id': 302, 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'id': 303, 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'id': 304, 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'c', 'id': 305, 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'id': 306, 'synset': 'cookie_jar.n.01', 'synonyms': ['cookie_jar', 'cooky_jar'], 'def': 'a jar in which cookies are kept (and sometimes money is hidden)', 'name': 'cookie_jar'}, {'frequency': 'r', 'id': 307, 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'id': 308, 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'c', 'id': 309, 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'id': 310, 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'r', 'id': 311, 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'c', 'id': 312, 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'def': 'ears of corn that can be prepared and served for human food', 'name': 'edible_corn'}, {'frequency': 'r', 'id': 313, 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'id': 314, 'synset': 'cornet.n.01', 'synonyms': ['cornet', 
'horn', 'trumpet'], 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'id': 315, 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'id': 316, 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'r', 'id': 317, 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'r', 'id': 318, 'synset': 'cos.n.02', 'synonyms': ['romaine_lettuce'], 'def': 'lettuce with long dark-green leaves in a loosely packed elongated head', 'name': 'romaine_lettuce'}, {'frequency': 'c', 'id': 319, 'synset': 'costume.n.04', 'synonyms': ['costume'], 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'id': 320, 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'id': 321, 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'r', 'id': 322, 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'id': 323, 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'r', 'id': 324, 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'c', 'id': 325, 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'id': 326, 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'id': 327, 'synset': 'crate.n.01', 'synonyms': ['crate'], 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'r', 'id': 328, 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'id': 329, 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'r', 'id': 330, 'synset': 'credit_card.n.01', 'synonyms': ['credit_card', 'charge_card', 'debit_card'], 'def': 'a card, usually plastic, used to pay for goods and services', 'name': 'credit_card'}, {'frequency': 'c', 'id': 331, 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'id': 332, 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'id': 333, 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'def': 'an earthen jar (made of baked clay)', 'name': 'crock_pot'}, {'frequency': 'f', 'id': 334, 'synset': 
'crossbar.n.01', 'synonyms': ['crossbar'], 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'id': 335, 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'r', 'id': 336, 'synset': 'crow.n.01', 'synonyms': ['crow'], 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'c', 'id': 337, 'synset': 'crown.n.04', 'synonyms': ['crown'], 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'id': 338, 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'id': 339, 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'id': 340, 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'c', 'id': 341, 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'r', 'id': 342, 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'id': 343, 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'r', 'id': 344, 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'id': 345, 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'id': 346, 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'id': 347, 'synset': 'cup.n.01', 'synonyms': ['cup'], 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'id': 348, 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'def': 'a metal vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'c', 'id': 349, 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'id': 350, 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'id': 351, 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'id': 352, 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, {'frequency': 'f', 'id': 353, 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'id': 354, 'synset': 
'custard.n.01', 'synonyms': ['custard'], 'def': 'sweetened mixture of milk and eggs baked or boiled or frozen', 'name': 'custard'}, {'frequency': 'c', 'id': 355, 'synset': 'cutter.n.06', 'synonyms': ['cutting_tool'], 'def': 'a cutting implement; a tool for cutting', 'name': 'cutting_tool'}, {'frequency': 'r', 'id': 356, 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'id': 357, 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'id': 358, 'synset': 'dachshund.n.01', 'synonyms': ['dachshund', 'dachsie', 'badger_dog'], 'def': 'small long-bodied short-legged breed of dog having a short sleek coat and long drooping ears', 'name': 'dachshund'}, {'frequency': 'r', 'id': 359, 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'id': 360, 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'id': 361, 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'id': 362, 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'id': 363, 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'id': 364, 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'id': 365, 'synset': 'desk.n.01', 'synonyms': ['desk'], 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'id': 366, 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'id': 367, 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'id': 368, 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'def': 'a daily written record of (usually personal) experiences and observations', 'name': 'diary'}, {'frequency': 'r', 'id': 369, 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'id': 370, 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'id': 371, 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'id': 372, 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'c', 'id': 373, 'synset': 'dish.n.01', 'synonyms': ['dish'], 'def': 'a piece of dishware normally used as a container for 
holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'id': 374, 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'id': 375, 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'def': 'a cloth for washing dishes', 'name': 'dishrag'}, {'frequency': 'c', 'id': 376, 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'id': 377, 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'id': 378, 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid'], 'def': 'a low-sudsing detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'r', 'id': 379, 'synset': 'diskette.n.01', 'synonyms': ['diskette', 'floppy', 'floppy_disk'], 'def': 'a small plastic magnetic disk enclosed in a stiff envelope used to store data', 'name': 'diskette'}, {'frequency': 'c', 'id': 380, 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'c', 'id': 381, 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'id': 382, 'synset': 'dog.n.01', 'synonyms': ['dog'], 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'id': 383, 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'c', 'id': 384, 'synset': 'doll.n.01', 'synonyms': ['doll'], 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'id': 385, 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'id': 386, 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'id': 387, 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'r', 'id': 388, 'synset': 'domino.n.03', 'synonyms': ['eye_mask'], 'def': 'a mask covering the upper part of the face but with holes for the eyes', 'name': 'eye_mask'}, {'frequency': 'r', 'id': 389, 'synset': 'doorbell.n.01', 'synonyms': ['doorbell', 'buzzer'], 'def': 'a button at an outer door that gives a ringing or buzzing signal when pushed', 'name': 'doorbell'}, {'frequency': 'f', 'id': 390, 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'id': 391, 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'id': 392, 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'id': 393, 'synset': 'dove.n.01', 'synonyms': ['dove'], 
'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'id': 394, 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'id': 395, 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'id': 396, 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'id': 397, 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'id': 398, 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'c', 'id': 399, 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'c', 'id': 400, 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'id': 401, 'synset': 'drill.n.01', 'synonyms': ['drill'], 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'id': 402, 'synset': 'drinking_fountain.n.01', 'synonyms': ['drinking_fountain'], 'def': 'a public fountain to provide a jet of drinking water', 'name': 'drinking_fountain'}, {'frequency': 'r', 'id': 403, 'synset': 'drone.n.04', 'synonyms': ['drone'], 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'id': 404, 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'id': 405, 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'id': 406, 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'id': 407, 'synset': 'duck.n.01', 'synonyms': ['duck'], 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'r', 'id': 408, 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'id': 409, 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'id': 410, 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'def': 'a large cylindrical bag of heavy cloth', 'name': 'duffel_bag'}, {'frequency': 'r', 'id': 411, 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'id': 412, 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'id': 413, 'synset': 'dustpan.n.02', 
'synonyms': ['dustpan'], 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'r', 'id': 414, 'synset': 'dutch_oven.n.02', 'synonyms': ['Dutch_oven'], 'def': 'iron or earthenware cooking pot; used for stews', 'name': 'Dutch_oven'}, {'frequency': 'c', 'id': 415, 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'id': 416, 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'id': 417, 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'id': 418, 'synset': 'earring.n.01', 'synonyms': ['earring'], 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'id': 419, 'synset': 'easel.n.01', 'synonyms': ['easel'], 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'id': 420, 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'id': 421, 'synset': 'eel.n.01', 'synonyms': ['eel'], 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'id': 422, 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'id': 423, 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'id': 424, 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'id': 425, 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'id': 426, 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'id': 427, 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'id': 428, 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'id': 429, 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'r', 'id': 430, 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'id': 431, 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'id': 432, 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'id': 433, 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 
'id': 434, 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'id': 435, 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'id': 436, 'synset': 'fan.n.01', 'synonyms': ['fan'], 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'id': 437, 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'id': 438, 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'id': 439, 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'id': 440, 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'r', 'id': 441, 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'id': 442, 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'id': 443, 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'id': 444, 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'id': 445, 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'id': 446, 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'id': 447, 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'c', 'id': 448, 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'c', 'id': 449, 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'id': 450, 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'id': 451, 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'id': 452, 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 
'hydrant'], 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'c', 'id': 453, 'synset': 'fish.n.01', 'synonyms': ['fish'], 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'r', 'id': 454, 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'id': 455, 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'r', 'id': 456, 'synset': 'fishing_boat.n.01', 'synonyms': ['fishing_boat', 'fishing_vessel'], 'def': 'a vessel for fishing', 'name': 'fishing_boat'}, {'frequency': 'c', 'id': 457, 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'id': 458, 'synset': 'flag.n.01', 'synonyms': ['flag'], 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'id': 459, 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'id': 460, 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'id': 461, 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'r', 'id': 462, 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'id': 463, 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'id': 464, 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'id': 465, 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'id': 466, 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'id': 467, 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'id': 468, 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'r', 'id': 469, 'synset': 'foal.n.01', 'synonyms': ['foal'], 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'id': 470, 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'id': 471, 'synset': 'food_processor.n.01', 'synonyms': ['food_processor'], 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'id': 472, 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 
'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'id': 473, 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'id': 474, 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'id': 475, 'synset': 'fork.n.01', 'synonyms': ['fork'], 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'r', 'id': 476, 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'r', 'id': 477, 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'r', 'id': 478, 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'id': 479, 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'def': 'anything that freshens', 'name': 'freshener'}, {'frequency': 'f', 'id': 480, 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'id': 481, 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'id': 482, 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'r', 'id': 483, 'synset': 'fruit_salad.n.01', 'synonyms': ['fruit_salad'], 'def': 'salad composed of fruits', 'name': 'fruit_salad'}, {'frequency': 'c', 'id': 484, 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'id': 485, 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'id': 486, 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'c', 'id': 487, 'synset': 'futon.n.01', 'synonyms': ['futon'], 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'id': 488, 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'id': 489, 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'id': 490, 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'id': 491, 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'id': 492, 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 
'gargle'}, {'frequency': 'r', 'id': 493, 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'id': 494, 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'id': 495, 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'r', 'id': 496, 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'id': 497, 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'id': 498, 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'c', 'id': 499, 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'id': 500, 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'id': 501, 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'id': 502, 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'id': 503, 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'id': 504, 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'id': 505, 'synset': 'globe.n.03', 'synonyms': ['globe'], 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'id': 506, 'synset': 'glove.n.02', 'synonyms': ['glove'], 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'id': 507, 'synset': 'goat.n.01', 'synonyms': ['goat'], 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'id': 508, 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'id': 509, 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'r', 'id': 510, 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'id': 511, 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'id': 512, 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'def': 'long narrow flat-bottomed boat propelled by 
sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'id': 513, 'synset': 'goose.n.01', 'synonyms': ['goose'], 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'id': 514, 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'id': 515, 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'r', 'id': 516, 'synset': 'gown.n.04', 'synonyms': ['surgical_gown', 'scrubs_(surgical_clothing)'], 'def': 'protective garment worn by surgeons during operations', 'name': 'surgical_gown'}, {'frequency': 'f', 'id': 517, 'synset': 'grape.n.01', 'synonyms': ['grape'], 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'r', 'id': 518, 'synset': 'grasshopper.n.01', 'synonyms': ['grasshopper'], 'def': 'plant-eating insect with hind legs adapted for leaping', 'name': 'grasshopper'}, {'frequency': 'c', 'id': 519, 'synset': 'grater.n.01', 'synonyms': ['grater'], 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'id': 520, 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'id': 521, 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'c', 'id': 522, 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'c', 'id': 523, 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'id': 524, 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'r', 'id': 525, 'synset': 'grillroom.n.01', 'synonyms': ['grillroom', 'grill_(restaurant)'], 'def': 'a restaurant where food is cooked on a grill', 'name': 'grillroom'}, {'frequency': 'r', 'id': 526, 'synset': 'grinder.n.04', 'synonyms': ['grinder_(tool)'], 'def': 'a machine tool that polishes metal', 'name': 'grinder_(tool)'}, {'frequency': 'r', 'id': 527, 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'id': 528, 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'id': 529, 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'r', 'id': 530, 'synset': 'guacamole.n.01', 'synonyms': ['guacamole'], 'def': 'a dip made of mashed avocado mixed with chopped onions and other seasonings', 'name': 'guacamole'}, {'frequency': 'f', 'id': 531, 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'id': 532, 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 
'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'id': 533, 'synset': 'gun.n.01', 'synonyms': ['gun'], 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'r', 'id': 534, 'synset': 'hair_spray.n.01', 'synonyms': ['hair_spray'], 'def': 'substance sprayed on the hair to hold it in place', 'name': 'hair_spray'}, {'frequency': 'c', 'id': 535, 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'id': 536, 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'id': 537, 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'f', 'id': 538, 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'id': 539, 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'id': 540, 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'r', 'id': 541, 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'id': 542, 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'r', 'id': 543, 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'c', 'id': 544, 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'id': 545, 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'id': 546, 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'id': 547, 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'id': 548, 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'id': 549, 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'id': 550, 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'id': 551, 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'id': 552, 'synset': 
'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'id': 553, 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'id': 554, 'synset': 'hat.n.01', 'synonyms': ['hat'], 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'id': 555, 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'r', 'id': 556, 'synset': 'hatch.n.03', 'synonyms': ['hatch'], 'def': 'a movable barrier covering a hatchway', 'name': 'hatch'}, {'frequency': 'c', 'id': 557, 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'def': 'a garment that covers the head and face', 'name': 'veil'}, {'frequency': 'f', 'id': 558, 'synset': 'headband.n.01', 'synonyms': ['headband'], 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'id': 559, 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'id': 560, 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'id': 561, 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'id': 562, 'synset': 'headset.n.01', 'synonyms': ['headset'], 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'id': 563, 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'r', 'id': 564, 'synset': 'hearing_aid.n.02', 'synonyms': ['hearing_aid'], 'def': 'an acoustic device used to direct sound to the ear of a hearing-impaired person', 'name': 'hearing_aid'}, {'frequency': 'c', 'id': 565, 'synset': 'heart.n.02', 'synonyms': ['heart'], 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'id': 566, 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'id': 567, 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'id': 568, 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'id': 569, 'synset': 'heron.n.02', 'synonyms': ['heron'], 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'id': 570, 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'id': 571, 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'def': 'a joint that holds two parts together so that one can swing 
relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'id': 572, 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'id': 573, 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'id': 574, 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'id': 575, 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'id': 576, 'synset': 'honey.n.01', 'synonyms': ['honey'], 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'id': 577, 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'id': 578, 'synset': 'hook.n.05', 'synonyms': ['hook'], 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'f', 'id': 579, 'synset': 'horse.n.01', 'synonyms': ['horse'], 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'id': 580, 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'id': 581, 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'id': 582, 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'id': 583, 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'id': 584, 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'id': 585, 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'r', 'id': 586, 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'id': 587, 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'c', 'id': 588, 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'id': 589, 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'id': 590, 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'id': 591, 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'def': 'an appliance included in some electric refrigerators for making ice cubes', 
'name': 'ice_maker'}, {'frequency': 'r', 'id': 592, 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'id': 593, 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'r', 'id': 594, 'synset': 'ice_tea.n.01', 'synonyms': ['ice_tea', 'iced_tea'], 'def': 'strong tea served over ice', 'name': 'ice_tea'}, {'frequency': 'c', 'id': 595, 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'id': 596, 'synset': 'incense.n.01', 'synonyms': ['incense'], 'def': 'a substance that produces a fragrant odor when burned', 'name': 'incense'}, {'frequency': 'r', 'id': 597, 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'c', 'id': 598, 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'id': 599, 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'r', 'id': 600, 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'id': 601, 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'r', 'id': 602, 'synset': 'jam.n.01', 'synonyms': ['jam'], 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'id': 603, 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'id': 604, 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'id': 605, 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'id': 606, 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'id': 607, 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'c', 'id': 608, 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'id': 609, 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'r', 'id': 610, 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'id': 611, 'synset': 'kayak.n.01', 'synonyms': 
['kayak'], 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'id': 612, 'synset': 'keg.n.02', 'synonyms': ['keg'], 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'id': 613, 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'id': 614, 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'id': 615, 'synset': 'key.n.01', 'synonyms': ['key'], 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'id': 616, 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'r', 'id': 617, 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'id': 618, 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'id': 619, 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'c', 'id': 620, 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'id': 621, 'synset': 'kite.n.03', 'synonyms': ['kite'], 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'id': 622, 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'id': 623, 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'id': 624, 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'id': 625, 'synset': 'knife.n.01', 'synonyms': ['knife'], 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'id': 626, 'synset': 'knight.n.02', 'synonyms': ['knight_(chess_piece)', 'horse_(chess_piece)'], 'def': 'a chess game piece shaped to resemble the head of a horse', 'name': 'knight_(chess_piece)'}, {'frequency': 'r', 'id': 627, 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'id': 628, 'synset': 'knob.n.02', 'synonyms': ['knob'], 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'id': 629, 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'id': 630, 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'id': 631, 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'def': 'a light coat worn to protect 
clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'id': 632, 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'id': 633, 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'r', 'id': 634, 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'c', 'id': 635, 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'id': 636, 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'id': 637, 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'id': 638, 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'id': 639, 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'id': 640, 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'id': 641, 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'id': 642, 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'id': 643, 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'c', 'id': 644, 'synset': 'latch.n.02', 'synonyms': ['latch'], 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'id': 645, 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'id': 646, 'synset': 'leather.n.01', 'synonyms': ['leather'], 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'id': 647, 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'id': 648, 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'f', 'id': 649, 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 'id': 650, 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'id': 651, 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'def': 'leafy plant 
commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'id': 652, 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'id': 653, 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'id': 654, 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'id': 655, 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'def': 'glass bulb or tube shaped electric device that emits light (DO NOT MARK LAMPS AS A WHOLE)', 'name': 'lightbulb'}, {'frequency': 'r', 'id': 656, 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'c', 'id': 657, 'synset': 'lime.n.06', 'synonyms': ['lime'], 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'id': 658, 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'r', 'id': 659, 'synset': 'linen.n.02', 'synonyms': ['linen_paper'], 'def': 'a high-quality paper made of linen fibers or with a linen finish', 'name': 'linen_paper'}, {'frequency': 'c', 'id': 660, 'synset': 'lion.n.01', 'synonyms': ['lion'], 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'id': 661, 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'c', 'id': 662, 'synset': 'lipstick.n.01', 'synonyms': ['lipstick', 'lip_rouge'], 'def': 'makeup that is used to color the lips', 'name': 'lipstick'}, {'frequency': 'r', 'id': 663, 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'def': 'an alcoholic beverage that is distilled rather than fermented', 'name': 'liquor'}, {'frequency': 'r', 'id': 664, 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'r', 'id': 665, 'synset': 'loafer.n.02', 'synonyms': ['Loafer_(type_of_shoe)'], 'def': 'a low leather step-in shoe', 'name': 'Loafer_(type_of_shoe)'}, {'frequency': 'f', 'id': 666, 'synset': 'log.n.01', 'synonyms': ['log'], 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'id': 667, 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'c', 'id': 668, 'synset': 'lotion.n.01', 'synonyms': ['lotion'], 'def': 'any of various cosmetic preparations that are applied to the skin', 'name': 'lotion'}, {'frequency': 'f', 'id': 669, 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'id': 670, 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 
'r', 'id': 671, 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'id': 672, 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'id': 673, 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'r', 'id': 674, 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'c', 'id': 675, 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'id': 676, 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'id': 677, 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'c', 'id': 678, 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'id': 679, 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'id': 680, 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'c', 'id': 681, 'synset': 'map.n.01', 'synonyms': ['map'], 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'c', 'id': 682, 'synset': 'marker.n.03', 'synonyms': ['marker'], 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'id': 683, 'synset': 'martini.n.01', 'synonyms': ['martini'], 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'id': 684, 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'id': 685, 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'id': 686, 'synset': 'masher.n.02', 'synonyms': ['masher'], 'def': 'a kitchen utensil used for mashing (e.g. 
potatoes)', 'name': 'masher'}, {'frequency': 'f', 'id': 687, 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'id': 688, 'synset': 'mast.n.01', 'synonyms': ['mast'], 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'id': 689, 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'id': 690, 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'id': 691, 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'id': 692, 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'id': 693, 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'id': 694, 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'id': 695, 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'r', 'id': 696, 'synset': 'melon.n.01', 'synonyms': ['melon'], 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'id': 697, 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'id': 698, 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'id': 699, 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'id': 700, 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'c', 'id': 701, 'synset': 'milk.n.01', 'synonyms': ['milk'], 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'f', 'id': 702, 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'id': 703, 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'id': 704, 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'id': 705, 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'id': 706, 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'def': 'a kitchen utensil that is used for mixing 
foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'id': 707, 'synset': 'money.n.03', 'synonyms': ['money'], 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'id': 708, 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'id': 709, 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'id': 710, 'synset': 'motor.n.01', 'synonyms': ['motor'], 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'id': 711, 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'id': 712, 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'r', 'id': 713, 'synset': 'motorboat.n.01', 'synonyms': ['motorboat', 'powerboat'], 'def': 'a boat propelled by an internal-combustion engine', 'name': 'motorboat'}, {'frequency': 'f', 'id': 714, 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'id': 715, 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'r', 'id': 716, 'synset': 'mouse.n.01', 'synonyms': ['mouse_(animal_rodent)'], 'def': 'a small rodent with pointed snouts and small ears on elongated bodies with slender usually hairless tails', 'name': 'mouse_(animal_rodent)'}, {'frequency': 'f', 'id': 717, 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'def': 'a computer input device that controls an on-screen pointer', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'id': 718, 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'id': 719, 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'id': 720, 'synset': 'mug.n.04', 'synonyms': ['mug'], 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'id': 721, 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'id': 722, 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'r', 'id': 723, 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'id': 724, 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'r', 'id': 725, 'synset': 'nameplate.n.01', 'synonyms': ['nameplate'], 'def': 'a plate bearing a name', 'name': 'nameplate'}, 
{'frequency': 'f', 'id': 726, 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'id': 727, 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'id': 728, 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'id': 729, 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'r', 'id': 730, 'synset': 'needle.n.03', 'synonyms': ['needle'], 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'id': 731, 'synset': 'nest.n.01', 'synonyms': ['nest'], 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'r', 'id': 732, 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'id': 733, 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'id': 734, 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'r', 'id': 735, 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'id': 736, 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'id': 737, 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'c', 'id': 738, 'synset': 'nut.n.03', 'synonyms': ['nut'], 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'id': 739, 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'c', 'id': 740, 'synset': 'oar.n.01', 'synonyms': ['oar'], 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'id': 741, 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'id': 742, 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'id': 743, 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'id': 744, 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'def': 'oil from olives', 'name': 'olive_oil'}, 
{'frequency': 'r', 'id': 745, 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'id': 746, 'synset': 'onion.n.01', 'synonyms': ['onion'], 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'id': 747, 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'id': 748, 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'r', 'id': 749, 'synset': 'oregano.n.01', 'synonyms': ['oregano', 'marjoram'], 'def': 'aromatic Eurasian perennial herb used in cooking and baking', 'name': 'oregano'}, {'frequency': 'c', 'id': 750, 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'c', 'id': 751, 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'def': 'thick cushion used as a seat', 'name': 'ottoman'}, {'frequency': 'c', 'id': 752, 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'id': 753, 'synset': 'owl.n.01', 'synonyms': ['owl'], 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'id': 754, 'synset': 'packet.n.03', 'synonyms': ['packet'], 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'id': 755, 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'id': 756, 'synset': 'pad.n.04', 'synonyms': ['pad'], 'def': 'a flat mass of soft material used for protection, stuffing, or comfort', 'name': 'pad'}, {'frequency': 'c', 'id': 757, 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'id': 758, 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'r', 'id': 759, 'synset': 'paintbox.n.01', 'synonyms': ['paintbox'], 'def': "a box containing a collection of cubes or tubes of artists' paint", 'name': 'paintbox'}, {'frequency': 'c', 'id': 760, 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'id': 761, 'synset': 'painting.n.01', 'synonyms': ['painting'], 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'c', 'id': 762, 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'id': 763, 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'id': 764, 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'def': 'cooking utensil consisting of a wide 
metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'id': 765, 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'id': 766, 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'id': 767, 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'id': 768, 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'r', 'id': 769, 'synset': 'paper_clip.n.01', 'synonyms': ['paperclip'], 'def': 'a wire or plastic clip for holding sheets of paper together', 'name': 'paperclip'}, {'frequency': 'f', 'id': 770, 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'id': 771, 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'id': 772, 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'id': 773, 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'id': 774, 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'r', 'id': 775, 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'id': 776, 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'r', 'id': 777, 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'r', 'id': 778, 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'id': 779, 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'id': 780, 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'id': 781, 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'id': 782, 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'r', 'id': 783, 'synset': 'passport.n.02', 'synonyms': ['passport'], 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home 
country', 'name': 'passport'}, {'frequency': 'f', 'id': 784, 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'id': 785, 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'id': 786, 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'id': 787, 'synset': 'peach.n.03', 'synonyms': ['peach'], 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'id': 788, 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'c', 'id': 789, 'synset': 'pear.n.01', 'synonyms': ['pear'], 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'r', 'id': 790, 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'id': 791, 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'id': 792, 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'id': 793, 'synset': 'pen.n.01', 'synonyms': ['pen'], 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'c', 'id': 794, 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'id': 795, 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'id': 796, 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'id': 797, 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'id': 798, 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'id': 799, 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'id': 800, 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'c', 'id': 801, 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'id': 802, 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'id': 803, 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'def': 'a toiletry that emits and diffuses a fragrant odor', 
'name': 'perfume'}, {'frequency': 'r', 'id': 804, 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'id': 805, 'synset': 'person.n.01', 'synonyms': ['baby', 'child', 'boy', 'girl', 'man', 'woman', 'person', 'human'], 'def': 'a human being', 'name': 'baby'}, {'frequency': 'r', 'id': 806, 'synset': 'pet.n.01', 'synonyms': ['pet'], 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'r', 'id': 807, 'synset': 'petfood.n.01', 'synonyms': ['petfood', 'pet-food'], 'def': 'food prepared for animal pets', 'name': 'petfood'}, {'frequency': 'r', 'id': 808, 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'id': 809, 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'id': 810, 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'c', 'id': 811, 'synset': 'piano.n.01', 'synonyms': ['piano'], 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'id': 812, 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'id': 813, 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'id': 814, 'synset': 'pie.n.01', 'synonyms': ['pie'], 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'id': 815, 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'id': 816, 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'id': 817, 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'id': 818, 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'id': 819, 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'id': 820, 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'id': 821, 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'id': 822, 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a 
stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'id': 823, 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'id': 824, 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'id': 825, 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'r', 'id': 826, 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'id': 827, 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'id': 828, 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'id': 829, 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'id': 830, 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'id': 831, 'synset': 'plate.n.04', 'synonyms': ['plate'], 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'id': 832, 'synset': 'platter.n.01', 'synonyms': ['platter'], 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'id': 833, 'synset': 'playing_card.n.01', 'synonyms': ['playing_card'], 'def': 'one of a pack of cards that are used to play card games', 'name': 'playing_card'}, {'frequency': 'r', 'id': 834, 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'id': 835, 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'id': 836, 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'id': 837, 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'id': 838, 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'id': 839, 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'id': 840, 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'r', 'id': 841, 'synset': 
'police_van.n.01', 'synonyms': ['police_van', 'police_wagon', 'paddy_wagon', 'patrol_wagon'], 'def': 'van used by police to transport prisoners', 'name': 'police_van'}, {'frequency': 'f', 'id': 842, 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'id': 843, 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'id': 844, 'synset': 'pony.n.05', 'synonyms': ['pony'], 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'id': 845, 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'id': 846, 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'r', 'id': 847, 'synset': 'portrait.n.02', 'synonyms': ['portrait', 'portrayal'], 'def': 'any likeness of a person, in any medium', 'name': 'portrait'}, {'frequency': 'c', 'id': 848, 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'id': 849, 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'id': 850, 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'id': 851, 'synset': 'pot.n.01', 'synonyms': ['pot'], 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'id': 852, 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'id': 853, 'synset': 'potato.n.01', 'synonyms': ['potato'], 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'id': 854, 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'id': 855, 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'id': 856, 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'r', 'id': 857, 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'id': 858, 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'f', 'id': 859, 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'id': 860, 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'def': 'a weapon that is forcibly thrown or projected at a target', 'name': 
'projectile_(weapon)'}, {'frequency': 'c', 'id': 861, 'synset': 'projector.n.02', 'synonyms': ['projector'], 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'id': 862, 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'id': 863, 'synset': 'prune.n.01', 'synonyms': ['prune'], 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'id': 864, 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'id': 865, 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'id': 866, 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'id': 867, 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'id': 868, 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'id': 869, 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'id': 870, 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'r', 'id': 871, 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'id': 872, 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'id': 873, 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'id': 874, 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'id': 875, 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'id': 876, 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'id': 877, 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'id': 878, 'synset': 'radar.n.01', 'synonyms': ['radar'], 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'c', 'id': 879, 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, 
{'frequency': 'c', 'id': 880, 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'id': 881, 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'id': 882, 'synset': 'raft.n.01', 'synonyms': ['raft'], 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'id': 883, 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'id': 884, 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'id': 885, 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'id': 886, 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'id': 887, 'synset': 'rat.n.01', 'synonyms': ['rat'], 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'id': 888, 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'def': 'a blade that has a very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'id': 889, 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'id': 890, 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'def': 'car mirror that reflects the view out of the rear window', 'name': 'rearview_mirror'}, {'frequency': 'c', 'id': 891, 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'id': 892, 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'r', 'id': 893, 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'r', 'id': 894, 'synset': 'red_cabbage.n.02', 'synonyms': ['red_cabbage'], 'def': 'compact head of purplish-red leaves', 'name': 'red_cabbage'}, {'frequency': 'f', 'id': 895, 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'id': 896, 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'id': 897, 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the 
snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'id': 898, 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'r', 'id': 899, 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'id': 900, 'synset': 'ring.n.08', 'synonyms': ['ring'], 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'id': 901, 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'id': 902, 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'id': 903, 'synset': 'robe.n.01', 'synonyms': ['robe'], 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'id': 904, 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'id': 905, 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'id': 906, 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'id': 907, 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'id': 908, 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'id': 909, 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'id': 910, 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'id': 911, 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'id': 912, 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'id': 913, 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'id': 914, 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'id': 915, 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'id': 916, 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'c', 'id': 917, 'synset': 'sail.n.01', 'synonyms': ['sail'], 'def': 'a large piece of fabric by means of 
which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'c', 'id': 918, 'synset': 'salad.n.01', 'synonyms': ['salad'], 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'id': 919, 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'r', 'id': 920, 'synset': 'salami.n.01', 'synonyms': ['salami'], 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'r', 'id': 921, 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'id': 922, 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'r', 'id': 923, 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'id': 924, 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'id': 925, 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'id': 926, 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'id': 927, 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'id': 928, 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'id': 929, 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'id': 930, 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'id': 931, 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'id': 932, 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'id': 933, 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'id': 934, 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'id': 935, 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'id': 936, 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, 
{'frequency': 'f', 'id': 937, 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'c', 'id': 938, 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'c', 'id': 939, 'synset': 'scrambled_eggs.n.01', 'synonyms': ['scrambled_eggs'], 'def': 'eggs beaten and cooked to a soft firm consistency while stirring', 'name': 'scrambled_eggs'}, {'frequency': 'r', 'id': 940, 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'r', 'id': 941, 'synset': 'scratcher.n.03', 'synonyms': ['scratcher'], 'def': 'a device used for scratching', 'name': 'scratcher'}, {'frequency': 'c', 'id': 942, 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'c', 'id': 943, 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'id': 944, 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'r', 'id': 945, 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'r', 'id': 946, 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'id': 947, 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'id': 948, 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'r', 'id': 949, 'synset': 'seedling.n.01', 'synonyms': ['seedling'], 'def': 'young plant or tree grown from a seed', 'name': 'seedling'}, {'frequency': 'c', 'id': 950, 'synset': 'serving_dish.n.01', 'synonyms': ['serving_dish'], 'def': 'a dish used for serving food', 'name': 'serving_dish'}, {'frequency': 'r', 'id': 951, 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'r', 'id': 952, 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'id': 953, 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'r', 'id': 954, 'synset': 'shark.n.01', 'synonyms': ['shark'], 'def': 'typically large carnivorous fishes with sharp teeth', 'name': 'shark'}, {'frequency': 'r', 'id': 955, 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'id': 956, 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'id': 957, 'synset': 
'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'id': 958, 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'def': 'toiletry that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'id': 959, 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'id': 960, 'synset': 'shears.n.01', 'synonyms': ['shears'], 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'id': 961, 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'id': 962, 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'id': 963, 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'r', 'id': 964, 'synset': 'shield.n.02', 'synonyms': ['shield'], 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'id': 965, 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'id': 966, 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'c', 'id': 967, 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'id': 968, 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'id': 969, 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'id': 970, 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'c', 'id': 971, 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'id': 972, 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'id': 973, 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'f', 'id': 974, 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'id': 975, 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'r', 'id': 976, 'synset': 'sieve.n.01', 'synonyms': ['sieve', 
'screen_(sieve)'], 'def': 'a strainer for separating lumps from powdered material or grading particles', 'name': 'sieve'}, {'frequency': 'f', 'id': 977, 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'id': 978, 'synset': 'silo.n.01', 'synonyms': ['silo'], 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'id': 979, 'synset': 'sink.n.01', 'synonyms': ['sink'], 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'id': 980, 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'id': 981, 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'id': 982, 'synset': 'ski.n.01', 'synonyms': ['ski'], 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'id': 983, 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'id': 984, 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'id': 985, 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'id': 986, 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'c', 'id': 987, 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'id': 988, 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'id': 989, 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'id': 990, 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'id': 991, 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'id': 992, 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'id': 993, 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'id': 994, 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'id': 995, 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'def': 'tracked vehicle for 
travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'id': 996, 'synset': 'soap.n.01', 'synonyms': ['soap'], 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'id': 997, 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'id': 998, 'synset': 'sock.n.01', 'synonyms': ['sock'], 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'r', 'id': 999, 'synset': 'soda_fountain.n.02', 'synonyms': ['soda_fountain'], 'def': 'an apparatus for dispensing soda water', 'name': 'soda_fountain'}, {'frequency': 'r', 'id': 1000, 'synset': 'soda_water.n.01', 'synonyms': ['carbonated_water', 'club_soda', 'seltzer', 'sparkling_water'], 'def': 'effervescent beverage artificially charged with carbon dioxide', 'name': 'carbonated_water'}, {'frequency': 'f', 'id': 1001, 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'id': 1002, 'synset': 'softball.n.01', 'synonyms': ['softball'], 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'id': 1003, 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'id': 1004, 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'c', 'id': 1005, 'synset': 'soup.n.01', 'synonyms': ['soup'], 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'id': 1006, 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'id': 1007, 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'id': 1008, 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'id': 1009, 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'id': 1010, 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'id': 1011, 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'id': 1012, 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'id': 1013, 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'id': 1014, 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 
'eyeglasses', 'glasses'], 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'id': 1015, 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'r', 'id': 1016, 'synset': 'spider.n.01', 'synonyms': ['spider'], 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'c', 'id': 1017, 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'id': 1018, 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'id': 1019, 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'id': 1020, 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'id': 1021, 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'c', 'id': 1022, 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'r', 'id': 1023, 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'id': 1024, 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'id': 1025, 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'id': 1026, 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'r', 'id': 1027, 'synset': 'steamer.n.02', 'synonyms': ['steamer_(kitchen_appliance)'], 'def': 'a cooking utensil that can be used to cook food by steaming it', 'name': 'steamer_(kitchen_appliance)'}, {'frequency': 'f', 'id': 1028, 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'id': 1029, 'synset': 'stencil.n.01', 'synonyms': ['stencil'], 'def': 'a sheet of material (metal, plastic, etc.) 
that has been perforated with a pattern; ink or paint can pass through the perforations to create the printed pattern on the surface below', 'name': 'stencil'}, {'frequency': 'r', 'id': 1030, 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'id': 1031, 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'id': 1032, 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'id': 1033, 'synset': 'stew.n.02', 'synonyms': ['stew'], 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'id': 1034, 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'id': 1035, 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'c', 'id': 1036, 'synset': 'stocking.n.01', 'synonyms': ['stockings_(leg_wear)'], 'def': 'close-fitting hosiery to cover the foot and leg; come in matched pairs', 'name': 'stockings_(leg_wear)'}, {'frequency': 'f', 'id': 1037, 'synset': 'stool.n.01', 'synonyms': ['stool'], 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'id': 1038, 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'id': 1039, 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'id': 1040, 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'id': 1041, 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'id': 1042, 'synset': 'strap.n.01', 'synonyms': ['strap'], 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'id': 1043, 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'id': 1044, 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'id': 1045, 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'id': 1046, 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'id': 1047, 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'id': 1048, 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'def': 'a pointed tool for writing or drawing or engraving', 'name': 
'stylus'}, {'frequency': 'r', 'id': 1049, 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'id': 1050, 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'id': 1051, 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'c', 'id': 1052, 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'id': 1053, 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'id': 1054, 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'id': 1055, 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'r', 'id': 1056, 'synset': 'sunscreen.n.01', 'synonyms': ['sunscreen', 'sunblock'], 'def': 'a cream spread on the skin; contains a chemical to filter out ultraviolet light and so protect from sunburn', 'name': 'sunscreen'}, {'frequency': 'f', 'id': 1057, 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'id': 1058, 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'id': 1059, 'synset': 'swab.n.02', 'synonyms': ['mop'], 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'id': 1060, 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'id': 1061, 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'id': 1062, 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'id': 1063, 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'id': 1064, 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'id': 1065, 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'id': 1066, 'synset': 'sword.n.01', 'synonyms': ['sword'], 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'id': 1067, 
'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'id': 1068, 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'id': 1069, 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'id': 1070, 'synset': 'table.n.02', 'synonyms': ['table'], 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'id': 1071, 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'id': 1072, 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'id': 1073, 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'id': 1074, 'synset': 'taco.n.02', 'synonyms': ['taco'], 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'id': 1075, 'synset': 'tag.n.02', 'synonyms': ['tag'], 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'id': 1076, 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'id': 1077, 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'id': 1078, 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'c', 'id': 1079, 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'id': 1080, 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'c', 'id': 1081, 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'id': 1082, 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'id': 1083, 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'id': 1084, 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'id': 1085, 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'def': 'a cloth having a 
crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'id': 1086, 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'r', 'id': 1087, 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'id': 1088, 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'id': 1089, 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'c', 'id': 1090, 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'id': 1091, 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'id': 1092, 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'def': 'electronic device for communicating by voice over long distances', 'name': 'telephone'}, {'frequency': 'c', 'id': 1093, 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'id': 1094, 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'id': 1095, 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'id': 1096, 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'id': 1097, 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'id': 1098, 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'id': 1099, 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'id': 1100, 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'id': 1101, 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'id': 1102, 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'c', 'id': 1103, 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'def': 'a regulator for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'id': 1104, 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'def': 'a small metal cap to protect the finger while sewing; 
can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'id': 1105, 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'id': 1106, 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'id': 1107, 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'id': 1108, 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'id': 1109, 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'id': 1110, 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'id': 1111, 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'r', 'id': 1112, 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'id': 1113, 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'id': 1114, 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'id': 1115, 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'c', 'id': 1116, 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'id': 1117, 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'id': 1118, 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'id': 1119, 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'c', 'id': 1120, 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'id': 1121, 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'id': 1122, 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'id': 1123, 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, 
{'frequency': 'c', 'id': 1124, 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'c', 'id': 1125, 'synset': 'top.n.09', 'synonyms': ['cover'], 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'id': 1126, 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'id': 1127, 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'id': 1128, 'synset': 'towel.n.01', 'synonyms': ['towel'], 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'id': 1129, 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'id': 1130, 'synset': 'toy.n.03', 'synonyms': ['toy'], 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'id': 1131, 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'id': 1132, 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'r', 'id': 1133, 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'c', 'id': 1134, 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'id': 1135, 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'id': 1136, 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'id': 1137, 'synset': 'tray.n.01', 'synonyms': ['tray'], 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'id': 1138, 'synset': 'tree_house.n.01', 'synonyms': ['tree_house'], 'def': '(NOT A TREE) a PLAYHOUSE built in the branches of a tree', 'name': 'tree_house'}, {'frequency': 'r', 'id': 1139, 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'id': 1140, 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'r', 'id': 1141, 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'def': 'a vehicle with three 
wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'c', 'id': 1142, 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'id': 1143, 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'id': 1144, 'synset': 'truck.n.01', 'synonyms': ['truck'], 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'id': 1145, 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'id': 1146, 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'id': 1147, 'synset': 'tub.n.02', 'synonyms': ['vat'], 'def': 'a large open vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'id': 1148, 'synset': 'turban.n.01', 'synonyms': ['turban'], 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'r', 'id': 1149, 'synset': 'turkey.n.01', 'synonyms': ['turkey_(bird)'], 'def': 'large gallinaceous bird with fan-shaped tail; widely domesticated for food', 'name': 'turkey_(bird)'}, {'frequency': 'c', 'id': 1150, 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'id': 1151, 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'id': 1152, 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'r', 'id': 1153, 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'r', 'id': 1154, 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'id': 1155, 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'c', 'id': 1156, 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'id': 1157, 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'c', 'id': 1158, 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'r', 'id': 1159, 'synset': 'urn.n.01', 'synonyms': ['urn'], 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'id': 1160, 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'c', 'id': 1161, 'synset': 
'valve.n.03', 'synonyms': ['valve'], 'def': 'control consisting of a mechanical device for controlling the flow of a fluid', 'name': 'valve'}, {'frequency': 'f', 'id': 1162, 'synset': 'vase.n.01', 'synonyms': ['vase'], 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'id': 1163, 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'id': 1164, 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'c', 'id': 1165, 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'id': 1166, 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'id': 1167, 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'id': 1168, 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'r', 'id': 1169, 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'id': 1170, 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'id': 1171, 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'id': 1172, 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'id': 1173, 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'id': 1174, 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'id': 1175, 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'id': 1176, 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'id': 1177, 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'c', 'id': 1178, 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'id': 1179, 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'id': 1180, 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'def': 'a tall piece of furniture that provides 
storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'id': 1181, 'synset': 'wasabi.n.02', 'synonyms': ['wasabi'], 'def': 'the thick green root of the wasabi plant that the Japanese use in cooking and that tastes like strong horseradish', 'name': 'wasabi'}, {'frequency': 'c', 'id': 1182, 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'id': 1183, 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'id': 1184, 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'id': 1185, 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'id': 1186, 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'id': 1187, 'synset': 'water_filter.n.01', 'synonyms': ['water_filter'], 'def': 'a filter to remove impurities from the water supply', 'name': 'water_filter'}, {'frequency': 'r', 'id': 1188, 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'r', 'id': 1189, 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'id': 1190, 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'id': 1191, 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'id': 1192, 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'id': 1193, 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'id': 1194, 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'c', 'id': 1195, 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'id': 1196, 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'id': 1197, 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'id': 1198, 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'def': 'a rich cake with two or more 
tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'id': 1199, 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'id': 1200, 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'id': 1201, 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'id': 1202, 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'id': 1203, 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'r', 'id': 1204, 'synset': 'whiskey.n.01', 'synonyms': ['whiskey'], 'def': 'a liquor made from fermented mash of grain', 'name': 'whiskey'}, {'frequency': 'r', 'id': 1205, 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'r', 'id': 1206, 'synset': 'wick.n.02', 'synonyms': ['wick'], 'def': 'a loosely woven cord in a candle or oil lamp that is lit on fire', 'name': 'wick'}, {'frequency': 'c', 'id': 1207, 'synset': 'wig.n.01', 'synonyms': ['wig'], 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'id': 1208, 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'id': 1209, 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'def': 'a mill that is powered by the wind', 'name': 'windmill'}, {'frequency': 'c', 'id': 1210, 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'id': 1211, 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'id': 1212, 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'id': 1213, 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'r', 'id': 1214, 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'id': 1215, 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'r', 'id': 1216, 'synset': 'wing_chair.n.01', 'synonyms': ['wing_chair'], 'def': 'easy chair having wings on each side of a high back', 'name': 'wing_chair'}, {'frequency': 'c', 'id': 1217, 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 
'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'id': 1218, 'synset': 'wok.n.01', 'synonyms': ['wok'], 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'id': 1219, 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'id': 1220, 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'id': 1221, 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'id': 1222, 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'c', 'id': 1223, 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'id': 1224, 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'r', 'id': 1225, 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'r', 'id': 1226, 'synset': 'yak.n.02', 'synonyms': ['yak'], 'def': 'large long-haired wild ox of Tibet often domesticated', 'name': 'yak'}, {'frequency': 'c', 'id': 1227, 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'r', 'id': 1228, 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'id': 1229, 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}] # noqa
+# fmt: on
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v1_categories.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v1_categories.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb768cf8267b20e690c3810b2e7f18751768b856
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v1_categories.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Autogen with
+# with open("lvis_v1_val.json", "r") as f:
+#     a = json.load(f)
+#     c = a["categories"]
+#     for x in c:
+#         del x["image_count"]
+#         del x["instance_count"]
+# LVIS_CATEGORIES = repr(c) + " # noqa"
+# with open("/tmp/lvis_categories.py", "wt") as f:
+#     f.write(f"LVIS_CATEGORIES = {LVIS_CATEGORIES}")
+# Then paste the contents of that file below
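+#
+# A minimal, standalone version of the snippet above (an editorial sketch, not
+# part of the upstream autogen note; it assumes "lvis_v1_val.json" is the LVIS
+# v1.0 validation annotation file obtained separately, and adds the missing
+# `import json`):
+#
+# import json
+# with open("lvis_v1_val.json", "r") as f:
+#     a = json.load(f)
+# c = a["categories"]
+# for x in c:
+#     del x["image_count"]
+#     del x["instance_count"]
+# with open("/tmp/lvis_categories.py", "wt") as f:
+#     f.write(f"LVIS_CATEGORIES = {repr(c)}  # noqa")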
+
+# fmt: off
+LVIS_CATEGORIES = [{'frequency': 'c', 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'id': 1, 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'id': 2, 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'id': 3, 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'f', 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'id': 4, 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'id': 5, 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'c', 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'id': 6, 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'synset': 'almond.n.02', 'synonyms': ['almond'], 'id': 7, 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'id': 8, 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'c', 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'id': 9, 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'id': 10, 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'id': 11, 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'synset': 'apple.n.01', 'synonyms': ['apple'], 'id': 12, 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'id': 13, 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'id': 14, 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'synset': 'apron.n.01', 'synonyms': ['apron'], 'id': 15, 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'id': 16, 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'r', 'synset': 'arctic.n.02', 'synonyms': ['arctic_(type_of_shoe)', 'galosh', 'golosh', 'rubber_(type_of_shoe)', 'gumshoe'], 'id': 17, 'def': 'a waterproof overshoe that protects shoes from water or snow', 'name': 'arctic_(type_of_shoe)'}, {'frequency': 'c', 'synset': 'armband.n.02', 'synonyms': ['armband'], 'id': 18, 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'id': 19, 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'id': 20, 'def': 'a large wardrobe or 
cabinet', 'name': 'armoire'}, {'frequency': 'r', 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'id': 21, 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'id': 22, 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'id': 23, 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'id': 24, 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'id': 25, 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'id': 26, 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'f', 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'id': 27, 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'id': 28, 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'synset': 'awning.n.01', 'synonyms': ['awning'], 'id': 29, 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'id': 30, 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'r', 'synset': 'baboon.n.01', 'synonyms': ['baboon'], 'id': 31, 'def': 'large terrestrial monkeys having doglike muzzles', 'name': 'baboon'}, {'frequency': 'f', 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'id': 32, 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'id': 33, 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'id': 34, 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'id': 35, 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'id': 36, 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'id': 37, 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'id': 38, 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'id': 39, 'def': 
'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'id': 40, 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'synset': 'ball.n.06', 'synonyms': ['ball'], 'id': 41, 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'id': 42, 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'id': 43, 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'id': 44, 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'synset': 'banana.n.02', 'synonyms': ['banana'], 'id': 45, 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'c', 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'id': 46, 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'id': 47, 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'f', 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'id': 48, 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'id': 49, 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'id': 50, 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'id': 51, 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'synset': 'barge.n.01', 'synonyms': ['barge'], 'id': 52, 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'id': 53, 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'id': 54, 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'id': 55, 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'id': 56, 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'id': 57, 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'id': 58, 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'id': 59, 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'synset': 
'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'id': 60, 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'id': 61, 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'id': 62, 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'id': 63, 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'c', 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'id': 64, 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'id': 65, 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'id': 66, 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'id': 67, 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'id': 68, 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'id': 69, 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'synset': 'battery.n.02', 'synonyms': ['battery'], 'id': 70, 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'id': 71, 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'synset': 'bead.n.01', 'synonyms': ['bead'], 'id': 72, 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'c', 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'id': 73, 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'id': 74, 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'id': 75, 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'synset': 'bear.n.01', 'synonyms': ['bear'], 'id': 76, 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'synset': 'bed.n.01', 'synonyms': ['bed'], 'id': 77, 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'r', 'synset': 'bedpan.n.01', 'synonyms': ['bedpan'], 'id': 78, 'def': 'a shallow vessel used by a bedridden patient for defecation and urination', 'name': 'bedpan'}, {'frequency': 'f', 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'id': 79, 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'synset': 
'beef.n.01', 'synonyms': ['cow'], 'id': 80, 'def': 'cattle/cow', 'name': 'cow'}, {'frequency': 'f', 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'id': 81, 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'id': 82, 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'id': 83, 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'id': 84, 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'id': 85, 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'synset': 'bell.n.01', 'synonyms': ['bell'], 'id': 86, 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'id': 87, 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'synset': 'belt.n.02', 'synonyms': ['belt'], 'id': 88, 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'id': 89, 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'synset': 'bench.n.01', 'synonyms': ['bench'], 'id': 90, 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'synset': 'beret.n.01', 'synonyms': ['beret'], 'id': 91, 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'synset': 'bib.n.02', 'synonyms': ['bib'], 'id': 92, 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'id': 93, 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'id': 94, 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'id': 95, 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'f', 'synset': 'billboard.n.01', 'synonyms': ['billboard'], 'id': 96, 'def': 'large outdoor signboard', 'name': 'billboard'}, {'frequency': 'c', 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'id': 97, 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'id': 98, 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'synset': 'bird.n.01', 'synonyms': ['bird'], 'id': 99, 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'c', 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'id': 100, 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'c', 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'id': 101, 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'synset': 
'birdcage.n.01', 'synonyms': ['birdcage'], 'id': 102, 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'id': 103, 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'id': 104, 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'id': 105, 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'id': 106, 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'id': 107, 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'synset': 'blackberry.n.01', 'synonyms': ['blackberry'], 'id': 108, 'def': 'large sweet black or very dark purple edible aggregate fruit', 'name': 'blackberry'}, {'frequency': 'f', 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'id': 109, 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'id': 110, 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'id': 111, 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'id': 112, 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'id': 113, 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'f', 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'id': 114, 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'f', 'synset': 'blouse.n.01', 'synonyms': ['blouse'], 'id': 115, 'def': 'a top worn by women', 'name': 'blouse'}, {'frequency': 'f', 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'id': 116, 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'id': 117, 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'id': 118, 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'r', 'synset': 'bob.n.05', 'synonyms': ['bob', 'bobber', 'bobfloat'], 'id': 119, 'def': 'a small float usually made of cork; attached to a fishing line', 'name': 'bob'}, {'frequency': 'c', 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'id': 120, 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'c', 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'id': 121, 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'id': 122, 
'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'id': 123, 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'id': 124, 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'id': 125, 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'id': 126, 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'synset': 'book.n.01', 'synonyms': ['book'], 'id': 127, 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'c', 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'id': 128, 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'id': 129, 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'id': 130, 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'id': 131, 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'synset': 'boot.n.01', 'synonyms': ['boot'], 'id': 132, 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'id': 133, 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'id': 134, 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'id': 135, 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'id': 136, 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'id': 137, 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'id': 138, 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'id': 139, 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'id': 140, 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'id': 141, 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'id': 142, 'def': 'a large ball with finger holes used in the sport of 
bowling', 'name': 'bowling_ball'}, {'frequency': 'f', 'synset': 'box.n.01', 'synonyms': ['box'], 'id': 143, 'def': 'a (usually rectangular) container; may have a lid', 'name': 'box'}, {'frequency': 'r', 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'id': 144, 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'id': 145, 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'id': 146, 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'id': 147, 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'id': 148, 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'id': 149, 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'f', 'synset': 'bread.n.01', 'synonyms': ['bread'], 'id': 150, 'def': 'food made from dough of flour or meal and usually raised with yeast or baking powder and then baked', 'name': 'bread'}, {'frequency': 'r', 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'id': 151, 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'f', 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'id': 152, 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'id': 153, 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'f', 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'id': 154, 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'id': 155, 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'synset': 'broom.n.01', 'synonyms': ['broom'], 'id': 156, 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'id': 157, 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'id': 158, 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'id': 159, 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'id': 160, 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'id': 161, 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'synset': 'bull.n.11', 'synonyms': ['horned_cow'], 'id': 162, 'def': 'a cow with horns', 'name': 'bull'}, {'frequency': 'c', 'synset': 'bulldog.n.01', 
'synonyms': ['bulldog'], 'id': 163, 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'id': 164, 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'id': 165, 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'id': 166, 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'id': 167, 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'id': 168, 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'f', 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'id': 169, 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'id': 170, 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'id': 171, 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'id': 172, 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'id': 173, 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'id': 174, 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'f', 'synset': 'butter.n.01', 'synonyms': ['butter'], 'id': 175, 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'id': 176, 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'synset': 'button.n.01', 'synonyms': ['button'], 'id': 177, 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'id': 178, 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'id': 179, 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'c', 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'id': 180, 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'id': 181, 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 
'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'id': 182, 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'synset': 'cake.n.03', 'synonyms': ['cake'], 'id': 183, 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'id': 184, 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'id': 185, 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'synset': 'calf.n.01', 'synonyms': ['calf'], 'id': 186, 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'id': 187, 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'synset': 'camel.n.01', 'synonyms': ['camel'], 'id': 188, 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'synset': 'camera.n.01', 'synonyms': ['camera'], 'id': 189, 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'id': 190, 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'id': 191, 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'id': 192, 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'id': 193, 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'f', 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'id': 194, 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'id': 195, 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'id': 196, 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'id': 197, 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'id': 198, 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'id': 199, 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'c', 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'id': 200, 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'c', 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'id': 201, 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'id': 202, 'def': 'a flask for carrying 
water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'f', 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'id': 203, 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'id': 204, 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'c', 'synset': 'cape.n.02', 'synonyms': ['cape'], 'id': 205, 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'id': 206, 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'id': 207, 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'id': 208, 'def': 'a wheeled vehicle adapted to the rails of railroad (mark each individual railcar separately)', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'id': 209, 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'id': 210, 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'id': 211, 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'synset': 'card.n.03', 'synonyms': ['card'], 'id': 212, 'def': 'a rectangular piece of paper used to send messages (e.g. 
greetings or pictures)', 'name': 'card'}, {'frequency': 'c', 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'id': 213, 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'id': 214, 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'id': 215, 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'id': 216, 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'id': 217, 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'f', 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'id': 218, 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'synset': 'cart.n.01', 'synonyms': ['cart'], 'id': 219, 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'synset': 'carton.n.02', 'synonyms': ['carton'], 'id': 220, 'def': 'a container made of cardboard for holding food or drink', 'name': 'carton'}, {'frequency': 'c', 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'id': 221, 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'id': 222, 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'id': 223, 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'id': 224, 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'synset': 'cat.n.01', 'synonyms': ['cat'], 'id': 225, 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'f', 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'id': 226, 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'c', 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'id': 227, 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'id': 228, 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'f', 'synset': 'celery.n.01', 'synonyms': ['celery'], 'id': 229, 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'id': 230, 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'id': 231, 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 
'chain_mail'}, {'frequency': 'f', 'synset': 'chair.n.01', 'synonyms': ['chair'], 'id': 232, 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'id': 233, 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'synset': 'chalice.n.01', 'synonyms': ['chalice'], 'id': 234, 'def': 'a bowl-shaped drinking vessel; especially the Eucharistic cup', 'name': 'chalice'}, {'frequency': 'f', 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'id': 235, 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'synset': 'chap.n.04', 'synonyms': ['chap'], 'id': 236, 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'id': 237, 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'id': 238, 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'id': 239, 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'id': 240, 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'c', 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'id': 241, 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'id': 242, 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'c', 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'id': 243, 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'id': 244, 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'id': 245, 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'id': 246, 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'id': 247, 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'id': 248, 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'id': 249, 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'id': 250, 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'id': 251, 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 
'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'id': 252, 'def': 'shirt collar, animal collar, or tight-fitting necklace', 'name': 'choker'}, {'frequency': 'f', 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'id': 253, 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'f', 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'id': 254, 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'id': 255, 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'synset': 'chute.n.02', 'synonyms': ['slide'], 'id': 256, 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'id': 257, 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'id': 258, 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'f', 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'id': 259, 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'id': 260, 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'id': 261, 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'id': 262, 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'c', 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'id': 263, 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'id': 264, 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'synset': 'cleat.n.02', 'synonyms': ['cleat_(for_securing_rope)'], 'id': 265, 'def': 'a fastener (usually with two projecting horns) around which a rope can be secured', 'name': 'cleat_(for_securing_rope)'}, {'frequency': 'r', 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'id': 266, 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'synset': 'clip.n.03', 'synonyms': ['clip'], 'id': 267, 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'id': 268, 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'r', 'synset': 'clipper.n.03', 'synonyms': ['clippers_(for_plants)'], 'id': 269, 'def': 'shears for cutting grass or shrubbery (often used in the plural)', 'name': 'clippers_(for_plants)'}, {'frequency': 'r', 'synset': 'cloak.n.02', 'synonyms': ['cloak'], 'id': 270, 'def': 'a loose outer garment', 'name': 'cloak'}, {'frequency': 'f', 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'id': 271, 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'synset': 
'clock_tower.n.01', 'synonyms': ['clock_tower'], 'id': 272, 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'id': 273, 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'id': 274, 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'id': 275, 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'id': 276, 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'synset': 'coat.n.01', 'synonyms': ['coat'], 'id': 277, 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'id': 278, 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'c', 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'id': 279, 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'id': 280, 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'r', 'synset': 'cockroach.n.01', 'synonyms': ['cockroach'], 'id': 281, 'def': 'any of numerous chiefly nocturnal insects; some are domestic pests', 'name': 'cockroach'}, {'frequency': 'r', 'synset': 'cocoa.n.01', 'synonyms': ['cocoa_(beverage)', 'hot_chocolate_(beverage)', 'drinking_chocolate'], 'id': 282, 'def': 'a beverage made from cocoa powder and milk and sugar; usually drunk hot', 'name': 'cocoa_(beverage)'}, {'frequency': 'c', 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'id': 283, 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'f', 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'id': 284, 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'id': 285, 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'id': 286, 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'synset': 'coil.n.05', 'synonyms': ['coil'], 'id': 287, 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'synset': 'coin.n.01', 'synonyms': ['coin'], 'id': 288, 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'c', 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'id': 289, 'def': 'bowl-shaped strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'id': 290, 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'id': 291, 
'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'id': 292, 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'id': 293, 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'id': 294, 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'r', 'synset': 'compass.n.01', 'synonyms': ['compass'], 'id': 295, 'def': 'navigational instrument for finding directions', 'name': 'compass'}, {'frequency': 'f', 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'id': 296, 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'f', 'synset': 'condiment.n.01', 'synonyms': ['condiment'], 'id': 297, 'def': 'a preparation (a sauce or relish or spice) to enhance flavor or enjoyment', 'name': 'condiment'}, {'frequency': 'f', 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'id': 298, 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'id': 299, 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'id': 300, 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'id': 301, 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'r', 'synset': 'cooker.n.01', 'synonyms': ['cooker'], 'id': 302, 'def': 'a utensil for cooking', 'name': 'cooker'}, {'frequency': 'f', 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'id': 303, 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'id': 304, 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'id': 305, 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'f', 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'id': 306, 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'id': 307, 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'c', 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'id': 308, 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'f', 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'id': 309, 'def': 'ears or kernels of corn that can be prepared and served for human food (only mark individual ears or kernels)', 'name': 'edible_corn'}, {'frequency': 'r', 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'id': 310, 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 
'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'id': 311, 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'id': 312, 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'id': 313, 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'c', 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'id': 314, 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'c', 'synset': 'costume.n.04', 'synonyms': ['costume'], 'id': 315, 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'id': 316, 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'id': 317, 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'c', 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'id': 318, 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'id': 319, 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'c', 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'id': 320, 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'r', 'synset': 'crab.n.05', 'synonyms': ['crabmeat'], 'id': 321, 'def': 'the edible flesh of any of various crabs', 'name': 'crabmeat'}, {'frequency': 'c', 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'id': 322, 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'id': 323, 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'synset': 'crate.n.01', 'synonyms': ['crate'], 'id': 324, 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'c', 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'id': 325, 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'id': 326, 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'c', 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'id': 327, 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'id': 328, 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'id': 329, 'def': 'an earthen jar (made of baked clay) or a modern electric crockpot', 'name': 'crock_pot'}, {'frequency': 'f', 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'id': 330, 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'synset': 'crouton.n.01', 'synonyms': 
['crouton'], 'id': 331, 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'c', 'synset': 'crow.n.01', 'synonyms': ['crow'], 'id': 332, 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'r', 'synset': 'crowbar.n.01', 'synonyms': ['crowbar', 'wrecking_bar', 'pry_bar'], 'id': 333, 'def': 'a heavy iron lever with one end forged into a wedge', 'name': 'crowbar'}, {'frequency': 'c', 'synset': 'crown.n.04', 'synonyms': ['crown'], 'id': 334, 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'id': 335, 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'id': 336, 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'id': 337, 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'f', 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'id': 338, 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'c', 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'id': 339, 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'id': 340, 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'c', 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'id': 341, 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'id': 342, 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'id': 343, 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'synset': 'cup.n.01', 'synonyms': ['cup'], 'id': 344, 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'id': 345, 'def': 'a metal award or cup-shaped vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'f', 'synset': 'cupboard.n.01', 'synonyms': ['cupboard', 'closet'], 'id': 346, 'def': 'a small room (or recess) or cabinet used for storage space', 'name': 'cupboard'}, {'frequency': 'f', 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'id': 347, 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'id': 348, 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'id': 349, 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'id': 350, 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, 
{'frequency': 'f', 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'id': 351, 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'id': 352, 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'id': 353, 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'id': 354, 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'synset': 'dalmatian.n.02', 'synonyms': ['dalmatian'], 'id': 355, 'def': 'a large breed having a smooth white coat with black or brown spots', 'name': 'dalmatian'}, {'frequency': 'c', 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'id': 356, 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'id': 357, 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'id': 358, 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'id': 359, 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'id': 360, 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'synset': 'desk.n.01', 'synonyms': ['desk'], 'id': 361, 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'id': 362, 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'id': 363, 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'id': 364, 'def': 'yearly planner book', 'name': 'diary'}, {'frequency': 'r', 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'id': 365, 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'id': 366, 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'id': 367, 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'id': 368, 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'f', 'synset': 'dish.n.01', 'synonyms': ['dish'], 'id': 369, 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'id': 370, 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, 
{'frequency': 'c', 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'id': 371, 'def': 'a cloth for washing dishes or cleaning in general', 'name': 'dishrag'}, {'frequency': 'f', 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'id': 372, 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'id': 373, 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid', 'dishsoap'], 'id': 374, 'def': 'dishsoap or dish detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'f', 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'id': 375, 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'r', 'synset': 'diving_board.n.01', 'synonyms': ['diving_board'], 'id': 376, 'def': 'a springboard from which swimmers can dive', 'name': 'diving_board'}, {'frequency': 'f', 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'id': 377, 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'synset': 'dog.n.01', 'synonyms': ['dog'], 'id': 378, 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'id': 379, 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'f', 'synset': 'doll.n.01', 'synonyms': ['doll'], 'id': 380, 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'id': 381, 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'synset': 'dollhouse.n.01', 'synonyms': ['dollhouse', "doll's_house"], 'id': 382, 'def': "a house so small that it is likened to a child's plaything", 'name': 'dollhouse'}, {'frequency': 'c', 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'id': 383, 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'id': 384, 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'f', 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'id': 385, 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'id': 386, 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'id': 387, 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'synset': 'dove.n.01', 'synonyms': ['dove'], 'id': 388, 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'id': 389, 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'id': 390, 'def': 'a boxlike container in a piece of furniture; made so as to slide in and 
out', 'name': 'drawer'}, {'frequency': 'c', 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'id': 391, 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'id': 392, 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'id': 393, 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'f', 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'id': 394, 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'f', 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'id': 395, 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'synset': 'drill.n.01', 'synonyms': ['drill'], 'id': 396, 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'synset': 'drone.n.04', 'synonyms': ['drone'], 'id': 397, 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'id': 398, 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'id': 399, 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'id': 400, 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'synset': 'duck.n.01', 'synonyms': ['duck'], 'id': 401, 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'c', 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'id': 402, 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'id': 403, 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'id': 404, 'def': 'a large cylindrical bag of heavy cloth (does not include suitcases)', 'name': 'duffel_bag'}, {'frequency': 'r', 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'id': 405, 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'id': 406, 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'id': 407, 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'c', 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'id': 408, 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'id': 409, 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'id': 410, 'def': 'a 
soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'synset': 'earring.n.01', 'synonyms': ['earring'], 'id': 411, 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'synset': 'easel.n.01', 'synonyms': ['easel'], 'id': 412, 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'id': 413, 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'synset': 'eel.n.01', 'synonyms': ['eel'], 'id': 414, 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'id': 415, 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'id': 416, 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'id': 417, 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'id': 418, 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'id': 419, 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'id': 420, 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'id': 421, 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'id': 422, 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'c', 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'id': 423, 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'id': 424, 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'id': 425, 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'id': 426, 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'id': 427, 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'id': 428, 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'synset': 'fan.n.01', 'synonyms': ['fan'], 'id': 429, 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'id': 430, 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'id': 
431, 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'id': 432, 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'id': 433, 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'c', 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'id': 434, 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'id': 435, 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'id': 436, 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'id': 437, 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'id': 438, 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'id': 439, 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'id': 440, 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'f', 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'id': 441, 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'f', 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'id': 442, 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'id': 443, 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'id': 444, 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'id': 445, 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'r', 'synset': 'first-aid_kit.n.01', 'synonyms': ['first-aid_kit'], 'id': 446, 'def': 'kit consisting of a set of bandages and medicines for giving first aid', 'name': 'first-aid_kit'}, {'frequency': 'f', 'synset': 'fish.n.01', 'synonyms': ['fish'], 'id': 447, 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'c', 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'id': 448, 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'id': 449, 'def': 'a 
transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'c', 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'id': 450, 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'synset': 'flag.n.01', 'synonyms': ['flag'], 'id': 451, 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'id': 452, 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'id': 453, 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'id': 454, 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'c', 'synset': 'flap.n.01', 'synonyms': ['flap'], 'id': 455, 'def': 'any broad thin covering attached at one edge, such as a mud flap next to a wheel or a flap on an airplane wing', 'name': 'flap'}, {'frequency': 'r', 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'id': 456, 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'id': 457, 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'id': 458, 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'id': 459, 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'id': 460, 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'id': 461, 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'id': 462, 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'c', 'synset': 'foal.n.01', 'synonyms': ['foal'], 'id': 463, 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'id': 464, 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'synset': 'food_processor.n.01', 'synonyms': ['food_processor'], 'id': 465, 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'id': 466, 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'id': 467, 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'id': 468, 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'synset': 
'fork.n.01', 'synonyms': ['fork'], 'id': 469, 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'c', 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'id': 470, 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'c', 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'id': 471, 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'c', 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'id': 472, 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'id': 473, 'def': 'anything that freshens air by removing or covering odor', 'name': 'freshener'}, {'frequency': 'f', 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'id': 474, 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'id': 475, 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'id': 476, 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'f', 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'id': 477, 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'id': 478, 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'id': 479, 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'r', 'synset': 'futon.n.01', 'synonyms': ['futon'], 'id': 480, 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'id': 481, 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'id': 482, 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'id': 483, 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'id': 484, 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'id': 485, 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'id': 486, 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'id': 487, 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'id': 488, 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'c', 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'id': 489, 'def': 'small swift graceful antelope of 
Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'id': 490, 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'id': 491, 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'r', 'synset': 'generator.n.02', 'synonyms': ['generator'], 'id': 492, 'def': 'engine that converts mechanical energy into electrical energy by electromagnetic induction', 'name': 'generator'}, {'frequency': 'c', 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'id': 493, 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'id': 494, 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'id': 495, 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'id': 496, 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'id': 497, 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'id': 498, 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'synset': 'globe.n.03', 'synonyms': ['globe'], 'id': 499, 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'synset': 'glove.n.02', 'synonyms': ['glove'], 'id': 500, 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'synset': 'goat.n.01', 'synonyms': ['goat'], 'id': 501, 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'id': 502, 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'id': 503, 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'c', 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'id': 504, 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'id': 505, 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'id': 506, 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'synset': 'goose.n.01', 'synonyms': ['goose'], 'id': 507, 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'id': 508, 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 
'id': 509, 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'f', 'synset': 'grape.n.01', 'synonyms': ['grape'], 'id': 510, 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'c', 'synset': 'grater.n.01', 'synonyms': ['grater'], 'id': 511, 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'id': 512, 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'id': 513, 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'f', 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'id': 514, 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'f', 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'id': 515, 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'id': 516, 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'f', 'synset': 'grill.n.02', 'synonyms': ['grill', 'grille', 'grillwork', 'radiator_grille'], 'id': 517, 'def': 'a framework of metal bars used as a partition or a grate', 'name': 'grill'}, {'frequency': 'r', 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'id': 518, 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'id': 519, 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'id': 520, 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'f', 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'id': 521, 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'id': 522, 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'synset': 'gun.n.01', 'synonyms': ['gun'], 'id': 523, 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'f', 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'id': 524, 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'id': 525, 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'id': 526, 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'r', 'synset': 'halter.n.03', 'synonyms': ['halter_top'], 'id': 527, 'def': "a woman's top that fastens behind the back and neck leaving the back and arms uncovered", 'name': 'halter_top'}, {'frequency': 'f', 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'id': 528, 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 
'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'id': 529, 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'id': 530, 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'c', 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'id': 531, 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'id': 532, 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'c', 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'id': 533, 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'f', 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'id': 534, 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'id': 535, 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'id': 536, 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'id': 537, 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'id': 538, 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'id': 539, 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'id': 540, 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'id': 541, 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'id': 542, 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'id': 543, 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'synset': 'hat.n.01', 'synonyms': ['hat'], 'id': 544, 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'id': 545, 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'c', 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'id': 546, 'def': 'a garment that covers the head OR face', 'name': 'veil'}, {'frequency': 'f', 'synset': 'headband.n.01', 'synonyms': ['headband'], 'id': 547, 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'id': 548, 'def': 'a 
vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'id': 549, 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'id': 550, 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'synset': 'headset.n.01', 'synonyms': ['headset'], 'id': 551, 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'id': 552, 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'c', 'synset': 'heart.n.02', 'synonyms': ['heart'], 'id': 553, 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'id': 554, 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'id': 555, 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'id': 556, 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'synset': 'heron.n.02', 'synonyms': ['heron'], 'id': 557, 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'id': 558, 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'id': 559, 'def': 'a joint that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'id': 560, 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'id': 561, 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'id': 562, 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'id': 563, 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'synset': 'honey.n.01', 'synonyms': ['honey'], 'id': 564, 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'id': 565, 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'synset': 'hook.n.05', 'synonyms': ['hook'], 'id': 566, 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'r', 'synset': 'hookah.n.01', 'synonyms': ['hookah', 'narghile', 'nargileh', 'sheesha', 'shisha', 'water_pipe'], 'id': 567, 
'def': 'a tobacco pipe with a long flexible tube connected to a container where the smoke is cooled by passing through water', 'name': 'hookah'}, {'frequency': 'r', 'synset': 'hornet.n.01', 'synonyms': ['hornet'], 'id': 568, 'def': 'large stinging wasp', 'name': 'hornet'}, {'frequency': 'f', 'synset': 'horse.n.01', 'synonyms': ['horse'], 'id': 569, 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'id': 570, 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'id': 571, 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'id': 572, 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'id': 573, 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'id': 574, 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'id': 575, 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'c', 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'id': 576, 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'id': 577, 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'f', 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'id': 578, 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'id': 579, 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'id': 580, 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'id': 581, 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'id': 582, 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'id': 583, 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'c', 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'id': 584, 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'id': 585, 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'f', 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'id': 586, 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 
'smoothing_iron_(for_clothing)'], 'id': 587, 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'c', 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'id': 588, 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'id': 589, 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'c', 'synset': 'jam.n.01', 'synonyms': ['jam'], 'id': 590, 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'synset': 'jar.n.01', 'synonyms': ['jar'], 'id': 591, 'def': 'a vessel (usually cylindrical) with a wide mouth and without handles', 'name': 'jar'}, {'frequency': 'f', 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'id': 592, 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'id': 593, 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'id': 594, 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'id': 595, 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'id': 596, 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'r', 'synset': 'jewel.n.01', 'synonyms': ['jewel', 'gem', 'precious_stone'], 'id': 597, 'def': 'a precious or semiprecious stone incorporated into a piece of jewelry', 'name': 'jewel'}, {'frequency': 'c', 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'id': 598, 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'id': 599, 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'c', 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'id': 600, 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'id': 601, 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'synset': 'keg.n.02', 'synonyms': ['keg'], 'id': 602, 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'id': 603, 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'id': 604, 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'synset': 'key.n.01', 'synonyms': ['key'], 'id': 605, 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'id': 606, 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'c', 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'id': 607, 'def': 'a knee-length pleated tartan 
skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'id': 608, 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'id': 609, 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'r', 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'id': 610, 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'synset': 'kite.n.03', 'synonyms': ['kite'], 'id': 611, 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'id': 612, 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'id': 613, 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'id': 614, 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'synset': 'knife.n.01', 'synonyms': ['knife'], 'id': 615, 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'id': 616, 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'synset': 'knob.n.02', 'synonyms': ['knob'], 'id': 617, 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'id': 618, 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'id': 619, 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'id': 620, 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'id': 621, 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'id': 622, 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'c', 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'id': 623, 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'f', 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'id': 624, 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'id': 625, 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'id': 626, 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'id': 627, 'def': 'a metal post supporting 
an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'id': 628, 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'id': 629, 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'id': 630, 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'id': 631, 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'id': 632, 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'f', 'synset': 'latch.n.02', 'synonyms': ['latch'], 'id': 633, 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'id': 634, 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'synset': 'leather.n.01', 'synonyms': ['leather'], 'id': 635, 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'id': 636, 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'id': 637, 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'r', 'synset': 'legume.n.02', 'synonyms': ['legume'], 'id': 638, 'def': 'the fruit or seed of bean or pea plants', 'name': 'legume'}, {'frequency': 'f', 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'id': 639, 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'id': 640, 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'id': 641, 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'id': 642, 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'id': 643, 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'id': 644, 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'id': 645, 'def': 'lightblub/source of light', 'name': 'lightbulb'}, {'frequency': 'r', 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'id': 646, 'def': 'a metallic conductor that is attached to a 
high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'f', 'synset': 'lime.n.06', 'synonyms': ['lime'], 'id': 647, 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'id': 648, 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'c', 'synset': 'lion.n.01', 'synonyms': ['lion'], 'id': 649, 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'id': 650, 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'r', 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'id': 651, 'def': 'liquor or beer', 'name': 'liquor'}, {'frequency': 'c', 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'id': 652, 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'f', 'synset': 'log.n.01', 'synonyms': ['log'], 'id': 653, 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'id': 654, 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'f', 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'id': 655, 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'id': 656, 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'id': 657, 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'id': 658, 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'id': 659, 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'c', 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'id': 660, 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'f', 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'id': 661, 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'synset': 'mallard.n.01', 'synonyms': ['mallard'], 'id': 662, 'def': 'wild dabbling duck from which domestic ducks are descended', 'name': 'mallard'}, {'frequency': 'r', 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'id': 663, 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'id': 664, 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'r', 'synset': 'manatee.n.01', 'synonyms': ['manatee'], 'id': 665, 'def': 'sirenian mammal of tropical coastal waters of America', 'name': 'manatee'}, {'frequency': 'c', 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'id': 666, 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'id': 667, 'def': 'a container (usually in a barn or stable) 
from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'id': 668, 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'f', 'synset': 'map.n.01', 'synonyms': ['map'], 'id': 669, 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'f', 'synset': 'marker.n.03', 'synonyms': ['marker'], 'id': 670, 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'synset': 'martini.n.01', 'synonyms': ['martini'], 'id': 671, 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'id': 672, 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'id': 673, 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'synset': 'masher.n.02', 'synonyms': ['masher'], 'id': 674, 'def': 'a kitchen utensil used for mashing (e.g. potatoes)', 'name': 'masher'}, {'frequency': 'f', 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'id': 675, 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'synset': 'mast.n.01', 'synonyms': ['mast'], 'id': 676, 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'id': 677, 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'id': 678, 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'id': 679, 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'id': 680, 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'id': 681, 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'id': 682, 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'id': 683, 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'c', 'synset': 'melon.n.01', 'synonyms': ['melon'], 'id': 684, 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'id': 685, 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'id': 686, 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'id': 687, 'def': 'kitchen appliance that 
cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'id': 688, 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'f', 'synset': 'milk.n.01', 'synonyms': ['milk'], 'id': 689, 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'r', 'synset': 'milk_can.n.01', 'synonyms': ['milk_can'], 'id': 690, 'def': 'can for transporting milk', 'name': 'milk_can'}, {'frequency': 'r', 'synset': 'milkshake.n.01', 'synonyms': ['milkshake'], 'id': 691, 'def': 'frothy drink of milk and flavoring and sometimes fruit or ice cream', 'name': 'milkshake'}, {'frequency': 'f', 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'id': 692, 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'id': 693, 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'id': 694, 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'id': 695, 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'id': 696, 'def': 'a kitchen utensil that is used for mixing foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'synset': 'money.n.03', 'synonyms': ['money'], 'id': 697, 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'id': 698, 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'id': 699, 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'synset': 'motor.n.01', 'synonyms': ['motor'], 'id': 700, 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'id': 701, 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'id': 702, 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'f', 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'id': 703, 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'id': 704, 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'f', 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'id': 705, 'def': 'a computer input device that controls an on-screen pointer (does not include trackpads / touchpads)', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'id': 706, 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 
'mousepad'}, {'frequency': 'c', 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'id': 707, 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'synset': 'mug.n.04', 'synonyms': ['mug'], 'id': 708, 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'id': 709, 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'id': 710, 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'c', 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'id': 711, 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'id': 712, 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'f', 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'id': 713, 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'id': 714, 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'id': 715, 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'id': 716, 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'c', 'synset': 'needle.n.03', 'synonyms': ['needle'], 'id': 717, 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'synset': 'nest.n.01', 'synonyms': ['nest'], 'id': 718, 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'f', 'synset': 'newspaper.n.01', 'synonyms': ['newspaper', 'paper_(newspaper)'], 'id': 719, 'def': 'a daily or weekly publication on folded sheets containing news, articles, and advertisements', 'name': 'newspaper'}, {'frequency': 'c', 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'id': 720, 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'id': 721, 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'id': 722, 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'c', 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'id': 723, 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'id': 724, 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'synset': 'notepad.n.01', 'synonyms': 
['notepad'], 'id': 725, 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'f', 'synset': 'nut.n.03', 'synonyms': ['nut'], 'id': 726, 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'id': 727, 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'f', 'synset': 'oar.n.01', 'synonyms': ['oar'], 'id': 728, 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'id': 729, 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'id': 730, 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'id': 731, 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'id': 732, 'def': 'oil from olives', 'name': 'olive_oil'}, {'frequency': 'r', 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'id': 733, 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'synset': 'onion.n.01', 'synonyms': ['onion'], 'id': 734, 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'id': 735, 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'id': 736, 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'c', 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'id': 737, 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'f', 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'id': 738, 'def': 'a thick standalone cushion used as a seat or footrest, often next to a chair', 'name': 'ottoman'}, {'frequency': 'f', 'synset': 'oven.n.01', 'synonyms': ['oven'], 'id': 739, 'def': 'kitchen appliance used for baking or roasting', 'name': 'oven'}, {'frequency': 'c', 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'id': 740, 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'synset': 'owl.n.01', 'synonyms': ['owl'], 'id': 741, 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'synset': 'packet.n.03', 'synonyms': ['packet'], 'id': 742, 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'id': 743, 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'synset': 'pad.n.04', 'synonyms': ['pad'], 'id': 744, 'def': 'mostly arm/knee pads labeled', 'name': 'pad'}, {'frequency': 'f', 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'id': 745, 'def': 'a short light oar used without an oarlock to propel a canoe or small 
boat', 'name': 'paddle'}, {'frequency': 'c', 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'id': 746, 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'c', 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'id': 747, 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'synset': 'painting.n.01', 'synonyms': ['painting'], 'id': 748, 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'f', 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'id': 749, 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'id': 750, 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'id': 751, 'def': 'cooking utensil consisting of a wide metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'id': 752, 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'id': 753, 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'id': 754, 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'id': 755, 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'f', 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'id': 756, 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'id': 757, 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'id': 758, 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'id': 759, 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'id': 760, 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'c', 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'id': 761, 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'id': 762, 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'c', 'synset': 'parasol.n.01', 'synonyms': ['parasol', 'sunshade'], 'id': 763, 'def': 'a handheld collapsible source of shade', 'name': 'parasol'}, {'frequency': 'r', 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'id': 764, 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'c', 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'id': 765, 
'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'id': 766, 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'id': 767, 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'id': 768, 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'id': 769, 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'c', 'synset': 'passport.n.02', 'synonyms': ['passport'], 'id': 770, 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home country', 'name': 'passport'}, {'frequency': 'f', 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'id': 771, 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'id': 772, 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'id': 773, 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'synset': 'peach.n.03', 'synonyms': ['peach'], 'id': 774, 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'id': 775, 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'f', 'synset': 'pear.n.01', 'synonyms': ['pear'], 'id': 776, 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'c', 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'id': 777, 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'synset': 'peg.n.04', 'synonyms': ['wooden_leg', 'pegleg'], 'id': 778, 'def': 'a prosthesis that replaces a missing leg', 'name': 'wooden_leg'}, {'frequency': 'r', 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'id': 779, 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'id': 780, 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'synset': 'pen.n.01', 'synonyms': ['pen'], 'id': 781, 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'f', 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'id': 782, 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'id': 783, 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'id': 784, 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'synset': 'pendulum.n.01', 'synonyms': 
['pendulum'], 'id': 785, 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'id': 786, 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'id': 787, 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'id': 788, 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'f', 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'id': 789, 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'id': 790, 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'id': 791, 'def': 'a toiletry that emits and diffuses a fragrant odor', 'name': 'perfume'}, {'frequency': 'r', 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'id': 792, 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'synset': 'person.n.01', 'synonyms': ['person', 'baby', 'child', 'boy', 'girl', 'man', 'woman', 'human'], 'id': 793, 'def': 'a human being', 'name': 'person'}, {'frequency': 'c', 'synset': 'pet.n.01', 'synonyms': ['pet'], 'id': 794, 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'c', 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'id': 795, 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'id': 796, 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'id': 797, 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'f', 'synset': 'piano.n.01', 'synonyms': ['piano'], 'id': 798, 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'id': 799, 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'id': 800, 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'synset': 'pie.n.01', 'synonyms': ['pie'], 'id': 801, 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'id': 802, 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'id': 803, 'def': "a child's coin bank (often shaped like a pig)", 'name': 
'piggy_bank'}, {'frequency': 'f', 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'id': 804, 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'id': 805, 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'id': 806, 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'id': 807, 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'id': 808, 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'id': 809, 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'id': 810, 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'id': 811, 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'id': 812, 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'c', 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'id': 813, 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'id': 814, 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'id': 815, 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'id': 816, 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. 
tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'id': 817, 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'synset': 'plate.n.04', 'synonyms': ['plate'], 'id': 818, 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'synset': 'platter.n.01', 'synonyms': ['platter'], 'id': 819, 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'id': 820, 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'id': 821, 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'id': 822, 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'synset': 'plume.n.02', 'synonyms': ['plume'], 'id': 823, 'def': 'a feather or cluster of feathers worn as an ornament', 'name': 'plume'}, {'frequency': 'r', 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'id': 824, 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'id': 825, 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'id': 826, 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'id': 827, 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'f', 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'id': 828, 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'id': 829, 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'synset': 'pony.n.05', 'synonyms': ['pony'], 'id': 830, 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'id': 831, 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'id': 832, 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'c', 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'id': 833, 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'id': 834, 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'id': 
835, 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'synset': 'pot.n.01', 'synonyms': ['pot'], 'id': 836, 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'id': 837, 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'synset': 'potato.n.01', 'synonyms': ['potato'], 'id': 838, 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'id': 839, 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'id': 840, 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'id': 841, 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'c', 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'id': 842, 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'id': 843, 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'c', 'synset': 'pretzel.n.01', 'synonyms': ['pretzel'], 'id': 844, 'def': 'glazed and salted cracker typically in the shape of a loose knot', 'name': 'pretzel'}, {'frequency': 'f', 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'id': 845, 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'id': 846, 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 'projectile_(weapon)'}, {'frequency': 'c', 'synset': 'projector.n.02', 'synonyms': ['projector'], 'id': 847, 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'id': 848, 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'synset': 'prune.n.01', 'synonyms': ['prune'], 'id': 849, 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'id': 850, 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'id': 851, 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'id': 852, 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'id': 853, 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'id': 854, 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'id': 855, 'def': 'a tool for 
making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'id': 856, 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'c', 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'id': 857, 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'id': 858, 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'id': 859, 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'id': 860, 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'id': 861, 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'id': 862, 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'id': 863, 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'synset': 'radar.n.01', 'synonyms': ['radar'], 'id': 864, 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'f', 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'id': 865, 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, {'frequency': 'c', 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'id': 866, 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'id': 867, 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'synset': 'raft.n.01', 'synonyms': ['raft'], 'id': 868, 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'id': 869, 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'id': 870, 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'id': 871, 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'id': 872, 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'synset': 'rat.n.01', 'synonyms': ['rat'], 'id': 873, 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'id': 874, 'def': 'a blade that has very sharp edge', 'name': 
'razorblade'}, {'frequency': 'c', 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'id': 875, 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'id': 876, 'def': 'vehicle mirror (side or rearview)', 'name': 'rearview_mirror'}, {'frequency': 'c', 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'id': 877, 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'id': 878, 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'c', 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'id': 879, 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'f', 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'id': 880, 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'id': 881, 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'id': 882, 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'id': 883, 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'c', 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'id': 884, 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'synset': 'ring.n.08', 'synonyms': ['ring'], 'id': 885, 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'id': 886, 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'id': 887, 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'synset': 'robe.n.01', 'synonyms': ['robe'], 'id': 888, 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'id': 889, 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'synset': 'rodent.n.01', 'synonyms': ['rodent'], 'id': 890, 'def': 'relatively small placental mammals having a single pair of constantly growing incisor teeth specialized for gnawing', 'name': 'rodent'}, {'frequency': 'r', 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'id': 891, 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'id': 892, 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'id': 893, 'def': 
'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'id': 894, 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'id': 895, 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'id': 896, 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'id': 897, 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'id': 898, 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'id': 899, 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'id': 900, 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'id': 901, 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'id': 902, 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'f', 'synset': 'sail.n.01', 'synonyms': ['sail'], 'id': 903, 'def': 'a large piece of fabric by means of which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'f', 'synset': 'salad.n.01', 'synonyms': ['salad'], 'id': 904, 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'id': 905, 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'c', 'synset': 'salami.n.01', 'synonyms': ['salami'], 'id': 906, 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'c', 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'id': 907, 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'id': 908, 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'c', 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'id': 909, 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'id': 910, 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'id': 911, 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, 
{'frequency': 'f', 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'id': 912, 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'id': 913, 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'id': 914, 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'id': 915, 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'id': 916, 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'id': 917, 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'id': 918, 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'id': 919, 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'id': 920, 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'id': 921, 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'id': 922, 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, {'frequency': 'f', 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'id': 923, 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'f', 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'id': 924, 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'r', 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'id': 925, 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'c', 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'id': 926, 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'f', 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'id': 927, 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'id': 928, 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'c', 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'id': 929, 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'c', 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'id': 930, 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'id': 931, 'def': 
'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'id': 932, 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'c', 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'id': 933, 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'c', 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'id': 934, 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'id': 935, 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'c', 'synset': 'shark.n.01', 'synonyms': ['shark'], 'id': 936, 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'id': 937, 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'id': 938, 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'synset': 'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'id': 939, 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'id': 940, 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'id': 941, 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'synset': 'shears.n.01', 'synonyms': ['shears'], 'id': 942, 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'id': 943, 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'id': 944, 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'id': 945, 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'c', 'synset': 'shield.n.02', 'synonyms': ['shield'], 'id': 946, 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'id': 947, 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'id': 948, 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'f', 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'id': 949, 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'id': 950, 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'synset': 'short_pants.n.01', 'synonyms': 
['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'id': 951, 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'id': 952, 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'f', 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'id': 953, 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'id': 954, 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'id': 955, 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'r', 'synset': 'shower_cap.n.01', 'synonyms': ['shower_cap'], 'id': 956, 'def': 'a tight cap worn to keep hair dry while showering', 'name': 'shower_cap'}, {'frequency': 'f', 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'id': 957, 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'id': 958, 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'f', 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'id': 959, 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'synset': 'silo.n.01', 'synonyms': ['silo'], 'id': 960, 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'synset': 'sink.n.01', 'synonyms': ['sink'], 'id': 961, 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'id': 962, 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'id': 963, 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'synset': 'ski.n.01', 'synonyms': ['ski'], 'id': 964, 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'id': 965, 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'id': 966, 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'id': 967, 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'id': 968, 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'r', 'synset': 'skullcap.n.01', 'synonyms': ['skullcap'], 'id': 969, 'def': 'rounded brimless cap fitting the crown of the head', 'name': 'skullcap'}, {'frequency': 'c', 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'id': 970, 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'synset': 'sleeping_bag.n.01', 'synonyms': 
['sleeping_bag'], 'id': 971, 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'id': 972, 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'id': 973, 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'id': 974, 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'id': 975, 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'id': 976, 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'id': 977, 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'id': 978, 'def': 'tracked vehicle for travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'synset': 'soap.n.01', 'synonyms': ['soap'], 'id': 979, 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'id': 980, 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'synset': 'sock.n.01', 'synonyms': ['sock'], 'id': 981, 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'f', 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'id': 982, 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'synset': 'softball.n.01', 'synonyms': ['softball'], 'id': 983, 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'id': 984, 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'id': 985, 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'f', 'synset': 'soup.n.01', 'synonyms': ['soup'], 'id': 986, 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'id': 987, 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'id': 988, 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'id': 989, 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 
'soybean_milk', 'soymilk'], 'id': 990, 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'id': 991, 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'id': 992, 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'id': 993, 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'id': 994, 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 'eyeglasses', 'glasses'], 'id': 995, 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'id': 996, 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'c', 'synset': 'spider.n.01', 'synonyms': ['spider'], 'id': 997, 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'r', 'synset': 'spiny_lobster.n.02', 'synonyms': ['crawfish', 'crayfish'], 'id': 998, 'def': 'large edible marine crustacean having a spiny carapace but lacking the large pincers of true lobsters', 'name': 'crawfish'}, {'frequency': 'c', 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'id': 999, 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'id': 1000, 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'id': 1001, 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'id': 1002, 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'synset': 'squid.n.01', 'synonyms': ['squid_(food)', 'calamari', 'calamary'], 'id': 1003, 'def': '(Italian cuisine) squid prepared as food', 'name': 'squid_(food)'}, {'frequency': 'c', 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'id': 1004, 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'r', 'synset': 'stagecoach.n.01', 'synonyms': ['stagecoach'], 'id': 1005, 'def': 'a large coach-and-four formerly used to carry passengers and mail on regular routes between towns', 'name': 'stagecoach'}, {'frequency': 'c', 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'id': 1006, 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'c', 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'id': 1007, 'def': 'echinoderms characterized 
by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'id': 1008, 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'id': 1009, 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'id': 1010, 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'f', 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'id': 1011, 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'id': 1012, 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'id': 1013, 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'id': 1014, 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'synset': 'stew.n.02', 'synonyms': ['stew'], 'id': 1015, 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'id': 1016, 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'id': 1017, 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'f', 'synset': 'stool.n.01', 'synonyms': ['stool'], 'id': 1018, 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'id': 1019, 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'id': 1020, 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'id': 1021, 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'id': 1022, 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'synset': 'strap.n.01', 'synonyms': ['strap'], 'id': 1023, 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'id': 1024, 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'id': 1025, 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'id': 1026, 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'id': 1027, 'def': 'a lamp 
supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'id': 1028, 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'id': 1029, 'def': 'a pointed tool for writing or drawing or engraving, including pens', 'name': 'stylus'}, {'frequency': 'r', 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'id': 1030, 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'id': 1031, 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'id': 1032, 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'f', 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'id': 1033, 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'id': 1034, 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'id': 1035, 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'id': 1036, 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'f', 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'id': 1037, 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'id': 1038, 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'synset': 'swab.n.02', 'synonyms': ['mop'], 'id': 1039, 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'id': 1040, 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'id': 1041, 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'id': 1042, 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'id': 1043, 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'id': 1044, 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'id': 1045, 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 
'c', 'synset': 'sword.n.01', 'synonyms': ['sword'], 'id': 1046, 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'id': 1047, 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'id': 1048, 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'id': 1049, 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'synset': 'table.n.02', 'synonyms': ['table'], 'id': 1050, 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'id': 1051, 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'id': 1052, 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'id': 1053, 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'synset': 'taco.n.02', 'synonyms': ['taco'], 'id': 1054, 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'synset': 'tag.n.02', 'synonyms': ['tag'], 'id': 1055, 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'id': 1056, 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'id': 1057, 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'id': 1058, 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'f', 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'id': 1059, 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'id': 1060, 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'f', 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'id': 1061, 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'id': 1062, 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'id': 1063, 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 
'id': 1064, 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'id': 1065, 'def': 'a cloth having a crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'id': 1066, 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'c', 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'id': 1067, 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'id': 1068, 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'id': 1069, 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'f', 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'id': 1070, 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'id': 1071, 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'id': 1072, 'def': 'electronic device for communicating by voice over long distances (includes wired and wireless/cell phones)', 'name': 'telephone'}, {'frequency': 'c', 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'id': 1073, 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'id': 1074, 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'id': 1075, 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'id': 1076, 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'id': 1077, 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'id': 1078, 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'id': 1079, 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'id': 1080, 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'id': 1081, 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'id': 1082, 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'f', 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'id': 1083, 'def': 'a regulator for automatically regulating 
temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'id': 1084, 'def': 'a small metal cap to protect the finger while sewing; can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'id': 1085, 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'id': 1086, 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'id': 1087, 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'id': 1088, 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'id': 1089, 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'id': 1090, 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'id': 1091, 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'c', 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'id': 1092, 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'id': 1093, 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'id': 1094, 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'id': 1095, 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'f', 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'id': 1096, 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'id': 1097, 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'id': 1098, 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'id': 1099, 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'f', 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'id': 1100, 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'id': 1101, 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'id': 1102, 'def': 'small brush; has 
long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'id': 1103, 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, {'frequency': 'f', 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'id': 1104, 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'f', 'synset': 'top.n.09', 'synonyms': ['cover'], 'id': 1105, 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'id': 1106, 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'id': 1107, 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'synset': 'towel.n.01', 'synonyms': ['towel'], 'id': 1108, 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'id': 1109, 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'synset': 'toy.n.03', 'synonyms': ['toy'], 'id': 1110, 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'id': 1111, 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'id': 1112, 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'c', 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'id': 1113, 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'f', 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'id': 1114, 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'id': 1115, 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'id': 1116, 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'synset': 'tray.n.01', 'synonyms': ['tray'], 'id': 1117, 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'id': 1118, 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'id': 1119, 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'c', 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'id': 
1120, 'def': 'a vehicle with three wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'f', 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'id': 1121, 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'id': 1122, 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'synset': 'truck.n.01', 'synonyms': ['truck'], 'id': 1123, 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'id': 1124, 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'id': 1125, 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'synset': 'tub.n.02', 'synonyms': ['vat'], 'id': 1126, 'def': 'a large vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'synset': 'turban.n.01', 'synonyms': ['turban'], 'id': 1127, 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'c', 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'id': 1128, 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'id': 1129, 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'id': 1130, 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'c', 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'id': 1131, 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'c', 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'id': 1132, 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'id': 1133, 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'f', 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'id': 1134, 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'id': 1135, 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'f', 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'id': 1136, 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'c', 'synset': 'urn.n.01', 'synonyms': ['urn'], 'id': 1137, 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'id': 1138, 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'f', 'synset': 'vase.n.01', 'synonyms': ['vase'], 'id': 1139, 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'synset': 
'vending_machine.n.01', 'synonyms': ['vending_machine'], 'id': 1140, 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'id': 1141, 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'f', 'synset': 'vest.n.01', 'synonyms': ['vest', 'waistcoat'], 'id': 1142, 'def': "a man's sleeveless garment worn underneath a coat", 'name': 'vest'}, {'frequency': 'c', 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'id': 1143, 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'id': 1144, 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'id': 1145, 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'id': 1146, 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'c', 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'id': 1147, 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'id': 1148, 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'id': 1149, 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'id': 1150, 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'id': 1151, 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'id': 1152, 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'id': 1153, 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'id': 1154, 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'id': 1155, 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'f', 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'id': 1156, 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'id': 1157, 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'id': 1158, 'def': 'a tall piece of furniture that provides storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'synset': 'washbasin.n.01', 'synonyms': ['washbasin', 
'basin_(for_washing)', 'washbowl', 'washstand', 'handbasin'], 'id': 1159, 'def': 'a bathroom sink that is permanently installed and connected to a water supply and drainpipe; where you can wash your hands and face', 'name': 'washbasin'}, {'frequency': 'c', 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'id': 1160, 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'id': 1161, 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'id': 1162, 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'id': 1163, 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'id': 1164, 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'id': 1165, 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'c', 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'id': 1166, 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'id': 1167, 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'id': 1168, 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'id': 1169, 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'id': 1170, 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'id': 1171, 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'f', 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'id': 1172, 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'id': 1173, 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'id': 1174, 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'id': 1175, 'def': 'a rich cake with two or more tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'id': 1176, 'def': 'a ring given to the bride and/or groom at 
the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'id': 1177, 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'id': 1178, 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'id': 1179, 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'id': 1180, 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'c', 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'id': 1181, 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'c', 'synset': 'wig.n.01', 'synonyms': ['wig'], 'id': 1182, 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'id': 1183, 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'id': 1184, 'def': 'A mill or turbine that is powered by wind', 'name': 'windmill'}, {'frequency': 'c', 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'id': 1185, 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'id': 1186, 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'id': 1187, 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'id': 1188, 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'c', 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'id': 1189, 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'id': 1190, 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'f', 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 'id': 1191, 'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'synset': 'wok.n.01', 'synonyms': ['wok'], 'id': 1192, 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'id': 1193, 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'id': 1194, 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'id': 1195, 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 
'name': 'wreath'}, {'frequency': 'c', 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'id': 1196, 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'f', 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'id': 1197, 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'id': 1198, 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'c', 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'id': 1199, 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'c', 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'id': 1200, 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'c', 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'id': 1201, 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'id': 1202, 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'id': 1203, 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}] # noqa
+# fmt: on
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v1_category_image_count.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v1_category_image_count.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd9118ea50d791400c470739d1132c546793b507
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/lvis_v1_category_image_count.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Autogen with
+# with open("lvis_v1_train.json", "r") as f:
+# a = json.load(f)
+# c = a["categories"]
+# for x in c:
+# del x["name"]
+# del x["instance_count"]
+# del x["def"]
+# del x["synonyms"]
+# del x["frequency"]
+# del x["synset"]
+# LVIS_CATEGORY_IMAGE_COUNT = repr(c) + " # noqa"
+# with open("/tmp/lvis_category_image_count.py", "wt") as f:
+# f.write(f"LVIS_CATEGORY_IMAGE_COUNT = {LVIS_CATEGORY_IMAGE_COUNT}")
+# Then paste the contents of that file below
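+#
+# A self-contained, runnable variant of the snippet above (a sketch, not part of
+# the original autogen script): it assumes "lvis_v1_train.json" from the LVIS v1
+# release is in the working directory, and it keeps only the "id" and
+# "image_count" fields, which yields the same list as deleting the other keys.
+#
+# import json
+#
+# with open("lvis_v1_train.json", "r") as f:
+#     categories = json.load(f)["categories"]
+# counts = [{"id": x["id"], "image_count": x["image_count"]} for x in categories]
+# with open("/tmp/lvis_category_image_count.py", "wt") as f:
+#     f.write(f"LVIS_CATEGORY_IMAGE_COUNT = {counts!r}  # noqa")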
+
+# fmt: off
+LVIS_CATEGORY_IMAGE_COUNT = [{'id': 1, 'image_count': 64}, {'id': 2, 'image_count': 364}, {'id': 3, 'image_count': 1911}, {'id': 4, 'image_count': 149}, {'id': 5, 'image_count': 29}, {'id': 6, 'image_count': 26}, {'id': 7, 'image_count': 59}, {'id': 8, 'image_count': 22}, {'id': 9, 'image_count': 12}, {'id': 10, 'image_count': 28}, {'id': 11, 'image_count': 505}, {'id': 12, 'image_count': 1207}, {'id': 13, 'image_count': 4}, {'id': 14, 'image_count': 10}, {'id': 15, 'image_count': 500}, {'id': 16, 'image_count': 33}, {'id': 17, 'image_count': 3}, {'id': 18, 'image_count': 44}, {'id': 19, 'image_count': 561}, {'id': 20, 'image_count': 8}, {'id': 21, 'image_count': 9}, {'id': 22, 'image_count': 33}, {'id': 23, 'image_count': 1883}, {'id': 24, 'image_count': 98}, {'id': 25, 'image_count': 70}, {'id': 26, 'image_count': 46}, {'id': 27, 'image_count': 117}, {'id': 28, 'image_count': 41}, {'id': 29, 'image_count': 1395}, {'id': 30, 'image_count': 7}, {'id': 31, 'image_count': 1}, {'id': 32, 'image_count': 314}, {'id': 33, 'image_count': 31}, {'id': 34, 'image_count': 1905}, {'id': 35, 'image_count': 1859}, {'id': 36, 'image_count': 1623}, {'id': 37, 'image_count': 47}, {'id': 38, 'image_count': 3}, {'id': 39, 'image_count': 3}, {'id': 40, 'image_count': 1}, {'id': 41, 'image_count': 305}, {'id': 42, 'image_count': 6}, {'id': 43, 'image_count': 210}, {'id': 44, 'image_count': 36}, {'id': 45, 'image_count': 1787}, {'id': 46, 'image_count': 17}, {'id': 47, 'image_count': 51}, {'id': 48, 'image_count': 138}, {'id': 49, 'image_count': 3}, {'id': 50, 'image_count': 1470}, {'id': 51, 'image_count': 3}, {'id': 52, 'image_count': 2}, {'id': 53, 'image_count': 186}, {'id': 54, 'image_count': 76}, {'id': 55, 'image_count': 26}, {'id': 56, 'image_count': 303}, {'id': 57, 'image_count': 738}, {'id': 58, 'image_count': 1799}, {'id': 59, 'image_count': 1934}, {'id': 60, 'image_count': 1609}, {'id': 61, 'image_count': 1622}, {'id': 62, 'image_count': 41}, {'id': 63, 'image_count': 4}, {'id': 64, 'image_count': 11}, {'id': 65, 'image_count': 270}, {'id': 66, 'image_count': 349}, {'id': 67, 'image_count': 42}, {'id': 68, 'image_count': 823}, {'id': 69, 'image_count': 6}, {'id': 70, 'image_count': 48}, {'id': 71, 'image_count': 3}, {'id': 72, 'image_count': 42}, {'id': 73, 'image_count': 24}, {'id': 74, 'image_count': 16}, {'id': 75, 'image_count': 605}, {'id': 76, 'image_count': 646}, {'id': 77, 'image_count': 1765}, {'id': 78, 'image_count': 2}, {'id': 79, 'image_count': 125}, {'id': 80, 'image_count': 1420}, {'id': 81, 'image_count': 140}, {'id': 82, 'image_count': 4}, {'id': 83, 'image_count': 322}, {'id': 84, 'image_count': 60}, {'id': 85, 'image_count': 2}, {'id': 86, 'image_count': 231}, {'id': 87, 'image_count': 333}, {'id': 88, 'image_count': 1941}, {'id': 89, 'image_count': 367}, {'id': 90, 'image_count': 1922}, {'id': 91, 'image_count': 18}, {'id': 92, 'image_count': 81}, {'id': 93, 'image_count': 1}, {'id': 94, 'image_count': 1852}, {'id': 95, 'image_count': 430}, {'id': 96, 'image_count': 247}, {'id': 97, 'image_count': 94}, {'id': 98, 'image_count': 21}, {'id': 99, 'image_count': 1821}, {'id': 100, 'image_count': 16}, {'id': 101, 'image_count': 12}, {'id': 102, 'image_count': 25}, {'id': 103, 'image_count': 41}, {'id': 104, 'image_count': 244}, {'id': 105, 'image_count': 7}, {'id': 106, 'image_count': 1}, {'id': 107, 'image_count': 40}, {'id': 108, 'image_count': 40}, {'id': 109, 'image_count': 104}, {'id': 110, 'image_count': 1671}, {'id': 111, 'image_count': 49}, {'id': 112, 'image_count': 243}, 
{'id': 113, 'image_count': 2}, {'id': 114, 'image_count': 242}, {'id': 115, 'image_count': 271}, {'id': 116, 'image_count': 104}, {'id': 117, 'image_count': 8}, {'id': 118, 'image_count': 1758}, {'id': 119, 'image_count': 1}, {'id': 120, 'image_count': 48}, {'id': 121, 'image_count': 14}, {'id': 122, 'image_count': 40}, {'id': 123, 'image_count': 1}, {'id': 124, 'image_count': 37}, {'id': 125, 'image_count': 1510}, {'id': 126, 'image_count': 6}, {'id': 127, 'image_count': 1903}, {'id': 128, 'image_count': 70}, {'id': 129, 'image_count': 86}, {'id': 130, 'image_count': 7}, {'id': 131, 'image_count': 5}, {'id': 132, 'image_count': 1406}, {'id': 133, 'image_count': 1901}, {'id': 134, 'image_count': 15}, {'id': 135, 'image_count': 28}, {'id': 136, 'image_count': 6}, {'id': 137, 'image_count': 494}, {'id': 138, 'image_count': 234}, {'id': 139, 'image_count': 1922}, {'id': 140, 'image_count': 1}, {'id': 141, 'image_count': 35}, {'id': 142, 'image_count': 5}, {'id': 143, 'image_count': 1828}, {'id': 144, 'image_count': 8}, {'id': 145, 'image_count': 63}, {'id': 146, 'image_count': 1668}, {'id': 147, 'image_count': 4}, {'id': 148, 'image_count': 95}, {'id': 149, 'image_count': 17}, {'id': 150, 'image_count': 1567}, {'id': 151, 'image_count': 2}, {'id': 152, 'image_count': 103}, {'id': 153, 'image_count': 50}, {'id': 154, 'image_count': 1309}, {'id': 155, 'image_count': 6}, {'id': 156, 'image_count': 92}, {'id': 157, 'image_count': 19}, {'id': 158, 'image_count': 37}, {'id': 159, 'image_count': 4}, {'id': 160, 'image_count': 709}, {'id': 161, 'image_count': 9}, {'id': 162, 'image_count': 82}, {'id': 163, 'image_count': 15}, {'id': 164, 'image_count': 3}, {'id': 165, 'image_count': 61}, {'id': 166, 'image_count': 51}, {'id': 167, 'image_count': 5}, {'id': 168, 'image_count': 13}, {'id': 169, 'image_count': 642}, {'id': 170, 'image_count': 24}, {'id': 171, 'image_count': 255}, {'id': 172, 'image_count': 9}, {'id': 173, 'image_count': 1808}, {'id': 174, 'image_count': 31}, {'id': 175, 'image_count': 158}, {'id': 176, 'image_count': 80}, {'id': 177, 'image_count': 1884}, {'id': 178, 'image_count': 158}, {'id': 179, 'image_count': 2}, {'id': 180, 'image_count': 12}, {'id': 181, 'image_count': 1659}, {'id': 182, 'image_count': 7}, {'id': 183, 'image_count': 834}, {'id': 184, 'image_count': 57}, {'id': 185, 'image_count': 174}, {'id': 186, 'image_count': 95}, {'id': 187, 'image_count': 27}, {'id': 188, 'image_count': 22}, {'id': 189, 'image_count': 1391}, {'id': 190, 'image_count': 90}, {'id': 191, 'image_count': 40}, {'id': 192, 'image_count': 445}, {'id': 193, 'image_count': 21}, {'id': 194, 'image_count': 1132}, {'id': 195, 'image_count': 177}, {'id': 196, 'image_count': 4}, {'id': 197, 'image_count': 17}, {'id': 198, 'image_count': 84}, {'id': 199, 'image_count': 55}, {'id': 200, 'image_count': 30}, {'id': 201, 'image_count': 25}, {'id': 202, 'image_count': 2}, {'id': 203, 'image_count': 125}, {'id': 204, 'image_count': 1135}, {'id': 205, 'image_count': 19}, {'id': 206, 'image_count': 72}, {'id': 207, 'image_count': 1926}, {'id': 208, 'image_count': 159}, {'id': 209, 'image_count': 7}, {'id': 210, 'image_count': 1}, {'id': 211, 'image_count': 13}, {'id': 212, 'image_count': 35}, {'id': 213, 'image_count': 18}, {'id': 214, 'image_count': 8}, {'id': 215, 'image_count': 6}, {'id': 216, 'image_count': 35}, {'id': 217, 'image_count': 1222}, {'id': 218, 'image_count': 103}, {'id': 219, 'image_count': 28}, {'id': 220, 'image_count': 63}, {'id': 221, 'image_count': 28}, {'id': 222, 'image_count': 5}, {'id': 
223, 'image_count': 7}, {'id': 224, 'image_count': 14}, {'id': 225, 'image_count': 1918}, {'id': 226, 'image_count': 133}, {'id': 227, 'image_count': 16}, {'id': 228, 'image_count': 27}, {'id': 229, 'image_count': 110}, {'id': 230, 'image_count': 1895}, {'id': 231, 'image_count': 4}, {'id': 232, 'image_count': 1927}, {'id': 233, 'image_count': 8}, {'id': 234, 'image_count': 1}, {'id': 235, 'image_count': 263}, {'id': 236, 'image_count': 10}, {'id': 237, 'image_count': 2}, {'id': 238, 'image_count': 3}, {'id': 239, 'image_count': 87}, {'id': 240, 'image_count': 9}, {'id': 241, 'image_count': 71}, {'id': 242, 'image_count': 13}, {'id': 243, 'image_count': 18}, {'id': 244, 'image_count': 2}, {'id': 245, 'image_count': 5}, {'id': 246, 'image_count': 45}, {'id': 247, 'image_count': 1}, {'id': 248, 'image_count': 23}, {'id': 249, 'image_count': 32}, {'id': 250, 'image_count': 4}, {'id': 251, 'image_count': 1}, {'id': 252, 'image_count': 858}, {'id': 253, 'image_count': 661}, {'id': 254, 'image_count': 168}, {'id': 255, 'image_count': 210}, {'id': 256, 'image_count': 65}, {'id': 257, 'image_count': 4}, {'id': 258, 'image_count': 2}, {'id': 259, 'image_count': 159}, {'id': 260, 'image_count': 31}, {'id': 261, 'image_count': 811}, {'id': 262, 'image_count': 1}, {'id': 263, 'image_count': 42}, {'id': 264, 'image_count': 27}, {'id': 265, 'image_count': 2}, {'id': 266, 'image_count': 5}, {'id': 267, 'image_count': 95}, {'id': 268, 'image_count': 32}, {'id': 269, 'image_count': 1}, {'id': 270, 'image_count': 1}, {'id': 271, 'image_count': 1844}, {'id': 272, 'image_count': 897}, {'id': 273, 'image_count': 31}, {'id': 274, 'image_count': 23}, {'id': 275, 'image_count': 1}, {'id': 276, 'image_count': 202}, {'id': 277, 'image_count': 746}, {'id': 278, 'image_count': 44}, {'id': 279, 'image_count': 14}, {'id': 280, 'image_count': 26}, {'id': 281, 'image_count': 1}, {'id': 282, 'image_count': 2}, {'id': 283, 'image_count': 25}, {'id': 284, 'image_count': 238}, {'id': 285, 'image_count': 592}, {'id': 286, 'image_count': 26}, {'id': 287, 'image_count': 5}, {'id': 288, 'image_count': 42}, {'id': 289, 'image_count': 13}, {'id': 290, 'image_count': 46}, {'id': 291, 'image_count': 1}, {'id': 292, 'image_count': 8}, {'id': 293, 'image_count': 34}, {'id': 294, 'image_count': 5}, {'id': 295, 'image_count': 1}, {'id': 296, 'image_count': 1871}, {'id': 297, 'image_count': 717}, {'id': 298, 'image_count': 1010}, {'id': 299, 'image_count': 679}, {'id': 300, 'image_count': 3}, {'id': 301, 'image_count': 4}, {'id': 302, 'image_count': 1}, {'id': 303, 'image_count': 166}, {'id': 304, 'image_count': 2}, {'id': 305, 'image_count': 266}, {'id': 306, 'image_count': 101}, {'id': 307, 'image_count': 6}, {'id': 308, 'image_count': 14}, {'id': 309, 'image_count': 133}, {'id': 310, 'image_count': 2}, {'id': 311, 'image_count': 38}, {'id': 312, 'image_count': 95}, {'id': 313, 'image_count': 1}, {'id': 314, 'image_count': 12}, {'id': 315, 'image_count': 49}, {'id': 316, 'image_count': 5}, {'id': 317, 'image_count': 5}, {'id': 318, 'image_count': 16}, {'id': 319, 'image_count': 216}, {'id': 320, 'image_count': 12}, {'id': 321, 'image_count': 1}, {'id': 322, 'image_count': 54}, {'id': 323, 'image_count': 5}, {'id': 324, 'image_count': 245}, {'id': 325, 'image_count': 12}, {'id': 326, 'image_count': 7}, {'id': 327, 'image_count': 35}, {'id': 328, 'image_count': 36}, {'id': 329, 'image_count': 32}, {'id': 330, 'image_count': 1027}, {'id': 331, 'image_count': 10}, {'id': 332, 'image_count': 12}, {'id': 333, 'image_count': 1}, {'id': 334, 
'image_count': 67}, {'id': 335, 'image_count': 71}, {'id': 336, 'image_count': 30}, {'id': 337, 'image_count': 48}, {'id': 338, 'image_count': 249}, {'id': 339, 'image_count': 13}, {'id': 340, 'image_count': 29}, {'id': 341, 'image_count': 14}, {'id': 342, 'image_count': 236}, {'id': 343, 'image_count': 15}, {'id': 344, 'image_count': 1521}, {'id': 345, 'image_count': 25}, {'id': 346, 'image_count': 249}, {'id': 347, 'image_count': 139}, {'id': 348, 'image_count': 2}, {'id': 349, 'image_count': 2}, {'id': 350, 'image_count': 1890}, {'id': 351, 'image_count': 1240}, {'id': 352, 'image_count': 1}, {'id': 353, 'image_count': 9}, {'id': 354, 'image_count': 1}, {'id': 355, 'image_count': 3}, {'id': 356, 'image_count': 11}, {'id': 357, 'image_count': 4}, {'id': 358, 'image_count': 236}, {'id': 359, 'image_count': 44}, {'id': 360, 'image_count': 19}, {'id': 361, 'image_count': 1100}, {'id': 362, 'image_count': 7}, {'id': 363, 'image_count': 69}, {'id': 364, 'image_count': 2}, {'id': 365, 'image_count': 8}, {'id': 366, 'image_count': 5}, {'id': 367, 'image_count': 227}, {'id': 368, 'image_count': 6}, {'id': 369, 'image_count': 106}, {'id': 370, 'image_count': 81}, {'id': 371, 'image_count': 17}, {'id': 372, 'image_count': 134}, {'id': 373, 'image_count': 312}, {'id': 374, 'image_count': 8}, {'id': 375, 'image_count': 271}, {'id': 376, 'image_count': 2}, {'id': 377, 'image_count': 103}, {'id': 378, 'image_count': 1938}, {'id': 379, 'image_count': 574}, {'id': 380, 'image_count': 120}, {'id': 381, 'image_count': 2}, {'id': 382, 'image_count': 2}, {'id': 383, 'image_count': 13}, {'id': 384, 'image_count': 29}, {'id': 385, 'image_count': 1710}, {'id': 386, 'image_count': 66}, {'id': 387, 'image_count': 1008}, {'id': 388, 'image_count': 1}, {'id': 389, 'image_count': 3}, {'id': 390, 'image_count': 1942}, {'id': 391, 'image_count': 19}, {'id': 392, 'image_count': 1488}, {'id': 393, 'image_count': 46}, {'id': 394, 'image_count': 106}, {'id': 395, 'image_count': 115}, {'id': 396, 'image_count': 19}, {'id': 397, 'image_count': 2}, {'id': 398, 'image_count': 1}, {'id': 399, 'image_count': 28}, {'id': 400, 'image_count': 9}, {'id': 401, 'image_count': 192}, {'id': 402, 'image_count': 12}, {'id': 403, 'image_count': 21}, {'id': 404, 'image_count': 247}, {'id': 405, 'image_count': 6}, {'id': 406, 'image_count': 64}, {'id': 407, 'image_count': 7}, {'id': 408, 'image_count': 40}, {'id': 409, 'image_count': 542}, {'id': 410, 'image_count': 2}, {'id': 411, 'image_count': 1898}, {'id': 412, 'image_count': 36}, {'id': 413, 'image_count': 4}, {'id': 414, 'image_count': 1}, {'id': 415, 'image_count': 191}, {'id': 416, 'image_count': 6}, {'id': 417, 'image_count': 41}, {'id': 418, 'image_count': 39}, {'id': 419, 'image_count': 46}, {'id': 420, 'image_count': 1}, {'id': 421, 'image_count': 1451}, {'id': 422, 'image_count': 1878}, {'id': 423, 'image_count': 11}, {'id': 424, 'image_count': 82}, {'id': 425, 'image_count': 18}, {'id': 426, 'image_count': 1}, {'id': 427, 'image_count': 7}, {'id': 428, 'image_count': 3}, {'id': 429, 'image_count': 575}, {'id': 430, 'image_count': 1907}, {'id': 431, 'image_count': 8}, {'id': 432, 'image_count': 4}, {'id': 433, 'image_count': 32}, {'id': 434, 'image_count': 11}, {'id': 435, 'image_count': 4}, {'id': 436, 'image_count': 54}, {'id': 437, 'image_count': 202}, {'id': 438, 'image_count': 32}, {'id': 439, 'image_count': 3}, {'id': 440, 'image_count': 130}, {'id': 441, 'image_count': 119}, {'id': 442, 'image_count': 141}, {'id': 443, 'image_count': 29}, {'id': 444, 'image_count': 
525}, {'id': 445, 'image_count': 1323}, {'id': 446, 'image_count': 2}, {'id': 447, 'image_count': 113}, {'id': 448, 'image_count': 16}, {'id': 449, 'image_count': 7}, {'id': 450, 'image_count': 35}, {'id': 451, 'image_count': 1908}, {'id': 452, 'image_count': 353}, {'id': 453, 'image_count': 18}, {'id': 454, 'image_count': 14}, {'id': 455, 'image_count': 77}, {'id': 456, 'image_count': 8}, {'id': 457, 'image_count': 37}, {'id': 458, 'image_count': 1}, {'id': 459, 'image_count': 346}, {'id': 460, 'image_count': 19}, {'id': 461, 'image_count': 1779}, {'id': 462, 'image_count': 23}, {'id': 463, 'image_count': 25}, {'id': 464, 'image_count': 67}, {'id': 465, 'image_count': 19}, {'id': 466, 'image_count': 28}, {'id': 467, 'image_count': 4}, {'id': 468, 'image_count': 27}, {'id': 469, 'image_count': 1861}, {'id': 470, 'image_count': 11}, {'id': 471, 'image_count': 13}, {'id': 472, 'image_count': 13}, {'id': 473, 'image_count': 32}, {'id': 474, 'image_count': 1767}, {'id': 475, 'image_count': 42}, {'id': 476, 'image_count': 17}, {'id': 477, 'image_count': 128}, {'id': 478, 'image_count': 1}, {'id': 479, 'image_count': 9}, {'id': 480, 'image_count': 10}, {'id': 481, 'image_count': 4}, {'id': 482, 'image_count': 9}, {'id': 483, 'image_count': 18}, {'id': 484, 'image_count': 41}, {'id': 485, 'image_count': 28}, {'id': 486, 'image_count': 3}, {'id': 487, 'image_count': 65}, {'id': 488, 'image_count': 9}, {'id': 489, 'image_count': 23}, {'id': 490, 'image_count': 24}, {'id': 491, 'image_count': 1}, {'id': 492, 'image_count': 2}, {'id': 493, 'image_count': 59}, {'id': 494, 'image_count': 48}, {'id': 495, 'image_count': 17}, {'id': 496, 'image_count': 1877}, {'id': 497, 'image_count': 18}, {'id': 498, 'image_count': 1920}, {'id': 499, 'image_count': 50}, {'id': 500, 'image_count': 1890}, {'id': 501, 'image_count': 99}, {'id': 502, 'image_count': 1530}, {'id': 503, 'image_count': 3}, {'id': 504, 'image_count': 11}, {'id': 505, 'image_count': 19}, {'id': 506, 'image_count': 3}, {'id': 507, 'image_count': 63}, {'id': 508, 'image_count': 5}, {'id': 509, 'image_count': 6}, {'id': 510, 'image_count': 233}, {'id': 511, 'image_count': 54}, {'id': 512, 'image_count': 36}, {'id': 513, 'image_count': 10}, {'id': 514, 'image_count': 124}, {'id': 515, 'image_count': 101}, {'id': 516, 'image_count': 3}, {'id': 517, 'image_count': 363}, {'id': 518, 'image_count': 3}, {'id': 519, 'image_count': 30}, {'id': 520, 'image_count': 18}, {'id': 521, 'image_count': 199}, {'id': 522, 'image_count': 97}, {'id': 523, 'image_count': 32}, {'id': 524, 'image_count': 121}, {'id': 525, 'image_count': 16}, {'id': 526, 'image_count': 12}, {'id': 527, 'image_count': 2}, {'id': 528, 'image_count': 214}, {'id': 529, 'image_count': 48}, {'id': 530, 'image_count': 26}, {'id': 531, 'image_count': 13}, {'id': 532, 'image_count': 4}, {'id': 533, 'image_count': 11}, {'id': 534, 'image_count': 123}, {'id': 535, 'image_count': 7}, {'id': 536, 'image_count': 200}, {'id': 537, 'image_count': 91}, {'id': 538, 'image_count': 9}, {'id': 539, 'image_count': 72}, {'id': 540, 'image_count': 1886}, {'id': 541, 'image_count': 4}, {'id': 542, 'image_count': 1}, {'id': 543, 'image_count': 1}, {'id': 544, 'image_count': 1932}, {'id': 545, 'image_count': 4}, {'id': 546, 'image_count': 56}, {'id': 547, 'image_count': 854}, {'id': 548, 'image_count': 755}, {'id': 549, 'image_count': 1843}, {'id': 550, 'image_count': 96}, {'id': 551, 'image_count': 7}, {'id': 552, 'image_count': 74}, {'id': 553, 'image_count': 66}, {'id': 554, 'image_count': 57}, {'id': 555, 
'image_count': 44}, {'id': 556, 'image_count': 1905}, {'id': 557, 'image_count': 4}, {'id': 558, 'image_count': 90}, {'id': 559, 'image_count': 1635}, {'id': 560, 'image_count': 8}, {'id': 561, 'image_count': 5}, {'id': 562, 'image_count': 50}, {'id': 563, 'image_count': 545}, {'id': 564, 'image_count': 20}, {'id': 565, 'image_count': 193}, {'id': 566, 'image_count': 285}, {'id': 567, 'image_count': 3}, {'id': 568, 'image_count': 1}, {'id': 569, 'image_count': 1904}, {'id': 570, 'image_count': 294}, {'id': 571, 'image_count': 3}, {'id': 572, 'image_count': 5}, {'id': 573, 'image_count': 24}, {'id': 574, 'image_count': 2}, {'id': 575, 'image_count': 2}, {'id': 576, 'image_count': 16}, {'id': 577, 'image_count': 8}, {'id': 578, 'image_count': 154}, {'id': 579, 'image_count': 66}, {'id': 580, 'image_count': 1}, {'id': 581, 'image_count': 24}, {'id': 582, 'image_count': 1}, {'id': 583, 'image_count': 4}, {'id': 584, 'image_count': 75}, {'id': 585, 'image_count': 6}, {'id': 586, 'image_count': 126}, {'id': 587, 'image_count': 24}, {'id': 588, 'image_count': 22}, {'id': 589, 'image_count': 1872}, {'id': 590, 'image_count': 16}, {'id': 591, 'image_count': 423}, {'id': 592, 'image_count': 1927}, {'id': 593, 'image_count': 38}, {'id': 594, 'image_count': 3}, {'id': 595, 'image_count': 1945}, {'id': 596, 'image_count': 35}, {'id': 597, 'image_count': 1}, {'id': 598, 'image_count': 13}, {'id': 599, 'image_count': 9}, {'id': 600, 'image_count': 14}, {'id': 601, 'image_count': 37}, {'id': 602, 'image_count': 3}, {'id': 603, 'image_count': 4}, {'id': 604, 'image_count': 100}, {'id': 605, 'image_count': 195}, {'id': 606, 'image_count': 1}, {'id': 607, 'image_count': 12}, {'id': 608, 'image_count': 24}, {'id': 609, 'image_count': 489}, {'id': 610, 'image_count': 10}, {'id': 611, 'image_count': 1689}, {'id': 612, 'image_count': 42}, {'id': 613, 'image_count': 81}, {'id': 614, 'image_count': 894}, {'id': 615, 'image_count': 1868}, {'id': 616, 'image_count': 7}, {'id': 617, 'image_count': 1567}, {'id': 618, 'image_count': 10}, {'id': 619, 'image_count': 8}, {'id': 620, 'image_count': 7}, {'id': 621, 'image_count': 629}, {'id': 622, 'image_count': 89}, {'id': 623, 'image_count': 15}, {'id': 624, 'image_count': 134}, {'id': 625, 'image_count': 4}, {'id': 626, 'image_count': 1802}, {'id': 627, 'image_count': 595}, {'id': 628, 'image_count': 1210}, {'id': 629, 'image_count': 48}, {'id': 630, 'image_count': 418}, {'id': 631, 'image_count': 1846}, {'id': 632, 'image_count': 5}, {'id': 633, 'image_count': 221}, {'id': 634, 'image_count': 10}, {'id': 635, 'image_count': 7}, {'id': 636, 'image_count': 76}, {'id': 637, 'image_count': 22}, {'id': 638, 'image_count': 10}, {'id': 639, 'image_count': 341}, {'id': 640, 'image_count': 1}, {'id': 641, 'image_count': 705}, {'id': 642, 'image_count': 1900}, {'id': 643, 'image_count': 188}, {'id': 644, 'image_count': 227}, {'id': 645, 'image_count': 861}, {'id': 646, 'image_count': 6}, {'id': 647, 'image_count': 115}, {'id': 648, 'image_count': 5}, {'id': 649, 'image_count': 43}, {'id': 650, 'image_count': 14}, {'id': 651, 'image_count': 6}, {'id': 652, 'image_count': 15}, {'id': 653, 'image_count': 1167}, {'id': 654, 'image_count': 15}, {'id': 655, 'image_count': 994}, {'id': 656, 'image_count': 28}, {'id': 657, 'image_count': 2}, {'id': 658, 'image_count': 338}, {'id': 659, 'image_count': 334}, {'id': 660, 'image_count': 15}, {'id': 661, 'image_count': 102}, {'id': 662, 'image_count': 1}, {'id': 663, 'image_count': 8}, {'id': 664, 'image_count': 1}, {'id': 665, 'image_count': 
1}, {'id': 666, 'image_count': 28}, {'id': 667, 'image_count': 91}, {'id': 668, 'image_count': 260}, {'id': 669, 'image_count': 131}, {'id': 670, 'image_count': 128}, {'id': 671, 'image_count': 3}, {'id': 672, 'image_count': 10}, {'id': 673, 'image_count': 39}, {'id': 674, 'image_count': 2}, {'id': 675, 'image_count': 925}, {'id': 676, 'image_count': 354}, {'id': 677, 'image_count': 31}, {'id': 678, 'image_count': 10}, {'id': 679, 'image_count': 215}, {'id': 680, 'image_count': 71}, {'id': 681, 'image_count': 43}, {'id': 682, 'image_count': 28}, {'id': 683, 'image_count': 34}, {'id': 684, 'image_count': 16}, {'id': 685, 'image_count': 273}, {'id': 686, 'image_count': 2}, {'id': 687, 'image_count': 999}, {'id': 688, 'image_count': 4}, {'id': 689, 'image_count': 107}, {'id': 690, 'image_count': 2}, {'id': 691, 'image_count': 1}, {'id': 692, 'image_count': 454}, {'id': 693, 'image_count': 9}, {'id': 694, 'image_count': 1901}, {'id': 695, 'image_count': 61}, {'id': 696, 'image_count': 91}, {'id': 697, 'image_count': 46}, {'id': 698, 'image_count': 1402}, {'id': 699, 'image_count': 74}, {'id': 700, 'image_count': 421}, {'id': 701, 'image_count': 226}, {'id': 702, 'image_count': 10}, {'id': 703, 'image_count': 1720}, {'id': 704, 'image_count': 261}, {'id': 705, 'image_count': 1337}, {'id': 706, 'image_count': 293}, {'id': 707, 'image_count': 62}, {'id': 708, 'image_count': 814}, {'id': 709, 'image_count': 407}, {'id': 710, 'image_count': 6}, {'id': 711, 'image_count': 16}, {'id': 712, 'image_count': 7}, {'id': 713, 'image_count': 1791}, {'id': 714, 'image_count': 2}, {'id': 715, 'image_count': 1915}, {'id': 716, 'image_count': 1940}, {'id': 717, 'image_count': 13}, {'id': 718, 'image_count': 16}, {'id': 719, 'image_count': 448}, {'id': 720, 'image_count': 12}, {'id': 721, 'image_count': 18}, {'id': 722, 'image_count': 4}, {'id': 723, 'image_count': 71}, {'id': 724, 'image_count': 189}, {'id': 725, 'image_count': 74}, {'id': 726, 'image_count': 103}, {'id': 727, 'image_count': 3}, {'id': 728, 'image_count': 110}, {'id': 729, 'image_count': 5}, {'id': 730, 'image_count': 9}, {'id': 731, 'image_count': 15}, {'id': 732, 'image_count': 25}, {'id': 733, 'image_count': 7}, {'id': 734, 'image_count': 647}, {'id': 735, 'image_count': 824}, {'id': 736, 'image_count': 100}, {'id': 737, 'image_count': 47}, {'id': 738, 'image_count': 121}, {'id': 739, 'image_count': 731}, {'id': 740, 'image_count': 73}, {'id': 741, 'image_count': 49}, {'id': 742, 'image_count': 23}, {'id': 743, 'image_count': 4}, {'id': 744, 'image_count': 62}, {'id': 745, 'image_count': 118}, {'id': 746, 'image_count': 99}, {'id': 747, 'image_count': 40}, {'id': 748, 'image_count': 1036}, {'id': 749, 'image_count': 105}, {'id': 750, 'image_count': 21}, {'id': 751, 'image_count': 229}, {'id': 752, 'image_count': 7}, {'id': 753, 'image_count': 72}, {'id': 754, 'image_count': 9}, {'id': 755, 'image_count': 10}, {'id': 756, 'image_count': 328}, {'id': 757, 'image_count': 468}, {'id': 758, 'image_count': 1}, {'id': 759, 'image_count': 2}, {'id': 760, 'image_count': 24}, {'id': 761, 'image_count': 11}, {'id': 762, 'image_count': 72}, {'id': 763, 'image_count': 17}, {'id': 764, 'image_count': 10}, {'id': 765, 'image_count': 17}, {'id': 766, 'image_count': 489}, {'id': 767, 'image_count': 47}, {'id': 768, 'image_count': 93}, {'id': 769, 'image_count': 1}, {'id': 770, 'image_count': 12}, {'id': 771, 'image_count': 228}, {'id': 772, 'image_count': 5}, {'id': 773, 'image_count': 76}, {'id': 774, 'image_count': 71}, {'id': 775, 'image_count': 30}, 
{'id': 776, 'image_count': 109}, {'id': 777, 'image_count': 14}, {'id': 778, 'image_count': 1}, {'id': 779, 'image_count': 8}, {'id': 780, 'image_count': 26}, {'id': 781, 'image_count': 339}, {'id': 782, 'image_count': 153}, {'id': 783, 'image_count': 2}, {'id': 784, 'image_count': 3}, {'id': 785, 'image_count': 8}, {'id': 786, 'image_count': 47}, {'id': 787, 'image_count': 8}, {'id': 788, 'image_count': 6}, {'id': 789, 'image_count': 116}, {'id': 790, 'image_count': 69}, {'id': 791, 'image_count': 13}, {'id': 792, 'image_count': 6}, {'id': 793, 'image_count': 1928}, {'id': 794, 'image_count': 79}, {'id': 795, 'image_count': 14}, {'id': 796, 'image_count': 7}, {'id': 797, 'image_count': 20}, {'id': 798, 'image_count': 114}, {'id': 799, 'image_count': 221}, {'id': 800, 'image_count': 502}, {'id': 801, 'image_count': 62}, {'id': 802, 'image_count': 87}, {'id': 803, 'image_count': 4}, {'id': 804, 'image_count': 1912}, {'id': 805, 'image_count': 7}, {'id': 806, 'image_count': 186}, {'id': 807, 'image_count': 18}, {'id': 808, 'image_count': 4}, {'id': 809, 'image_count': 3}, {'id': 810, 'image_count': 7}, {'id': 811, 'image_count': 1413}, {'id': 812, 'image_count': 7}, {'id': 813, 'image_count': 12}, {'id': 814, 'image_count': 248}, {'id': 815, 'image_count': 4}, {'id': 816, 'image_count': 1881}, {'id': 817, 'image_count': 529}, {'id': 818, 'image_count': 1932}, {'id': 819, 'image_count': 50}, {'id': 820, 'image_count': 3}, {'id': 821, 'image_count': 28}, {'id': 822, 'image_count': 10}, {'id': 823, 'image_count': 5}, {'id': 824, 'image_count': 5}, {'id': 825, 'image_count': 18}, {'id': 826, 'image_count': 14}, {'id': 827, 'image_count': 1890}, {'id': 828, 'image_count': 660}, {'id': 829, 'image_count': 8}, {'id': 830, 'image_count': 25}, {'id': 831, 'image_count': 10}, {'id': 832, 'image_count': 218}, {'id': 833, 'image_count': 36}, {'id': 834, 'image_count': 16}, {'id': 835, 'image_count': 808}, {'id': 836, 'image_count': 479}, {'id': 837, 'image_count': 1404}, {'id': 838, 'image_count': 307}, {'id': 839, 'image_count': 57}, {'id': 840, 'image_count': 28}, {'id': 841, 'image_count': 80}, {'id': 842, 'image_count': 11}, {'id': 843, 'image_count': 92}, {'id': 844, 'image_count': 20}, {'id': 845, 'image_count': 194}, {'id': 846, 'image_count': 23}, {'id': 847, 'image_count': 52}, {'id': 848, 'image_count': 673}, {'id': 849, 'image_count': 2}, {'id': 850, 'image_count': 2}, {'id': 851, 'image_count': 1}, {'id': 852, 'image_count': 2}, {'id': 853, 'image_count': 8}, {'id': 854, 'image_count': 80}, {'id': 855, 'image_count': 3}, {'id': 856, 'image_count': 3}, {'id': 857, 'image_count': 15}, {'id': 858, 'image_count': 2}, {'id': 859, 'image_count': 10}, {'id': 860, 'image_count': 386}, {'id': 861, 'image_count': 65}, {'id': 862, 'image_count': 3}, {'id': 863, 'image_count': 35}, {'id': 864, 'image_count': 5}, {'id': 865, 'image_count': 180}, {'id': 866, 'image_count': 99}, {'id': 867, 'image_count': 49}, {'id': 868, 'image_count': 28}, {'id': 869, 'image_count': 1}, {'id': 870, 'image_count': 52}, {'id': 871, 'image_count': 36}, {'id': 872, 'image_count': 70}, {'id': 873, 'image_count': 6}, {'id': 874, 'image_count': 29}, {'id': 875, 'image_count': 24}, {'id': 876, 'image_count': 1115}, {'id': 877, 'image_count': 61}, {'id': 878, 'image_count': 18}, {'id': 879, 'image_count': 18}, {'id': 880, 'image_count': 665}, {'id': 881, 'image_count': 1096}, {'id': 882, 'image_count': 29}, {'id': 883, 'image_count': 8}, {'id': 884, 'image_count': 14}, {'id': 885, 'image_count': 1622}, {'id': 886, 'image_count': 
2}, {'id': 887, 'image_count': 3}, {'id': 888, 'image_count': 32}, {'id': 889, 'image_count': 55}, {'id': 890, 'image_count': 1}, {'id': 891, 'image_count': 10}, {'id': 892, 'image_count': 10}, {'id': 893, 'image_count': 47}, {'id': 894, 'image_count': 3}, {'id': 895, 'image_count': 29}, {'id': 896, 'image_count': 342}, {'id': 897, 'image_count': 25}, {'id': 898, 'image_count': 1469}, {'id': 899, 'image_count': 521}, {'id': 900, 'image_count': 347}, {'id': 901, 'image_count': 35}, {'id': 902, 'image_count': 7}, {'id': 903, 'image_count': 207}, {'id': 904, 'image_count': 108}, {'id': 905, 'image_count': 2}, {'id': 906, 'image_count': 34}, {'id': 907, 'image_count': 12}, {'id': 908, 'image_count': 10}, {'id': 909, 'image_count': 13}, {'id': 910, 'image_count': 361}, {'id': 911, 'image_count': 1023}, {'id': 912, 'image_count': 782}, {'id': 913, 'image_count': 2}, {'id': 914, 'image_count': 5}, {'id': 915, 'image_count': 247}, {'id': 916, 'image_count': 221}, {'id': 917, 'image_count': 4}, {'id': 918, 'image_count': 8}, {'id': 919, 'image_count': 158}, {'id': 920, 'image_count': 3}, {'id': 921, 'image_count': 752}, {'id': 922, 'image_count': 64}, {'id': 923, 'image_count': 707}, {'id': 924, 'image_count': 143}, {'id': 925, 'image_count': 1}, {'id': 926, 'image_count': 49}, {'id': 927, 'image_count': 126}, {'id': 928, 'image_count': 76}, {'id': 929, 'image_count': 11}, {'id': 930, 'image_count': 11}, {'id': 931, 'image_count': 4}, {'id': 932, 'image_count': 39}, {'id': 933, 'image_count': 11}, {'id': 934, 'image_count': 13}, {'id': 935, 'image_count': 91}, {'id': 936, 'image_count': 14}, {'id': 937, 'image_count': 5}, {'id': 938, 'image_count': 3}, {'id': 939, 'image_count': 10}, {'id': 940, 'image_count': 18}, {'id': 941, 'image_count': 9}, {'id': 942, 'image_count': 6}, {'id': 943, 'image_count': 951}, {'id': 944, 'image_count': 2}, {'id': 945, 'image_count': 1}, {'id': 946, 'image_count': 19}, {'id': 947, 'image_count': 1942}, {'id': 948, 'image_count': 1916}, {'id': 949, 'image_count': 139}, {'id': 950, 'image_count': 43}, {'id': 951, 'image_count': 1969}, {'id': 952, 'image_count': 5}, {'id': 953, 'image_count': 134}, {'id': 954, 'image_count': 74}, {'id': 955, 'image_count': 381}, {'id': 956, 'image_count': 1}, {'id': 957, 'image_count': 381}, {'id': 958, 'image_count': 6}, {'id': 959, 'image_count': 1826}, {'id': 960, 'image_count': 28}, {'id': 961, 'image_count': 1635}, {'id': 962, 'image_count': 1967}, {'id': 963, 'image_count': 16}, {'id': 964, 'image_count': 1926}, {'id': 965, 'image_count': 1789}, {'id': 966, 'image_count': 401}, {'id': 967, 'image_count': 1968}, {'id': 968, 'image_count': 1167}, {'id': 969, 'image_count': 1}, {'id': 970, 'image_count': 56}, {'id': 971, 'image_count': 17}, {'id': 972, 'image_count': 1}, {'id': 973, 'image_count': 58}, {'id': 974, 'image_count': 9}, {'id': 975, 'image_count': 8}, {'id': 976, 'image_count': 1124}, {'id': 977, 'image_count': 31}, {'id': 978, 'image_count': 16}, {'id': 979, 'image_count': 491}, {'id': 980, 'image_count': 432}, {'id': 981, 'image_count': 1945}, {'id': 982, 'image_count': 1899}, {'id': 983, 'image_count': 5}, {'id': 984, 'image_count': 28}, {'id': 985, 'image_count': 7}, {'id': 986, 'image_count': 146}, {'id': 987, 'image_count': 1}, {'id': 988, 'image_count': 25}, {'id': 989, 'image_count': 22}, {'id': 990, 'image_count': 1}, {'id': 991, 'image_count': 10}, {'id': 992, 'image_count': 9}, {'id': 993, 'image_count': 308}, {'id': 994, 'image_count': 4}, {'id': 995, 'image_count': 1969}, {'id': 996, 'image_count': 45}, 
{'id': 997, 'image_count': 12}, {'id': 998, 'image_count': 1}, {'id': 999, 'image_count': 85}, {'id': 1000, 'image_count': 1127}, {'id': 1001, 'image_count': 11}, {'id': 1002, 'image_count': 60}, {'id': 1003, 'image_count': 1}, {'id': 1004, 'image_count': 16}, {'id': 1005, 'image_count': 1}, {'id': 1006, 'image_count': 65}, {'id': 1007, 'image_count': 13}, {'id': 1008, 'image_count': 655}, {'id': 1009, 'image_count': 51}, {'id': 1010, 'image_count': 1}, {'id': 1011, 'image_count': 673}, {'id': 1012, 'image_count': 5}, {'id': 1013, 'image_count': 36}, {'id': 1014, 'image_count': 54}, {'id': 1015, 'image_count': 5}, {'id': 1016, 'image_count': 8}, {'id': 1017, 'image_count': 305}, {'id': 1018, 'image_count': 297}, {'id': 1019, 'image_count': 1053}, {'id': 1020, 'image_count': 223}, {'id': 1021, 'image_count': 1037}, {'id': 1022, 'image_count': 63}, {'id': 1023, 'image_count': 1881}, {'id': 1024, 'image_count': 507}, {'id': 1025, 'image_count': 333}, {'id': 1026, 'image_count': 1911}, {'id': 1027, 'image_count': 1765}, {'id': 1028, 'image_count': 1}, {'id': 1029, 'image_count': 5}, {'id': 1030, 'image_count': 1}, {'id': 1031, 'image_count': 9}, {'id': 1032, 'image_count': 2}, {'id': 1033, 'image_count': 151}, {'id': 1034, 'image_count': 82}, {'id': 1035, 'image_count': 1931}, {'id': 1036, 'image_count': 41}, {'id': 1037, 'image_count': 1895}, {'id': 1038, 'image_count': 24}, {'id': 1039, 'image_count': 22}, {'id': 1040, 'image_count': 35}, {'id': 1041, 'image_count': 69}, {'id': 1042, 'image_count': 962}, {'id': 1043, 'image_count': 588}, {'id': 1044, 'image_count': 21}, {'id': 1045, 'image_count': 825}, {'id': 1046, 'image_count': 52}, {'id': 1047, 'image_count': 5}, {'id': 1048, 'image_count': 5}, {'id': 1049, 'image_count': 5}, {'id': 1050, 'image_count': 1860}, {'id': 1051, 'image_count': 56}, {'id': 1052, 'image_count': 1582}, {'id': 1053, 'image_count': 7}, {'id': 1054, 'image_count': 2}, {'id': 1055, 'image_count': 1562}, {'id': 1056, 'image_count': 1885}, {'id': 1057, 'image_count': 1}, {'id': 1058, 'image_count': 5}, {'id': 1059, 'image_count': 137}, {'id': 1060, 'image_count': 1094}, {'id': 1061, 'image_count': 134}, {'id': 1062, 'image_count': 29}, {'id': 1063, 'image_count': 22}, {'id': 1064, 'image_count': 522}, {'id': 1065, 'image_count': 50}, {'id': 1066, 'image_count': 68}, {'id': 1067, 'image_count': 16}, {'id': 1068, 'image_count': 40}, {'id': 1069, 'image_count': 35}, {'id': 1070, 'image_count': 135}, {'id': 1071, 'image_count': 1413}, {'id': 1072, 'image_count': 772}, {'id': 1073, 'image_count': 50}, {'id': 1074, 'image_count': 1015}, {'id': 1075, 'image_count': 1}, {'id': 1076, 'image_count': 65}, {'id': 1077, 'image_count': 1900}, {'id': 1078, 'image_count': 1302}, {'id': 1079, 'image_count': 1977}, {'id': 1080, 'image_count': 2}, {'id': 1081, 'image_count': 29}, {'id': 1082, 'image_count': 36}, {'id': 1083, 'image_count': 138}, {'id': 1084, 'image_count': 4}, {'id': 1085, 'image_count': 67}, {'id': 1086, 'image_count': 26}, {'id': 1087, 'image_count': 25}, {'id': 1088, 'image_count': 33}, {'id': 1089, 'image_count': 37}, {'id': 1090, 'image_count': 50}, {'id': 1091, 'image_count': 270}, {'id': 1092, 'image_count': 12}, {'id': 1093, 'image_count': 316}, {'id': 1094, 'image_count': 41}, {'id': 1095, 'image_count': 224}, {'id': 1096, 'image_count': 105}, {'id': 1097, 'image_count': 1925}, {'id': 1098, 'image_count': 1021}, {'id': 1099, 'image_count': 1213}, {'id': 1100, 'image_count': 172}, {'id': 1101, 'image_count': 28}, {'id': 1102, 'image_count': 745}, {'id': 1103, 
'image_count': 187}, {'id': 1104, 'image_count': 147}, {'id': 1105, 'image_count': 136}, {'id': 1106, 'image_count': 34}, {'id': 1107, 'image_count': 41}, {'id': 1108, 'image_count': 636}, {'id': 1109, 'image_count': 570}, {'id': 1110, 'image_count': 1149}, {'id': 1111, 'image_count': 61}, {'id': 1112, 'image_count': 1890}, {'id': 1113, 'image_count': 18}, {'id': 1114, 'image_count': 143}, {'id': 1115, 'image_count': 1517}, {'id': 1116, 'image_count': 7}, {'id': 1117, 'image_count': 943}, {'id': 1118, 'image_count': 6}, {'id': 1119, 'image_count': 1}, {'id': 1120, 'image_count': 11}, {'id': 1121, 'image_count': 101}, {'id': 1122, 'image_count': 1909}, {'id': 1123, 'image_count': 800}, {'id': 1124, 'image_count': 1}, {'id': 1125, 'image_count': 44}, {'id': 1126, 'image_count': 3}, {'id': 1127, 'image_count': 44}, {'id': 1128, 'image_count': 31}, {'id': 1129, 'image_count': 7}, {'id': 1130, 'image_count': 20}, {'id': 1131, 'image_count': 11}, {'id': 1132, 'image_count': 13}, {'id': 1133, 'image_count': 1924}, {'id': 1134, 'image_count': 113}, {'id': 1135, 'image_count': 2}, {'id': 1136, 'image_count': 139}, {'id': 1137, 'image_count': 12}, {'id': 1138, 'image_count': 37}, {'id': 1139, 'image_count': 1866}, {'id': 1140, 'image_count': 47}, {'id': 1141, 'image_count': 1468}, {'id': 1142, 'image_count': 729}, {'id': 1143, 'image_count': 24}, {'id': 1144, 'image_count': 1}, {'id': 1145, 'image_count': 10}, {'id': 1146, 'image_count': 3}, {'id': 1147, 'image_count': 14}, {'id': 1148, 'image_count': 4}, {'id': 1149, 'image_count': 29}, {'id': 1150, 'image_count': 4}, {'id': 1151, 'image_count': 70}, {'id': 1152, 'image_count': 46}, {'id': 1153, 'image_count': 14}, {'id': 1154, 'image_count': 48}, {'id': 1155, 'image_count': 1855}, {'id': 1156, 'image_count': 113}, {'id': 1157, 'image_count': 1}, {'id': 1158, 'image_count': 1}, {'id': 1159, 'image_count': 10}, {'id': 1160, 'image_count': 54}, {'id': 1161, 'image_count': 1923}, {'id': 1162, 'image_count': 630}, {'id': 1163, 'image_count': 31}, {'id': 1164, 'image_count': 69}, {'id': 1165, 'image_count': 7}, {'id': 1166, 'image_count': 11}, {'id': 1167, 'image_count': 1}, {'id': 1168, 'image_count': 30}, {'id': 1169, 'image_count': 50}, {'id': 1170, 'image_count': 45}, {'id': 1171, 'image_count': 28}, {'id': 1172, 'image_count': 114}, {'id': 1173, 'image_count': 193}, {'id': 1174, 'image_count': 21}, {'id': 1175, 'image_count': 91}, {'id': 1176, 'image_count': 31}, {'id': 1177, 'image_count': 1469}, {'id': 1178, 'image_count': 1924}, {'id': 1179, 'image_count': 87}, {'id': 1180, 'image_count': 77}, {'id': 1181, 'image_count': 11}, {'id': 1182, 'image_count': 47}, {'id': 1183, 'image_count': 21}, {'id': 1184, 'image_count': 47}, {'id': 1185, 'image_count': 70}, {'id': 1186, 'image_count': 1838}, {'id': 1187, 'image_count': 19}, {'id': 1188, 'image_count': 531}, {'id': 1189, 'image_count': 11}, {'id': 1190, 'image_count': 941}, {'id': 1191, 'image_count': 113}, {'id': 1192, 'image_count': 26}, {'id': 1193, 'image_count': 5}, {'id': 1194, 'image_count': 56}, {'id': 1195, 'image_count': 73}, {'id': 1196, 'image_count': 32}, {'id': 1197, 'image_count': 128}, {'id': 1198, 'image_count': 623}, {'id': 1199, 'image_count': 12}, {'id': 1200, 'image_count': 52}, {'id': 1201, 'image_count': 11}, {'id': 1202, 'image_count': 1674}, {'id': 1203, 'image_count': 81}] # noqa
+# fmt: on
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/pascal_voc.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/pascal_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae65032366034bd98c1605094d2adcd8619aace2
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/pascal_voc.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import numpy as np
+import os
+import xml.etree.ElementTree as ET
+from typing import List, Tuple, Union
+
+from custom_detectron2.data import DatasetCatalog, MetadataCatalog
+from custom_detectron2.structures import BoxMode
+from custom_detectron2.utils.file_io import PathManager
+
+__all__ = ["load_voc_instances", "register_pascal_voc"]
+
+
+# fmt: off
+CLASS_NAMES = (
+ "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+ "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+ "pottedplant", "sheep", "sofa", "train", "tvmonitor"
+)
+# fmt: on
+
+
+def load_voc_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
+ """
+ Load Pascal VOC detection annotations to Detectron2 format.
+
+ Args:
+ dirname: a directory containing "Annotations", "ImageSets", "JPEGImages"
+ split (str): one of "train", "test", "val", "trainval"
+ class_names: list or tuple of class names
+ """
+ with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
+ fileids = np.loadtxt(f, dtype=str)  # np.str was removed in NumPy 1.24; the builtin str works across versions
+
+ # Needs to read many small annotation files; fetching a local copy of the directory is faster.
+ annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
+ dicts = []
+ for fileid in fileids:
+ anno_file = os.path.join(annotation_dirname, fileid + ".xml")
+ jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")
+
+ with PathManager.open(anno_file) as f:
+ tree = ET.parse(f)
+
+ r = {
+ "file_name": jpeg_file,
+ "image_id": fileid,
+ "height": int(tree.findall("./size/height")[0].text),
+ "width": int(tree.findall("./size/width")[0].text),
+ }
+ instances = []
+
+ for obj in tree.findall("object"):
+ cls = obj.find("name").text
+ # We include "difficult" samples in training.
+ # Based on limited experiments, they don't hurt accuracy.
+ # difficult = int(obj.find("difficult").text)
+ # if difficult == 1:
+ # continue
+ bbox = obj.find("bndbox")
+ bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
+ # Original annotations are integers in the range [1, W or H]
+ # Assuming they mean 1-based pixel indices (inclusive),
+ # a box with annotation (xmin=1, xmax=W) covers the whole image.
+ # In coordinate space this is represented by (xmin=0, xmax=W)
+ bbox[0] -= 1.0
+ bbox[1] -= 1.0
+ instances.append(
+ {"category_id": class_names.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS}
+ )
+ r["annotations"] = instances
+ dicts.append(r)
+ return dicts
+
+
+def register_pascal_voc(name, dirname, split, year, class_names=CLASS_NAMES):
+ DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split, class_names))
+ MetadataCatalog.get(name).set(
+ thing_classes=list(class_names), dirname=dirname, year=year, split=split
+ )
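+
+
+# Usage sketch (illustrative, not part of the upstream file): register a local VOC
+# split and read its dicts back from the catalog. The dataset name and directory are
+# hypothetical placeholders; the data must exist on disk for the loader to run.
+#
+#   register_pascal_voc("voc_2007_trainval_custom", "datasets/VOC2007", "trainval", 2007)
+#   dicts = DatasetCatalog.get("voc_2007_trainval_custom")
+#   print(len(dicts), dicts[0]["file_name"], len(dicts[0]["annotations"]))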
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/register_coco.py b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/register_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b98427085f530d6dd8779b232a2b5764ae01aa7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/datasets/register_coco.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .coco import register_coco_instances # noqa
+from .coco_panoptic import register_coco_panoptic_separated # noqa
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/detection_utils.py b/comfyui_controlnet_aux/src/custom_detectron2/data/detection_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bcf36cf59544031b3fe8f28dada8261daf6e9a1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/detection_utils.py
@@ -0,0 +1,659 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+Common data processing utilities that are used in a
+typical object detection data pipeline.
+"""
+import logging
+import numpy as np
+from typing import List, Union
+import custom_pycocotools.mask as mask_util
+import torch
+from PIL import Image
+
+from custom_detectron2.structures import (
+ BitMasks,
+ Boxes,
+ BoxMode,
+ Instances,
+ Keypoints,
+ PolygonMasks,
+ RotatedBoxes,
+ polygons_to_bitmask,
+)
+from custom_detectron2.utils.file_io import PathManager
+
+from . import transforms as T
+from .catalog import MetadataCatalog
+
+__all__ = [
+ "SizeMismatchError",
+ "convert_image_to_rgb",
+ "check_image_size",
+ "transform_proposals",
+ "transform_instance_annotations",
+ "annotations_to_instances",
+ "annotations_to_instances_rotated",
+ "build_augmentation",
+ "build_transform_gen",
+ "create_keypoint_hflip_indices",
+ "filter_empty_instances",
+ "read_image",
+]
+
+
+class SizeMismatchError(ValueError):
+ """
+ Raised when the loaded image has a different width/height than the annotation.
+ """
+
+
+# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601
+_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
+_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]
+
+# https://www.exiv2.org/tags.html
+_EXIF_ORIENT = 274 # exif 'Orientation' tag
+
+
+def convert_PIL_to_numpy(image, format):
+ """
+ Convert PIL image to numpy array of target format.
+
+ Args:
+ image (PIL.Image): a PIL image
+ format (str): the format of output image
+
+ Returns:
+ (np.ndarray): also see `read_image`
+ """
+ if format is not None:
+ # PIL only supports RGB, so convert to RGB and flip channels over below
+ conversion_format = format
+ if format in ["BGR", "YUV-BT.601"]:
+ conversion_format = "RGB"
+ image = image.convert(conversion_format)
+ image = np.asarray(image)
+ # PIL squeezes out the channel dimension for "L", so make it HWC
+ if format == "L":
+ image = np.expand_dims(image, -1)
+
+ # handle formats not supported by PIL
+ elif format == "BGR":
+ # flip channels if needed
+ image = image[:, :, ::-1]
+ elif format == "YUV-BT.601":
+ image = image / 255.0
+ image = np.dot(image, np.array(_M_RGB2YUV).T)
+
+ return image
+
+
+def convert_image_to_rgb(image, format):
+ """
+ Convert an image from given format to RGB.
+
+ Args:
+ image (np.ndarray or Tensor): an HWC image
+ format (str): the format of input image, also see `read_image`
+
+ Returns:
+ (np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8
+ """
+ if isinstance(image, torch.Tensor):
+ image = image.cpu().numpy()
+ if format == "BGR":
+ image = image[:, :, [2, 1, 0]]
+ elif format == "YUV-BT.601":
+ image = np.dot(image, np.array(_M_YUV2RGB).T)
+ image = image * 255.0
+ else:
+ if format == "L":
+ image = image[:, :, 0]
+ image = image.astype(np.uint8)
+ image = np.asarray(Image.fromarray(image, mode=format).convert("RGB"))
+ return image
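+
+# Usage sketch (illustrative): round-trip between the BGR layout used internally and
+# RGB for visualization, with a random array standing in for real image data.
+#
+#   rgb = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)
+#   bgr = convert_PIL_to_numpy(Image.fromarray(rgb), "BGR")
+#   assert np.array_equal(convert_image_to_rgb(bgr, "BGR"), rgb)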
+
+
+def _apply_exif_orientation(image):
+ """
+ Applies the exif orientation correctly.
+
+ This code exists per the bug:
+ https://github.com/python-pillow/Pillow/issues/3973
+ with the function `ImageOps.exif_transpose`. The Pillow source raises errors with
+ various methods, especially `tobytes`
+
+ Function based on:
+ https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59
+ https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527
+
+ Args:
+ image (PIL.Image): a PIL image
+
+ Returns:
+ (PIL.Image): the PIL image with exif orientation applied, if applicable
+ """
+ if not hasattr(image, "getexif"):
+ return image
+
+ try:
+ exif = image.getexif()
+ except Exception: # https://github.com/facebookresearch/detectron2/issues/1885
+ exif = None
+
+ if exif is None:
+ return image
+
+ orientation = exif.get(_EXIF_ORIENT)
+
+ method = {
+ 2: Image.FLIP_LEFT_RIGHT,
+ 3: Image.ROTATE_180,
+ 4: Image.FLIP_TOP_BOTTOM,
+ 5: Image.TRANSPOSE,
+ 6: Image.ROTATE_270,
+ 7: Image.TRANSVERSE,
+ 8: Image.ROTATE_90,
+ }.get(orientation)
+
+ if method is not None:
+ return image.transpose(method)
+ return image
+
+
+def read_image(file_name, format=None):
+ """
+ Read an image into the given format.
+ Will apply rotation and flipping if the image has such exif information.
+
+ Args:
+ file_name (str): image file path
+ format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601".
+
+ Returns:
+ image (np.ndarray):
+ an HWC image in the given format, which is 0-255, uint8 for
+ supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601.
+ """
+ with PathManager.open(file_name, "rb") as f:
+ image = Image.open(f)
+
+ # work around this bug: https://github.com/python-pillow/Pillow/issues/3973
+ image = _apply_exif_orientation(image)
+ return convert_PIL_to_numpy(image, format)
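+
+# Usage sketch (illustrative): read an image in the BGR layout that detectron2-style
+# models typically consume. The path is a hypothetical placeholder.
+#
+#   img = read_image("path/to/image.jpg", format="BGR")
+#   # img is an HxWx3 uint8 array with channels in B, G, R order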
+
+
+def check_image_size(dataset_dict, image):
+ """
+ Raise an error if the image does not match the size specified in the dict.
+ """
+ if "width" in dataset_dict or "height" in dataset_dict:
+ image_wh = (image.shape[1], image.shape[0])
+ expected_wh = (dataset_dict["width"], dataset_dict["height"])
+ if image_wh != expected_wh:
+ raise SizeMismatchError(
+ "Mismatched image shape{}, got {}, expect {}.".format(
+ " for image " + dataset_dict["file_name"]
+ if "file_name" in dataset_dict
+ else "",
+ image_wh,
+ expected_wh,
+ )
+ + " Please check the width/height in your annotation."
+ )
+
+ # To ensure bbox always remap to original image size
+ if "width" not in dataset_dict:
+ dataset_dict["width"] = image.shape[1]
+ if "height" not in dataset_dict:
+ dataset_dict["height"] = image.shape[0]
+
+
+def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0):
+ """
+ Apply transformations to the proposals in dataset_dict, if any.
+
+ Args:
+ dataset_dict (dict): a dict read from the dataset, possibly
+ contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
+ image_shape (tuple): height, width
+ transforms (TransformList):
+ proposal_topk (int): only keep top-K scoring proposals
+ min_box_size (int): proposals with either side smaller than this
+ threshold are removed
+
+ The input dict is modified in-place, with abovementioned keys removed. A new
+ key "proposals" will be added. Its value is an `Instances`
+ object which contains the transformed proposals in its field
+ "proposal_boxes" and "objectness_logits".
+ """
+ if "proposal_boxes" in dataset_dict:
+ # Transform proposal boxes
+ boxes = transforms.apply_box(
+ BoxMode.convert(
+ dataset_dict.pop("proposal_boxes"),
+ dataset_dict.pop("proposal_bbox_mode"),
+ BoxMode.XYXY_ABS,
+ )
+ )
+ boxes = Boxes(boxes)
+ objectness_logits = torch.as_tensor(
+ dataset_dict.pop("proposal_objectness_logits").astype("float32")
+ )
+
+ boxes.clip(image_shape)
+ keep = boxes.nonempty(threshold=min_box_size)
+ boxes = boxes[keep]
+ objectness_logits = objectness_logits[keep]
+
+ proposals = Instances(image_shape)
+ proposals.proposal_boxes = boxes[:proposal_topk]
+ proposals.objectness_logits = objectness_logits[:proposal_topk]
+ dataset_dict["proposals"] = proposals
+
+
+def get_bbox(annotation):
+ """
+ Get bbox from data
+ Args:
+ annotation (dict): dict of instance annotations for a single instance.
+ Returns:
+ bbox (ndarray): x1, y1, x2, y2 coordinates
+ """
+ # bbox is 1d (per-instance bounding box)
+ bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+ return bbox
+
+
+def transform_instance_annotations(
+ annotation, transforms, image_size, *, keypoint_hflip_indices=None
+):
+ """
+ Apply transforms to box, segmentation and keypoints annotations of a single instance.
+
+ It will use `transforms.apply_box` for the box, and
+ `transforms.apply_coords` for segmentation polygons & keypoints.
+ If you need anything more specially designed for each data structure,
+ you'll need to implement your own version of this function or the transforms.
+
+ Args:
+ annotation (dict): dict of instance annotations for a single instance.
+ It will be modified in-place.
+ transforms (TransformList or list[Transform]):
+ image_size (tuple): the height, width of the transformed image
+ keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+
+ Returns:
+ dict:
+ the same input dict with fields "bbox", "segmentation", "keypoints"
+ transformed according to `transforms`.
+ The "bbox_mode" field will be set to XYXY_ABS.
+ """
+ if isinstance(transforms, (tuple, list)):
+ transforms = T.TransformList(transforms)
+ # bbox is 1d (per-instance bounding box)
+ bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+ # clip transformed bbox to image size
+ bbox = transforms.apply_box(np.array([bbox]))[0].clip(min=0)
+ annotation["bbox"] = np.minimum(bbox, list(image_size + image_size)[::-1])
+ annotation["bbox_mode"] = BoxMode.XYXY_ABS
+
+ if "segmentation" in annotation:
+ # each instance contains 1 or more polygons
+ segm = annotation["segmentation"]
+ if isinstance(segm, list):
+ # polygons
+ polygons = [np.asarray(p).reshape(-1, 2) for p in segm]
+ annotation["segmentation"] = [
+ p.reshape(-1) for p in transforms.apply_polygons(polygons)
+ ]
+ elif isinstance(segm, dict):
+ # RLE
+ mask = mask_util.decode(segm)
+ mask = transforms.apply_segmentation(mask)
+ assert tuple(mask.shape[:2]) == image_size
+ annotation["segmentation"] = mask
+ else:
+ raise ValueError(
+ "Cannot transform segmentation of type '{}'!"
+ "Supported types are: polygons as list[list[float] or ndarray],"
+ " COCO-style RLE as a dict.".format(type(segm))
+ )
+
+ if "keypoints" in annotation:
+ keypoints = transform_keypoint_annotations(
+ annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
+ )
+ annotation["keypoints"] = keypoints
+
+ return annotation
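+
+# Usage sketch (illustrative): horizontally flip one made-up annotation. The function
+# accepts a plain list of transforms and wraps it in a TransformList itself.
+#
+#   anno = {"bbox": [10.0, 20.0, 50.0, 80.0], "bbox_mode": BoxMode.XYWH_ABS}
+#   image_size = (480, 640)  # (height, width)
+#   flipped = transform_instance_annotations(anno, [T.HFlipTransform(640)], image_size)
+#   # flipped["bbox"] is now XYXY_ABS and mirrored about the vertical center line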
+
+
+def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
+ """
+ Transform keypoint annotations of an image.
+ If a keypoint is transformed out of image boundary, it will be marked "unlabeled" (visibility=0)
+
+ Args:
+ keypoints (list[float]): Nx3 float in Detectron2's Dataset format.
+ Each point is represented by (x, y, visibility).
+ transforms (TransformList):
+ image_size (tuple): the height, width of the transformed image
+ keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+ When `transforms` includes horizontal flip, will use the index
+ mapping to flip keypoints.
+ """
+ # (N*3,) -> (N, 3)
+ keypoints = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
+ keypoints_xy = transforms.apply_coords(keypoints[:, :2])
+
+ # Set all out-of-boundary points to "unlabeled"
+ inside = (keypoints_xy >= np.array([0, 0])) & (keypoints_xy <= np.array(image_size[::-1]))
+ inside = inside.all(axis=1)
+ keypoints[:, :2] = keypoints_xy
+ keypoints[:, 2][~inside] = 0
+
+ # This assumes that HorizFlipTransform is the only one that does flip
+ do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+
+ # Alternative way: check if the probe points were horizontally flipped.
+ # probe = np.asarray([[0.0, 0.0], [image_width, 0.0]])
+ # probe_aug = transforms.apply_coords(probe.copy())
+ # do_hflip = np.sign(probe[1][0] - probe[0][0]) != np.sign(probe_aug[1][0] - probe_aug[0][0]) # noqa
+
+ # If flipped, swap each keypoint with its opposite-handed equivalent
+ if do_hflip:
+ if keypoint_hflip_indices is None:
+ raise ValueError("Cannot flip keypoints without providing flip indices!")
+ if len(keypoints) != len(keypoint_hflip_indices):
+ raise ValueError(
+ "Keypoint data has {} points, but metadata "
+ "contains {} points!".format(len(keypoints), len(keypoint_hflip_indices))
+ )
+ keypoints = keypoints[np.asarray(keypoint_hflip_indices, dtype=np.int32), :]
+
+ # Maintain COCO convention that if visibility == 0 (unlabeled), then x, y = 0
+ keypoints[keypoints[:, 2] == 0] = 0
+ return keypoints
+
+
+def annotations_to_instances(annos, image_size, mask_format="polygon"):
+ """
+ Create an :class:`Instances` object used by the models,
+ from instance annotations in the dataset dict.
+
+ Args:
+ annos (list[dict]): a list of instance annotations in one image, each
+ element for one instance.
+ image_size (tuple): height, width
+
+ Returns:
+ Instances:
+ It will contain fields "gt_boxes", "gt_classes",
+ "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
+ This is the format that builtin models expect.
+ """
+ boxes = (
+ np.stack(
+ [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
+ )
+ if len(annos)
+ else np.zeros((0, 4))
+ )
+ target = Instances(image_size)
+ target.gt_boxes = Boxes(boxes)
+
+ classes = [int(obj["category_id"]) for obj in annos]
+ classes = torch.tensor(classes, dtype=torch.int64)
+ target.gt_classes = classes
+
+ if len(annos) and "segmentation" in annos[0]:
+ segms = [obj["segmentation"] for obj in annos]
+ if mask_format == "polygon":
+ try:
+ masks = PolygonMasks(segms)
+ except ValueError as e:
+ raise ValueError(
+ "Failed to use mask_format=='polygon' from the given annotations!"
+ ) from e
+ else:
+ assert mask_format == "bitmask", mask_format
+ masks = []
+ for segm in segms:
+ if isinstance(segm, list):
+ # polygon
+ masks.append(polygons_to_bitmask(segm, *image_size))
+ elif isinstance(segm, dict):
+ # COCO RLE
+ masks.append(mask_util.decode(segm))
+ elif isinstance(segm, np.ndarray):
+ assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
+ segm.ndim
+ )
+ # mask array
+ masks.append(segm)
+ else:
+ raise ValueError(
+ "Cannot convert segmentation of type '{}' to BitMasks!"
+ "Supported types are: polygons as list[list[float] or ndarray],"
+ " COCO-style RLE as a dict, or a binary segmentation mask "
+ " in a 2D numpy array of shape HxW.".format(type(segm))
+ )
+ # torch.from_numpy does not support array with negative stride.
+ masks = BitMasks(
+ torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
+ )
+ target.gt_masks = masks
+
+ if len(annos) and "keypoints" in annos[0]:
+ kpts = [obj.get("keypoints", []) for obj in annos]
+ target.gt_keypoints = Keypoints(kpts)
+
+ return target
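+
+# Usage sketch (illustrative): turn two made-up box annotations into the `Instances`
+# object that builtin models expect.
+#
+#   annos = [
+#       {"bbox": [10, 10, 60, 90], "bbox_mode": BoxMode.XYXY_ABS, "category_id": 0},
+#       {"bbox": [100, 40, 140, 70], "bbox_mode": BoxMode.XYXY_ABS, "category_id": 3},
+#   ]
+#   inst = annotations_to_instances(annos, image_size=(480, 640))
+#   # inst.gt_boxes holds 2 boxes; inst.gt_classes == tensor([0, 3])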
+
+
+def annotations_to_instances_rotated(annos, image_size):
+ """
+ Create an :class:`Instances` object used by the models,
+ from instance annotations in the dataset dict.
+ Compared to `annotations_to_instances`, this function is for rotated boxes only
+
+ Args:
+ annos (list[dict]): a list of instance annotations in one image, each
+ element for one instance.
+ image_size (tuple): height, width
+
+ Returns:
+ Instances:
+ Containing fields "gt_boxes", "gt_classes",
+ if they can be obtained from `annos`.
+ This is the format that builtin models expect.
+ """
+ boxes = [obj["bbox"] for obj in annos]
+ target = Instances(image_size)
+ boxes = target.gt_boxes = RotatedBoxes(boxes)
+ boxes.clip(image_size)
+
+ classes = [obj["category_id"] for obj in annos]
+ classes = torch.tensor(classes, dtype=torch.int64)
+ target.gt_classes = classes
+
+ return target
+
+
+def filter_empty_instances(
+ instances, by_box=True, by_mask=True, box_threshold=1e-5, return_mask=False
+):
+ """
+ Filter out empty instances in an `Instances` object.
+
+ Args:
+ instances (Instances):
+ by_box (bool): whether to filter out instances with empty boxes
+ by_mask (bool): whether to filter out instances with empty masks
+ box_threshold (float): minimum width and height to be considered non-empty
+ return_mask (bool): whether to return boolean mask of filtered instances
+
+ Returns:
+ Instances: the filtered instances.
+ tensor[bool], optional: boolean mask of filtered instances
+ """
+ assert by_box or by_mask
+ r = []
+ if by_box:
+ r.append(instances.gt_boxes.nonempty(threshold=box_threshold))
+ if instances.has("gt_masks") and by_mask:
+ r.append(instances.gt_masks.nonempty())
+
+ # TODO: can also filter visible keypoints
+
+ if not r:
+ return instances
+ m = r[0]
+ for x in r[1:]:
+ m = m & x
+ if return_mask:
+ return instances[m], m
+ return instances[m]
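+
+# Usage sketch (illustrative): drop degenerate boxes after cropping/augmentation,
+# continuing from the `annotations_to_instances` example above.
+#
+#   inst, kept = filter_empty_instances(inst, return_mask=True)
+#   # `kept` is a boolean tensor marking which original instances survived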
+
+
+def create_keypoint_hflip_indices(dataset_names: Union[str, List[str]]) -> List[int]:
+ """
+ Args:
+ dataset_names: list of dataset names
+
+ Returns:
+ list[int]: a list of size=#keypoints, storing the
+ horizontally-flipped keypoint indices.
+ """
+ if isinstance(dataset_names, str):
+ dataset_names = [dataset_names]
+
+ check_metadata_consistency("keypoint_names", dataset_names)
+ check_metadata_consistency("keypoint_flip_map", dataset_names)
+
+ meta = MetadataCatalog.get(dataset_names[0])
+ names = meta.keypoint_names
+ # TODO flip -> hflip
+ flip_map = dict(meta.keypoint_flip_map)
+ flip_map.update({v: k for k, v in flip_map.items()})
+ flipped_names = [i if i not in flip_map else flip_map[i] for i in names]
+ flip_indices = [names.index(i) for i in flipped_names]
+ return flip_indices
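+
+# Usage sketch (illustrative): the flip indices come from dataset metadata, so the toy
+# dataset name, keypoint names and flip map below are hypothetical.
+#
+#   MetadataCatalog.get("toy_kpts").set(
+#       keypoint_names=("nose", "left_eye", "right_eye"),
+#       keypoint_flip_map=(("left_eye", "right_eye"),),
+#   )
+#   create_keypoint_hflip_indices("toy_kpts")  # -> [0, 2, 1]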
+
+
+def get_fed_loss_cls_weights(dataset_names: Union[str, List[str]], freq_weight_power=1.0):
+ """
+ Get frequency weight for each class sorted by class id.
+ We now calculate the frequency weight using image_count raised to the power freq_weight_power.
+
+ Args:
+ dataset_names: list of dataset names
+ freq_weight_power: power value
+ """
+ if isinstance(dataset_names, str):
+ dataset_names = [dataset_names]
+
+ check_metadata_consistency("class_image_count", dataset_names)
+
+ meta = MetadataCatalog.get(dataset_names[0])
+ class_freq_meta = meta.class_image_count
+ class_freq = torch.tensor(
+ [c["image_count"] for c in sorted(class_freq_meta, key=lambda x: x["id"])]
+ )
+ class_freq_weight = class_freq.float() ** freq_weight_power
+ return class_freq_weight
+
+
+def gen_crop_transform_with_instance(crop_size, image_size, instance):
+ """
+ Generate a CropTransform so that the cropping region contains
+ the center of the given instance.
+
+ Args:
+ crop_size (tuple): h, w in pixels
+ image_size (tuple): h, w
+ instance (dict): an annotation dict of one instance, in Detectron2's
+ dataset format.
+ """
+ crop_size = np.asarray(crop_size, dtype=np.int32)
+ bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
+ center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5
+ assert (
+ image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1]
+ ), "The annotation bounding box is outside of the image!"
+ assert (
+ image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1]
+ ), "Crop size is larger than image size!"
+
+ min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0)
+ max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0)
+ max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32))
+
+ y0 = np.random.randint(min_yx[0], max_yx[0] + 1)
+ x0 = np.random.randint(min_yx[1], max_yx[1] + 1)
+ return T.CropTransform(x0, y0, crop_size[1], crop_size[0])
+
+
+def check_metadata_consistency(key, dataset_names):
+ """
+ Check that the datasets have consistent metadata.
+
+ Args:
+ key (str): a metadata key
+ dataset_names (list[str]): a list of dataset names
+
+ Raises:
+ AttributeError: if the key does not exist in the metadata
+ ValueError: if the given datasets do not have the same metadata values defined by key
+ """
+ if len(dataset_names) == 0:
+ return
+ logger = logging.getLogger(__name__)
+ entries_per_dataset = [getattr(MetadataCatalog.get(d), key) for d in dataset_names]
+ for idx, entry in enumerate(entries_per_dataset):
+ if entry != entries_per_dataset[0]:
+ logger.error(
+ "Metadata '{}' for dataset '{}' is '{}'".format(key, dataset_names[idx], str(entry))
+ )
+ logger.error(
+ "Metadata '{}' for dataset '{}' is '{}'".format(
+ key, dataset_names[0], str(entries_per_dataset[0])
+ )
+ )
+ raise ValueError("Datasets have different metadata '{}'!".format(key))
+
+
+def build_augmentation(cfg, is_train):
+ """
+ Create a list of default :class:`Augmentation` from config.
+ Now it includes resizing and flipping.
+
+ Returns:
+ list[Augmentation]
+ """
+ if is_train:
+ min_size = cfg.INPUT.MIN_SIZE_TRAIN
+ max_size = cfg.INPUT.MAX_SIZE_TRAIN
+ sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
+ else:
+ min_size = cfg.INPUT.MIN_SIZE_TEST
+ max_size = cfg.INPUT.MAX_SIZE_TEST
+ sample_style = "choice"
+ augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
+ if is_train and cfg.INPUT.RANDOM_FLIP != "none":
+ augmentation.append(
+ T.RandomFlip(
+ horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
+ vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
+ )
+ )
+ return augmentation
+
+
+build_transform_gen = build_augmentation
+"""
+Alias for backward-compatibility.
+"""
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..59403e643c8f5b23eba05d71e794cb15d3dafe90
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .distributed_sampler import (
+ InferenceSampler,
+ RandomSubsetTrainingSampler,
+ RepeatFactorTrainingSampler,
+ TrainingSampler,
+)
+
+from .grouped_batch_sampler import GroupedBatchSampler
+
+__all__ = [
+ "GroupedBatchSampler",
+ "TrainingSampler",
+ "RandomSubsetTrainingSampler",
+ "InferenceSampler",
+ "RepeatFactorTrainingSampler",
+]
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/distributed_sampler.py b/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/distributed_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..62d2aa9ec04ab85408d8921dcf49491739f26578
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/distributed_sampler.py
@@ -0,0 +1,278 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import math
+from collections import defaultdict
+from typing import Optional
+import torch
+from torch.utils.data.sampler import Sampler
+
+from custom_detectron2.utils import comm
+
+logger = logging.getLogger(__name__)
+
+
+class TrainingSampler(Sampler):
+ """
+ In training, we only care about the "infinite stream" of training data.
+ So this sampler produces an infinite stream of indices and
+ all workers cooperate to correctly shuffle the indices and sample different indices.
+
+ The sampler in each worker effectively produces `indices[worker_id::num_workers]`
+ where `indices` is an infinite stream of indices consisting of
+ `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True)
+ or `range(size) + range(size) + ...` (if shuffle is False)
+
+ Note that this sampler does not shard based on pytorch DataLoader worker id.
+ A sampler passed to pytorch DataLoader is used only with map-style datasets
+ and will not be executed inside workers.
+ But if this sampler is used in a way that it gets executed inside a dataloader
+ worker, then extra work needs to be done to shard its outputs based on worker id.
+ This is required so that workers don't produce identical data.
+ :class:`ToIterableDataset` implements this logic.
+ This note is true for all samplers in detectron2.
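+
+ Example (illustrative usage sketch; ``dataset`` here is a hypothetical map-style dataset):
+ ::
+ sampler = TrainingSampler(size=len(dataset), shuffle=True, seed=42)
+ # every distributed worker draws a disjoint, infinite stream of indices
+ first_ten = list(itertools.islice(iter(sampler), 10))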
+ """
+
+ def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
+ """
+ Args:
+ size (int): the total number of data of the underlying dataset to sample from
+ shuffle (bool): whether to shuffle the indices or not
+ seed (int): the initial seed of the shuffle. Must be the same
+ across all workers. If None, will use a random seed shared
+ among workers (require synchronization among all workers).
+ """
+ if not isinstance(size, int):
+ raise TypeError(f"TrainingSampler(size=) expects an int. Got type {type(size)}.")
+ if size <= 0:
+ raise ValueError(f"TrainingSampler(size=) expects a positive int. Got {size}.")
+ self._size = size
+ self._shuffle = shuffle
+ if seed is None:
+ seed = comm.shared_random_seed()
+ self._seed = int(seed)
+
+ self._rank = comm.get_rank()
+ self._world_size = comm.get_world_size()
+
+ def __iter__(self):
+ start = self._rank
+ yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+ def _infinite_indices(self):
+ g = torch.Generator()
+ g.manual_seed(self._seed)
+ while True:
+ if self._shuffle:
+ yield from torch.randperm(self._size, generator=g).tolist()
+ else:
+ yield from torch.arange(self._size).tolist()
+
+
+class RandomSubsetTrainingSampler(TrainingSampler):
+ """
+ Similar to TrainingSampler, but only sample a random subset of indices.
+ This is useful when you want to estimate the accuracy vs data-number curves by
+ training the model with different subset_ratio.
+ """
+
+ def __init__(
+ self,
+ size: int,
+ subset_ratio: float,
+ shuffle: bool = True,
+ seed_shuffle: Optional[int] = None,
+ seed_subset: Optional[int] = None,
+ ):
+ """
+ Args:
+ size (int): the total number of data of the underlying dataset to sample from
+ subset_ratio (float): the ratio of subset data to sample from the underlying dataset
+ shuffle (bool): whether to shuffle the indices or not
+ seed_shuffle (int): the initial seed of the shuffle. Must be the same
+ across all workers. If None, will use a random seed shared
+ among workers (require synchronization among all workers).
+ seed_subset (int): the seed to randomize the subset to be sampled.
+ Must be the same across all workers. If None, will use a random seed shared
+ among workers (require synchronization among all workers).
+ """
+ super().__init__(size=size, shuffle=shuffle, seed=seed_shuffle)
+
+ assert 0.0 < subset_ratio <= 1.0
+ self._size_subset = int(size * subset_ratio)
+ assert self._size_subset > 0
+ if seed_subset is None:
+ seed_subset = comm.shared_random_seed()
+ self._seed_subset = int(seed_subset)
+
+ # randomly generate the subset indexes to be sampled from
+ g = torch.Generator()
+ g.manual_seed(self._seed_subset)
+ indexes_randperm = torch.randperm(self._size, generator=g)
+ self._indexes_subset = indexes_randperm[: self._size_subset]
+
+ logger.info("Using RandomSubsetTrainingSampler......")
+ logger.info(f"Randomly sample {self._size_subset} data from the original {self._size} data")
+
+ def _infinite_indices(self):
+ g = torch.Generator()
+ g.manual_seed(self._seed) # self._seed equals seed_shuffle from __init__()
+ while True:
+ if self._shuffle:
+ # generate a random permutation to shuffle self._indexes_subset
+ randperm = torch.randperm(self._size_subset, generator=g)
+ yield from self._indexes_subset[randperm].tolist()
+ else:
+ yield from self._indexes_subset.tolist()
+
+
+class RepeatFactorTrainingSampler(Sampler):
+ """
+ Similar to TrainingSampler, but a sample may appear more times than others based
+ on its "repeat factor". This is suitable for training on class imbalanced datasets like LVIS.
+ """
+
+ def __init__(self, repeat_factors, *, shuffle=True, seed=None):
+ """
+ Args:
+ repeat_factors (Tensor): a float vector, the repeat factor for each index. When it's
+ full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``.
+ shuffle (bool): whether to shuffle the indices or not
+ seed (int): the initial seed of the shuffle. Must be the same
+ across all workers. If None, will use a random seed shared
+ among workers (require synchronization among all workers).
+ """
+ self._shuffle = shuffle
+ if seed is None:
+ seed = comm.shared_random_seed()
+ self._seed = int(seed)
+
+ self._rank = comm.get_rank()
+ self._world_size = comm.get_world_size()
+
+ # Split into whole number (_int_part) and fractional (_frac_part) parts.
+ self._int_part = torch.trunc(repeat_factors)
+ self._frac_part = repeat_factors - self._int_part
+
+ @staticmethod
+ def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh):
+ """
+ Compute (fractional) per-image repeat factors based on category frequency.
+ The repeat factor for an image is a function of the frequency of the rarest
+ category labeled in that image. The "frequency of category c" in [0, 1] is defined
+ as the fraction of images in the training set (without repeats) in which category c
+ appears.
+ See :paper:`lvis` (>= v2) Appendix B.2.
+
+ Args:
+ dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
+ repeat_thresh (float): frequency threshold below which data is repeated.
+ If the frequency of an image's rarest category is a quarter of
+ `repeat_thresh`, the image will be repeated twice.
+
+ Returns:
+ torch.Tensor:
+ the i-th element is the repeat factor for the dataset image at index i.
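+
+ Example (illustrative): with ``repeat_thresh=0.01``, a category appearing in
+ 0.25% of images gets r(c) = max(1, sqrt(0.01 / 0.0025)) = 2, so images whose
+ rarest category it is receive a repeat factor of 2.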
+ """
+ # 1. For each category c, compute the fraction of images that contain it: f(c)
+ category_freq = defaultdict(int)
+ for dataset_dict in dataset_dicts: # For each image (without repeats)
+ cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+ for cat_id in cat_ids:
+ category_freq[cat_id] += 1
+ num_images = len(dataset_dicts)
+ for k, v in category_freq.items():
+ category_freq[k] = v / num_images
+
+ # 2. For each category c, compute the category-level repeat factor:
+ # r(c) = max(1, sqrt(t / f(c)))
+ category_rep = {
+ cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
+ for cat_id, cat_freq in category_freq.items()
+ }
+
+ # 3. For each image I, compute the image-level repeat factor:
+ # r(I) = max_{c in I} r(c)
+ rep_factors = []
+ for dataset_dict in dataset_dicts:
+ cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+ rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0)
+ rep_factors.append(rep_factor)
+
+ return torch.tensor(rep_factors, dtype=torch.float32)
+
+ def _get_epoch_indices(self, generator):
+ """
+ Create a list of dataset indices (with repeats) to use for one epoch.
+
+ Args:
+ generator (torch.Generator): pseudo random number generator used for
+ stochastic rounding.
+
+ Returns:
+ torch.Tensor: list of dataset indices to use in one epoch. Each index
+ is repeated based on its calculated repeat factor.
+ """
+ # Since repeat factors are fractional, we use stochastic rounding so
+ # that the target repeat factor is achieved in expectation over the
+ # course of training
+ rands = torch.rand(len(self._frac_part), generator=generator)
+ rep_factors = self._int_part + (rands < self._frac_part).float()
+ # Construct a list of indices in which we repeat images as specified
+ indices = []
+ for dataset_index, rep_factor in enumerate(rep_factors):
+ indices.extend([dataset_index] * int(rep_factor.item()))
+ return torch.tensor(indices, dtype=torch.int64)
+
+ def __iter__(self):
+ start = self._rank
+ yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+ def _infinite_indices(self):
+ g = torch.Generator()
+ g.manual_seed(self._seed)
+ while True:
+ # Sample indices with repeats determined by stochastic rounding; each
+ # "epoch" may have a slightly different size due to the rounding.
+ indices = self._get_epoch_indices(g)
+ if self._shuffle:
+ randperm = torch.randperm(len(indices), generator=g)
+ yield from indices[randperm].tolist()
+ else:
+ yield from indices.tolist()
+
+
+class InferenceSampler(Sampler):
+ """
+ Produce indices for inference across all workers.
+ Inference needs to run on the __exact__ set of samples,
+ therefore when the total number of samples is not divisible by the number of workers,
+ this sampler produces a different number of samples on different workers.
+ """
+
+ def __init__(self, size: int):
+ """
+ Args:
+ size (int): the total number of data of the underlying dataset to sample from
+ """
+ self._size = size
+ assert size > 0
+ self._rank = comm.get_rank()
+ self._world_size = comm.get_world_size()
+ self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
+
+ @staticmethod
+ def _get_local_indices(total_size, world_size, rank):
+ shard_size = total_size // world_size
+ left = total_size % world_size
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
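+ # e.g. total_size=10, world_size=3 -> shard_sizes = [4, 3, 3] (illustrative)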
+
+ begin = sum(shard_sizes[:rank])
+ end = min(sum(shard_sizes[: rank + 1]), total_size)
+ return range(begin, end)
+
+ def __iter__(self):
+ yield from self._local_indices
+
+ def __len__(self):
+ return len(self._local_indices)
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/grouped_batch_sampler.py b/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/grouped_batch_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d36eaab37d326eb35fbd7ac14a7d711a38240b5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/samplers/grouped_batch_sampler.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from torch.utils.data.sampler import BatchSampler, Sampler
+
+
+class GroupedBatchSampler(BatchSampler):
+ """
+ Wraps another sampler to yield a mini-batch of indices.
+ It enforces that each batch only contains elements from the same group.
+ It also tries to provide mini-batches that follow an ordering which is
+ as close as possible to the ordering from the original sampler.
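+
+ Example (illustrative): with ``group_ids = [0, 1, 0, 1]`` and ``batch_size = 2``,
+ indices 0 and 2 may be batched together, but 0 and 1 never are.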
+ """
+
+ def __init__(self, sampler, group_ids, batch_size):
+ """
+ Args:
+ sampler (Sampler): Base sampler.
+ group_ids (list[int]): If the sampler produces indices in range [0, N),
+ `group_ids` must be a list of `N` ints which contains the group id of each sample.
+ The group ids must be a set of integers in the range [0, num_groups).
+ batch_size (int): Size of mini-batch.
+ """
+ if not isinstance(sampler, Sampler):
+ raise ValueError(
+ "sampler should be an instance of "
+ "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+ )
+ self.sampler = sampler
+ self.group_ids = np.asarray(group_ids)
+ assert self.group_ids.ndim == 1
+ self.batch_size = batch_size
+ groups = np.unique(self.group_ids).tolist()
+
+ # buffer the indices of each group until batch size is reached
+ self.buffer_per_group = {k: [] for k in groups}
+
+ def __iter__(self):
+ for idx in self.sampler:
+ group_id = self.group_ids[idx]
+ group_buffer = self.buffer_per_group[group_id]
+ group_buffer.append(idx)
+ if len(group_buffer) == self.batch_size:
+ yield group_buffer[:] # yield a copy of the list
+ del group_buffer[:]
+
+ def __len__(self):
+ raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.")
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b7e3cd3bea7a930d9f31823a12b273aae0b7052
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from fvcore.transforms.transform import Transform, TransformList # order them first
+from fvcore.transforms.transform import *
+from .transform import *
+from .augmentation import *
+from .augmentation_impl import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+
+
+from custom_detectron2.utils.env import fixup_module_metadata
+
+fixup_module_metadata(__name__, globals(), __all__)
+del fixup_module_metadata
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/augmentation.py b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..22155e0d4eff334d476438b3a5cf3b68059d1e0b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/augmentation.py
@@ -0,0 +1,380 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import inspect
+import numpy as np
+import pprint
+from typing import Any, List, Optional, Tuple, Union
+from fvcore.transforms.transform import Transform, TransformList
+
+"""
+See "Data Augmentation" tutorial for an overview of the system:
+https://detectron2.readthedocs.io/tutorials/augmentation.html
+"""
+
+
+__all__ = [
+ "Augmentation",
+ "AugmentationList",
+ "AugInput",
+ "TransformGen",
+ "apply_transform_gens",
+ "StandardAugInput",
+ "apply_augmentations",
+]
+
+
+def _check_img_dtype(img):
+ assert isinstance(img, np.ndarray), "[Augmentation] Needs a numpy array, but got a {}!".format(
+ type(img)
+ )
+ assert not isinstance(img.dtype, np.integer) or (
+ img.dtype == np.uint8
+ ), "[Augmentation] Got image of type {}, use uint8 or floating points instead!".format(
+ img.dtype
+ )
+ assert img.ndim in [2, 3], img.ndim
+
+
+def _get_aug_input_args(aug, aug_input) -> List[Any]:
+ """
+ Get the arguments to be passed to ``aug.get_transform`` from the input ``aug_input``.
+ """
+ if aug.input_args is None:
+ # Decide what attributes are needed automatically
+ prms = list(inspect.signature(aug.get_transform).parameters.items())
+ # The default behavior is: if there is one parameter, then it is "image"
+ # (this works automatically for the majority of use cases and avoids breaking
+ # backward compatibility); otherwise, use the argument names.
+ if len(prms) == 1:
+ names = ("image",)
+ else:
+ names = []
+ for name, prm in prms:
+ if prm.kind in (
+ inspect.Parameter.VAR_POSITIONAL,
+ inspect.Parameter.VAR_KEYWORD,
+ ):
+ raise TypeError(
+ f""" \
+The default implementation of `{type(aug)}.__call__` does not allow \
+`{type(aug)}.get_transform` to use variable-length arguments (*args, **kwargs)! \
+If arguments are unknown, reimplement `__call__` instead. \
+"""
+ )
+ names.append(name)
+ aug.input_args = tuple(names)
+
+ args = []
+ for f in aug.input_args:
+ try:
+ args.append(getattr(aug_input, f))
+ except AttributeError as e:
+ raise AttributeError(
+ f"{type(aug)}.get_transform needs input attribute '{f}', "
+ f"but it is not an attribute of {type(aug_input)}!"
+ ) from e
+ return args
+
+
+class Augmentation:
+ """
+ Augmentation defines (often random) policies/strategies to generate :class:`Transform`
+ from data. It is often used for pre-processing of input data.
+
+ A "policy" that generates a :class:`Transform` may, in the most general case,
+ need arbitrary information from input data in order to determine what transforms
+ to apply. Therefore, each :class:`Augmentation` instance defines the arguments
+ needed by its :meth:`get_transform` method. When called with the positional arguments,
+ the :meth:`get_transform` method executes the policy.
+
+ Note that :class:`Augmentation` defines the policies to create a :class:`Transform`,
+ but not how to execute the actual transform operations to those data.
+ Its :meth:`__call__` method will use :meth:`AugInput.transform` to execute the transform.
+
+ The returned `Transform` object is meant to describe deterministic transformation, which means
+ it can be re-applied on associated data, e.g. the geometry of an image and its segmentation
+ masks need to be transformed together.
+ (If such re-application is not needed, then determinism is not a crucial requirement.)
+ """
+
+ input_args: Optional[Tuple[str]] = None
+ """
+ Stores the attribute names needed by :meth:`get_transform`, e.g. ``("image", "sem_seg")``.
+ By default, it is just a tuple of argument names in :meth:`self.get_transform`, which often only
+ contain "image". As long as the argument name convention is followed, there is no need for
+ users to touch this attribute.
+ """
+
+ def _init(self, params=None):
+ if params:
+ for k, v in params.items():
+ if k != "self" and not k.startswith("_"):
+ setattr(self, k, v)
+
+ def get_transform(self, *args) -> Transform:
+ """
+ Execute the policy based on input data, and decide what transform to apply to inputs.
+
+ Args:
+ args: Any fixed-length positional arguments. By default, the name of the arguments
+ should exist in the :class:`AugInput` to be used.
+
+ Returns:
+ Transform: Returns the deterministic transform to apply to the input.
+
+ Examples:
+ ::
+ class MyAug:
+ # if a policy needs to know both image and semantic segmentation
+ def get_transform(image, sem_seg) -> T.Transform:
+ pass
+ tfm: Transform = MyAug().get_transform(image, sem_seg)
+ new_image = tfm.apply_image(image)
+
+ Notes:
+ Users can freely use arbitrary new argument names in custom
+ :meth:`get_transform` method, as long as they are available in the
+ input data. In detectron2 we use the following convention:
+
+ * image: (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or
+ floating point in range [0, 1] or [0, 255].
+ * boxes: (N,4) ndarray of float32. It represents the instance bounding boxes
+ of N instances. Each is in XYXY format in unit of absolute coordinates.
+ * sem_seg: (H,W) ndarray of type uint8. Each element is an integer label of pixel.
+
+ We do not specify convention for other types and do not include builtin
+ :class:`Augmentation` that uses other types in detectron2.
+ """
+ raise NotImplementedError
+
+ def __call__(self, aug_input) -> Transform:
+ """
+ Augment the given `aug_input` **in-place**, and return the transform that's used.
+
+ This method will be called to apply the augmentation. In most augmentation, it
+ is enough to use the default implementation, which calls :meth:`get_transform`
+ using the inputs. But a subclass can overwrite it to have more complicated logic.
+
+ Args:
+ aug_input (AugInput): an object that has attributes needed by this augmentation
+ (defined by ``self.get_transform``). Its ``transform`` method will be called
+ to in-place transform it.
+
+ Returns:
+ Transform: the transform that is applied on the input.
+ """
+ args = _get_aug_input_args(self, aug_input)
+ tfm = self.get_transform(*args)
+ assert isinstance(tfm, (Transform, TransformList)), (
+ f"{type(self)}.get_transform must return an instance of Transform! "
+ f"Got {type(tfm)} instead."
+ )
+ aug_input.transform(tfm)
+ return tfm
+
+ def _rand_range(self, low=1.0, high=None, size=None):
+ """
+ Uniform float random number between low and high.
+ """
+ if high is None:
+ low, high = 0, low
+ if size is None:
+ size = []
+ return np.random.uniform(low, high, size)
+
+ def __repr__(self):
+ """
+ Produce something like:
+ "MyAugmentation(field1={self.field1}, field2={self.field2})"
+ """
+ try:
+ sig = inspect.signature(self.__init__)
+ classname = type(self).__name__
+ argstr = []
+ for name, param in sig.parameters.items():
+ assert (
+ param.kind != param.VAR_POSITIONAL and param.kind != param.VAR_KEYWORD
+ ), "The default __repr__ doesn't support *args or **kwargs"
+ assert hasattr(self, name), (
+ "Attribute {} not found! "
+ "Default __repr__ only works if attributes match the constructor.".format(name)
+ )
+ attr = getattr(self, name)
+ default = param.default
+ if default is attr:
+ continue
+ attr_str = pprint.pformat(attr)
+ if "\n" in attr_str:
+ # don't show it if pformat decides to use >1 lines
+ attr_str = "..."
+ argstr.append("{}={}".format(name, attr_str))
+ return "{}({})".format(classname, ", ".join(argstr))
+ except AssertionError:
+ return super().__repr__()
+
+ __str__ = __repr__
+
+
+class _TransformToAug(Augmentation):
+ def __init__(self, tfm: Transform):
+ self.tfm = tfm
+
+ def get_transform(self, *args):
+ return self.tfm
+
+ def __repr__(self):
+ return repr(self.tfm)
+
+ __str__ = __repr__
+
+
+def _transform_to_aug(tfm_or_aug):
+ """
+ Wrap Transform into Augmentation.
+ Private, used internally to implement augmentations.
+ """
+ assert isinstance(tfm_or_aug, (Transform, Augmentation)), tfm_or_aug
+ if isinstance(tfm_or_aug, Augmentation):
+ return tfm_or_aug
+ else:
+ return _TransformToAug(tfm_or_aug)
+
+
+class AugmentationList(Augmentation):
+ """
+ Apply a sequence of augmentations.
+
+ It has a ``__call__`` method to apply the augmentations.
+
+ Note that the :meth:`get_transform` method cannot be used (it will raise an error
+ if called) for :class:`AugmentationList`, because in order to apply a sequence of
+ augmentations, the kth augmentation must be applied first, to provide the inputs
+ needed by the (k+1)th augmentation.
+ """
+
+ def __init__(self, augs):
+ """
+ Args:
+ augs (list[Augmentation or Transform]):
+ """
+ super().__init__()
+ self.augs = [_transform_to_aug(x) for x in augs]
+
+ def __call__(self, aug_input) -> TransformList:
+ tfms = []
+ for x in self.augs:
+ tfm = x(aug_input)
+ tfms.append(tfm)
+ return TransformList(tfms)
+
+ def __repr__(self):
+ msgs = [str(x) for x in self.augs]
+ return "AugmentationList[{}]".format(", ".join(msgs))
+
+ __str__ = __repr__
+
+
+class AugInput:
+ """
+ Input that can be used with :meth:`Augmentation.__call__`.
+ This is a standard implementation for the majority of use cases.
+ This class provides the standard attributes **"image", "boxes", "sem_seg"**
+ defined in :meth:`__init__` and they may be needed by different augmentations.
+ Most augmentation policies do not need attributes beyond these three.
+
+ After applying augmentations to these attributes (using :meth:`AugInput.transform`),
+ the returned transforms can then be used to transform other data structures that users have.
+
+ Examples:
+ ::
+ input = AugInput(image, boxes=boxes)
+ tfms = augmentation(input)
+ transformed_image = input.image
+ transformed_boxes = input.boxes
+ transformed_other_data = tfms.apply_other(other_data)
+
+ An extended project that works with new data types may implement augmentation policies
+ that need other inputs. An algorithm may need to transform inputs in a way different
+ from the standard approach defined in this class. In those rare situations, users can
+ implement a class similar to this one that satisfies the following conditions:
+
+ * The input must provide access to these data in the form of attribute access
+ (``getattr``). For example, if an :class:`Augmentation` to be applied needs "image"
+ and "sem_seg" arguments, its input must have the attribute "image" and "sem_seg".
+ * The input must have a ``transform(tfm: Transform) -> None`` method which
+ in-place transforms all its attributes.
+ """
+
+ # TODO maybe should support more builtin data types here
+ def __init__(
+ self,
+ image: np.ndarray,
+ *,
+ boxes: Optional[np.ndarray] = None,
+ sem_seg: Optional[np.ndarray] = None,
+ ):
+ """
+ Args:
+ image (ndarray): (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or
+ floating point in range [0, 1] or [0, 255]. The meaning of C is up
+ to users.
+ boxes (ndarray or None): Nx4 float32 boxes in XYXY_ABS mode
+ sem_seg (ndarray or None): HxW uint8 semantic segmentation mask. Each element
+ is an integer label of pixel.
+ """
+ _check_img_dtype(image)
+ self.image = image
+ self.boxes = boxes
+ self.sem_seg = sem_seg
+
+ def transform(self, tfm: Transform) -> None:
+ """
+ In-place transform all attributes of this class.
+
+ By "in-place", it means after calling this method, accessing an attribute such
+ as ``self.image`` will return transformed data.
+ """
+ self.image = tfm.apply_image(self.image)
+ if self.boxes is not None:
+ self.boxes = tfm.apply_box(self.boxes)
+ if self.sem_seg is not None:
+ self.sem_seg = tfm.apply_segmentation(self.sem_seg)
+
+ def apply_augmentations(
+ self, augmentations: List[Union[Augmentation, Transform]]
+ ) -> TransformList:
+ """
+ Equivalent of ``AugmentationList(augmentations)(self)``
+ """
+ return AugmentationList(augmentations)(self)
+
+
+def apply_augmentations(augmentations: List[Union[Transform, Augmentation]], inputs):
+ """
+ Use ``T.AugmentationList(augmentations)(inputs)`` instead.
+ """
+ if isinstance(inputs, np.ndarray):
+ # handle the common case of image-only Augmentation, also for backward compatibility
+ image_only = True
+ inputs = AugInput(inputs)
+ else:
+ image_only = False
+ tfms = inputs.apply_augmentations(augmentations)
+ return inputs.image if image_only else inputs, tfms
+
+
+apply_transform_gens = apply_augmentations
+"""
+Alias for backward-compatibility.
+"""
+
+TransformGen = Augmentation
+"""
+Alias for Augmentation, since it is something that generates :class:`Transform`s
+"""
+
+StandardAugInput = AugInput
+"""
+Alias for compatibility. It's not worth the complexity to have two classes.
+"""
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/augmentation_impl.py b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/augmentation_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f6edca0cded4f83b40bd362f70c4bdee5965eeb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/augmentation_impl.py
@@ -0,0 +1,736 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Implement many useful :class:`Augmentation`.
+"""
+import numpy as np
+import sys
+from numpy import random
+from typing import Tuple
+import torch
+from fvcore.transforms.transform import (
+ BlendTransform,
+ CropTransform,
+ HFlipTransform,
+ NoOpTransform,
+ PadTransform,
+ Transform,
+ TransformList,
+ VFlipTransform,
+)
+from PIL import Image
+
+from custom_detectron2.structures import Boxes, pairwise_iou
+
+from .augmentation import Augmentation, _transform_to_aug
+from .transform import ExtentTransform, ResizeTransform, RotationTransform
+
+__all__ = [
+ "FixedSizeCrop",
+ "RandomApply",
+ "RandomBrightness",
+ "RandomContrast",
+ "RandomCrop",
+ "RandomExtent",
+ "RandomFlip",
+ "RandomSaturation",
+ "RandomLighting",
+ "RandomRotation",
+ "Resize",
+ "ResizeScale",
+ "ResizeShortestEdge",
+ "RandomCrop_CategoryAreaConstraint",
+ "RandomResize",
+ "MinIoURandomCrop",
+]
+
+
+class RandomApply(Augmentation):
+ """
+ Randomly apply an augmentation with a given probability.
+ """
+
+ def __init__(self, tfm_or_aug, prob=0.5):
+ """
+ Args:
+ tfm_or_aug (Transform, Augmentation): the transform or augmentation
+ to be applied. It can either be a `Transform` or `Augmentation`
+ instance.
+ prob (float): probability between 0.0 and 1.0 that
+ the wrapper transformation is applied
+ """
+ super().__init__()
+ self.aug = _transform_to_aug(tfm_or_aug)
+ assert 0.0 <= prob <= 1.0, f"Probability must be between 0.0 and 1.0 (given: {prob})"
+ self.prob = prob
+
+ def get_transform(self, *args):
+ do = self._rand_range() < self.prob
+ if do:
+ return self.aug.get_transform(*args)
+ else:
+ return NoOpTransform()
+
+ def __call__(self, aug_input):
+ do = self._rand_range() < self.prob
+ if do:
+ return self.aug(aug_input)
+ else:
+ return NoOpTransform()
+
+
+class RandomFlip(Augmentation):
+ """
+ Flip the image horizontally or vertically with the given probability.
+ """
+
+ def __init__(self, prob=0.5, *, horizontal=True, vertical=False):
+ """
+ Args:
+ prob (float): probability of flip.
+ horizontal (boolean): whether to apply horizontal flipping
+ vertical (boolean): whether to apply vertical flipping
+ """
+ super().__init__()
+
+ if horizontal and vertical:
+ raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
+ if not horizontal and not vertical:
+ raise ValueError("At least one of horiz or vert has to be True!")
+ self._init(locals())
+
+ def get_transform(self, image):
+ h, w = image.shape[:2]
+ do = self._rand_range() < self.prob
+ if do:
+ if self.horizontal:
+ return HFlipTransform(w)
+ elif self.vertical:
+ return VFlipTransform(h)
+ else:
+ return NoOpTransform()
+
+
+class Resize(Augmentation):
+ """Resize image to a fixed target size"""
+
+ def __init__(self, shape, interp=Image.BILINEAR):
+ """
+ Args:
+ shape: (h, w) tuple or an int
+ interp: PIL interpolation method
+ """
+ if isinstance(shape, int):
+ shape = (shape, shape)
+ shape = tuple(shape)
+ self._init(locals())
+
+ def get_transform(self, image):
+ return ResizeTransform(
+ image.shape[0], image.shape[1], self.shape[0], self.shape[1], self.interp
+ )
+
+
+class ResizeShortestEdge(Augmentation):
+ """
+ Resize the image while keeping the aspect ratio unchanged.
+ It attempts to scale the shorter edge to the given `short_edge_length`,
+ as long as the longer edge does not exceed `max_size`.
+ If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
+ """
+
+ @torch.jit.unused
+ def __init__(
+ self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR
+ ):
+ """
+ Args:
+ short_edge_length (list[int]): If ``sample_style=="range"``,
+ a [min, max] interval from which to sample the shortest edge length.
+ If ``sample_style=="choice"``, a list of shortest edge lengths to sample from.
+ max_size (int): maximum allowed longest edge length.
+ sample_style (str): either "range" or "choice".
+ """
+ super().__init__()
+ assert sample_style in ["range", "choice"], sample_style
+
+ self.is_range = sample_style == "range"
+ if isinstance(short_edge_length, int):
+ short_edge_length = (short_edge_length, short_edge_length)
+ if self.is_range:
+ assert len(short_edge_length) == 2, (
+ "short_edge_length must be two values using 'range' sample style."
+ f" Got {short_edge_length}!"
+ )
+ self._init(locals())
+
+ @torch.jit.unused
+ def get_transform(self, image):
+ h, w = image.shape[:2]
+ if self.is_range:
+ size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1)
+ else:
+ size = np.random.choice(self.short_edge_length)
+ if size == 0:
+ return NoOpTransform()
+
+ newh, neww = ResizeShortestEdge.get_output_shape(h, w, size, self.max_size)
+ return ResizeTransform(h, w, newh, neww, self.interp)
+
+ @staticmethod
+ def get_output_shape(
+ oldh: int, oldw: int, short_edge_length: int, max_size: int
+ ) -> Tuple[int, int]:
+ """
+ Compute the output size given input size and target short edge length.
+ """
+ h, w = oldh, oldw
+ size = short_edge_length * 1.0
+ scale = size / min(h, w)
+ if h < w:
+ newh, neww = size, scale * w
+ else:
+ newh, neww = scale * h, size
+ if max(newh, neww) > max_size:
+ scale = max_size * 1.0 / max(newh, neww)
+ newh = newh * scale
+ neww = neww * scale
+ neww = int(neww + 0.5)
+ newh = int(newh + 0.5)
+ return (newh, neww)
+
+
+class ResizeScale(Augmentation):
+ """
+ Takes target size as input and randomly scales the given target size between `min_scale`
+ and `max_scale`. It then scales the input image such that it fits inside the scaled target
+ box, keeping the aspect ratio constant.
+ This implements the resize part of Google's 'resize_and_crop' data augmentation:
+ https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/input_utils.py#L127
+ """
+
+ def __init__(
+ self,
+ min_scale: float,
+ max_scale: float,
+ target_height: int,
+ target_width: int,
+ interp: int = Image.BILINEAR,
+ ):
+ """
+ Args:
+ min_scale: minimum image scale range.
+ max_scale: maximum image scale range.
+ target_height: target image height.
+ target_width: target image width.
+ interp: image interpolation method.
+ """
+ super().__init__()
+ self._init(locals())
+
+ def _get_resize(self, image: np.ndarray, scale: float) -> Transform:
+ input_size = image.shape[:2]
+
+ # Compute new target size given a scale.
+ target_size = (self.target_height, self.target_width)
+ target_scale_size = np.multiply(target_size, scale)
+
+ # Compute actual rescaling applied to input image and output size.
+ output_scale = np.minimum(
+ target_scale_size[0] / input_size[0], target_scale_size[1] / input_size[1]
+ )
+ output_size = np.round(np.multiply(input_size, output_scale)).astype(int)
+
+ return ResizeTransform(
+ input_size[0], input_size[1], output_size[0], output_size[1], self.interp
+ )
+
+ def get_transform(self, image: np.ndarray) -> Transform:
+ random_scale = np.random.uniform(self.min_scale, self.max_scale)
+ return self._get_resize(image, random_scale)
+
+
+class RandomRotation(Augmentation):
+ """
+ This method returns a copy of this image, rotated the given
+ number of degrees counter clockwise around the given center.
+ """
+
+ def __init__(self, angle, expand=True, center=None, sample_style="range", interp=None):
+ """
+ Args:
+ angle (list[float]): If ``sample_style=="range"``,
+ a [min, max] interval from which to sample the angle (in degrees).
+ If ``sample_style=="choice"``, a list of angles to sample from
+ expand (bool): choose if the image should be resized to fit the whole
+ rotated image (default), or simply cropped
+ center (list[[float, float]]): If ``sample_style=="range"``,
+ a [[minx, miny], [maxx, maxy]] relative interval from which to sample the center,
+ [0, 0] being the top left of the image and [1, 1] the bottom right.
+ If ``sample_style=="choice"``, a list of centers to sample from
+ Default: None, which means that the center of rotation is the center of the image
+ center has no effect if expand=True because it only affects shifting
+ """
+ super().__init__()
+ assert sample_style in ["range", "choice"], sample_style
+ self.is_range = sample_style == "range"
+ if isinstance(angle, (float, int)):
+ angle = (angle, angle)
+ if center is not None and isinstance(center[0], (float, int)):
+ center = (center, center)
+ self._init(locals())
+
+ def get_transform(self, image):
+ h, w = image.shape[:2]
+ center = None
+ if self.is_range:
+ angle = np.random.uniform(self.angle[0], self.angle[1])
+ if self.center is not None:
+ center = (
+ np.random.uniform(self.center[0][0], self.center[1][0]),
+ np.random.uniform(self.center[0][1], self.center[1][1]),
+ )
+ else:
+ angle = np.random.choice(self.angle)
+ if self.center is not None:
+ center = np.random.choice(self.center)
+
+ if center is not None:
+ center = (w * center[0], h * center[1]) # Convert to absolute coordinates
+
+ if angle % 360 == 0:
+ return NoOpTransform()
+
+ return RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp)
+
+
+class FixedSizeCrop(Augmentation):
+ """
+ If `crop_size` is smaller than the input image size, then it uses a random crop of
+ the crop size. If `crop_size` is larger than the input image size, then it pads
+ the right and the bottom of the image to the crop size if `pad` is True, otherwise
+ it returns the smaller image.
+ """
+
+ def __init__(
+ self,
+ crop_size: Tuple[int],
+ pad: bool = True,
+ pad_value: float = 128.0,
+ seg_pad_value: int = 255,
+ ):
+ """
+ Args:
+ crop_size: target image (height, width).
+ pad: if True, will pad images smaller than `crop_size` up to `crop_size`
+ pad_value: the padding value to the image.
+ seg_pad_value: the padding value to the segmentation mask.
+ """
+ super().__init__()
+ self._init(locals())
+
+ def _get_crop(self, image: np.ndarray) -> Transform:
+ # Compute the image scale and scaled size.
+ input_size = image.shape[:2]
+ output_size = self.crop_size
+
+ # Add random crop if the image is scaled up.
+ max_offset = np.subtract(input_size, output_size)
+ max_offset = np.maximum(max_offset, 0)
+ offset = np.multiply(max_offset, np.random.uniform(0.0, 1.0))
+ offset = np.round(offset).astype(int)
+ return CropTransform(
+ offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0]
+ )
+
+ def _get_pad(self, image: np.ndarray) -> Transform:
+ # Compute the image scale and scaled size.
+ input_size = image.shape[:2]
+ output_size = self.crop_size
+
+ # Add padding if the image is scaled down.
+ pad_size = np.subtract(output_size, input_size)
+ pad_size = np.maximum(pad_size, 0)
+ original_size = np.minimum(input_size, output_size)
+ return PadTransform(
+ 0,
+ 0,
+ pad_size[1],
+ pad_size[0],
+ original_size[1],
+ original_size[0],
+ self.pad_value,
+ self.seg_pad_value,
+ )
+
+ def get_transform(self, image: np.ndarray) -> TransformList:
+ transforms = [self._get_crop(image)]
+ if self.pad:
+ transforms.append(self._get_pad(image))
+ return TransformList(transforms)
+
+
+class RandomCrop(Augmentation):
+ """
+ Randomly crop a rectangle region out of an image.
+ """
+
+ def __init__(self, crop_type: str, crop_size):
+ """
+ Args:
+ crop_type (str): one of "relative_range", "relative", "absolute", "absolute_range".
+ crop_size (tuple[float, float]): two floats, explained below.
+
+ - "relative": crop a (H * crop_size[0], W * crop_size[1]) region from an input image of
+ size (H, W). crop size should be in (0, 1]
+ - "relative_range": uniformly sample two values from [crop_size[0], 1]
+ and [crop_size[1]], 1], and use them as in "relative" crop type.
+ - "absolute" crop a (crop_size[0], crop_size[1]) region from input image.
+ crop_size must be smaller than the input image size.
+ - "absolute_range", for an input of size (H, W), uniformly sample H_crop in
+ [crop_size[0], min(H, crop_size[1])] and W_crop in [crop_size[0], min(W, crop_size[1])].
+ Then crop a region (H_crop, W_crop).
+ """
+ # TODO style of relative_range and absolute_range are not consistent:
+ # one takes (h, w) but another takes (min, max)
+ super().__init__()
+ assert crop_type in ["relative_range", "relative", "absolute", "absolute_range"]
+ self._init(locals())
+
+ def get_transform(self, image):
+ h, w = image.shape[:2]
+ croph, cropw = self.get_crop_size((h, w))
+ assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self)
+ h0 = np.random.randint(h - croph + 1)
+ w0 = np.random.randint(w - cropw + 1)
+ return CropTransform(w0, h0, cropw, croph)
+
+ def get_crop_size(self, image_size):
+ """
+ Args:
+ image_size (tuple): height, width
+
+ Returns:
+ crop_size (tuple): height, width in absolute pixels
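+
+ Example (illustrative): ``crop_type="relative"`` with ``crop_size=(0.5, 0.5)``
+ on a (480, 640) image returns ``(240, 320)``.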
+ """
+ h, w = image_size
+ if self.crop_type == "relative":
+ ch, cw = self.crop_size
+ return int(h * ch + 0.5), int(w * cw + 0.5)
+ elif self.crop_type == "relative_range":
+ crop_size = np.asarray(self.crop_size, dtype=np.float32)
+ ch, cw = crop_size + np.random.rand(2) * (1 - crop_size)
+ return int(h * ch + 0.5), int(w * cw + 0.5)
+ elif self.crop_type == "absolute":
+ return (min(self.crop_size[0], h), min(self.crop_size[1], w))
+ elif self.crop_type == "absolute_range":
+ assert self.crop_size[0] <= self.crop_size[1]
+ ch = np.random.randint(min(h, self.crop_size[0]), min(h, self.crop_size[1]) + 1)
+ cw = np.random.randint(min(w, self.crop_size[0]), min(w, self.crop_size[1]) + 1)
+ return ch, cw
+ else:
+ raise NotImplementedError("Unknown crop type {}".format(self.crop_type))
+
+
+class RandomCrop_CategoryAreaConstraint(Augmentation):
+ """
+ Similar to :class:`RandomCrop`, but finds a cropping window such that no single category
+ occupies a ratio of more than `single_category_max_area` in the semantic segmentation ground
+ truth, which can cause instability in training. The function attempts to find such a valid
+ cropping window at most 10 times.
+ """
+
+ def __init__(
+ self,
+ crop_type: str,
+ crop_size,
+ single_category_max_area: float = 1.0,
+ ignored_category: int = None,
+ ):
+ """
+ Args:
+ crop_type, crop_size: same as in :class:`RandomCrop`
+ single_category_max_area: the maximum allowed area ratio of a
+ category. Set to 1.0 to disable
+ ignored_category: allow this category in the semantic segmentation
+ ground truth to exceed the area ratio. Usually set to the category
+ that's ignored in training.
+ """
+ self.crop_aug = RandomCrop(crop_type, crop_size)
+ self._init(locals())
+
+ def get_transform(self, image, sem_seg):
+ if self.single_category_max_area >= 1.0:
+ return self.crop_aug.get_transform(image)
+ else:
+ h, w = sem_seg.shape
+ for _ in range(10):
+ crop_size = self.crop_aug.get_crop_size((h, w))
+ y0 = np.random.randint(h - crop_size[0] + 1)
+ x0 = np.random.randint(w - crop_size[1] + 1)
+ sem_seg_temp = sem_seg[y0 : y0 + crop_size[0], x0 : x0 + crop_size[1]]
+ labels, cnt = np.unique(sem_seg_temp, return_counts=True)
+ if self.ignored_category is not None:
+ cnt = cnt[labels != self.ignored_category]
+ if len(cnt) > 1 and np.max(cnt) < np.sum(cnt) * self.single_category_max_area:
+ break
+ crop_tfm = CropTransform(x0, y0, crop_size[1], crop_size[0])
+ return crop_tfm
+
+
+class RandomExtent(Augmentation):
+ """
+ Outputs an image by cropping a random "subrect" of the source image.
+
+ The subrect can be parameterized to include pixels outside the source image,
+ in which case they will be set to zeros (i.e. black). The size of the output
+ image will vary with the size of the random subrect.
+ """
+
+ def __init__(self, scale_range, shift_range):
+ """
+ Args:
+ scale_range (l, h): Range of input-to-output size scaling factor
+ shift_range (x, y): Range of shifts of the cropped subrect. The rect
+ is shifted by [w / 2 * Uniform(-x, x), h / 2 * Uniform(-y, y)],
+ where (w, h) is the (width, height) of the input image. Set each
+ component to zero to crop at the image's center.
+ """
+ super().__init__()
+ self._init(locals())
+
+ def get_transform(self, image):
+ img_h, img_w = image.shape[:2]
+
+ # Initialize src_rect to fit the input image.
+ src_rect = np.array([-0.5 * img_w, -0.5 * img_h, 0.5 * img_w, 0.5 * img_h])
+
+ # Apply a random scaling to the src_rect.
+ src_rect *= np.random.uniform(self.scale_range[0], self.scale_range[1])
+
+ # Apply a random shift to the coordinates origin.
+ src_rect[0::2] += self.shift_range[0] * img_w * (np.random.rand() - 0.5)
+ src_rect[1::2] += self.shift_range[1] * img_h * (np.random.rand() - 0.5)
+
+ # Map src_rect coordinates into image coordinates (center at corner).
+ src_rect[0::2] += 0.5 * img_w
+ src_rect[1::2] += 0.5 * img_h
+
+ return ExtentTransform(
+ src_rect=(src_rect[0], src_rect[1], src_rect[2], src_rect[3]),
+ output_size=(int(src_rect[3] - src_rect[1]), int(src_rect[2] - src_rect[0])),
+ )
+
+
+class RandomContrast(Augmentation):
+ """
+ Randomly transforms image contrast.
+
+ Contrast intensity is uniformly sampled in (intensity_min, intensity_max).
+ - intensity < 1 will reduce contrast
+ - intensity = 1 will preserve the input image
+ - intensity > 1 will increase contrast
+
+ See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+ """
+
+ def __init__(self, intensity_min, intensity_max):
+ """
+ Args:
+ intensity_min (float): Minimum augmentation
+ intensity_max (float): Maximum augmentation
+ """
+ super().__init__()
+ self._init(locals())
+
+ def get_transform(self, image):
+ w = np.random.uniform(self.intensity_min, self.intensity_max)
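+ # The blend computes (1 - w) * image.mean() + w * image: w < 1 pulls pixels
+ # toward the mean (less contrast), w > 1 pushes them away (more contrast).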
+ return BlendTransform(src_image=image.mean(), src_weight=1 - w, dst_weight=w)
+
+
+class RandomBrightness(Augmentation):
+ """
+ Randomly transforms image brightness.
+
+ Brightness intensity is uniformly sampled in (intensity_min, intensity_max).
+ - intensity < 1 will reduce brightness
+ - intensity = 1 will preserve the input image
+ - intensity > 1 will increase brightness
+
+ See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+ """
+
+ def __init__(self, intensity_min, intensity_max):
+ """
+ Args:
+ intensity_min (float): Minimum augmentation
+ intensity_max (float): Maximum augmentation
+ """
+ super().__init__()
+ self._init(locals())
+
+ def get_transform(self, image):
+ w = np.random.uniform(self.intensity_min, self.intensity_max)
+ return BlendTransform(src_image=0, src_weight=1 - w, dst_weight=w)
+
+
+class RandomSaturation(Augmentation):
+ """
+ Randomly transforms saturation of an RGB image.
+ Input images are assumed to have 'RGB' channel order.
+
+ Saturation intensity is uniformly sampled in (intensity_min, intensity_max).
+ - intensity < 1 will reduce saturation (make the image more grayscale)
+ - intensity = 1 will preserve the input image
+ - intensity > 1 will increase saturation
+
+ See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+ """
+
+ def __init__(self, intensity_min, intensity_max):
+ """
+ Args:
+ intensity_min (float): Minimum augmentation (1 preserves input).
+ intensity_max (float): Maximum augmentation (1 preserves input).
+ """
+ super().__init__()
+ self._init(locals())
+
+ def get_transform(self, image):
+ assert image.shape[-1] == 3, "RandomSaturation only works on RGB images"
+ w = np.random.uniform(self.intensity_min, self.intensity_max)
+ grayscale = image.dot([0.299, 0.587, 0.114])[:, :, np.newaxis]
+ return BlendTransform(src_image=grayscale, src_weight=1 - w, dst_weight=w)
+
+
+class RandomLighting(Augmentation):
+ """
+ The "lighting" augmentation described in AlexNet, using fixed PCA over ImageNet.
+ Input images are assumed to have 'RGB' channel order.
+
+ The degree of color jittering is randomly sampled via a normal distribution,
+ with standard deviation given by the scale parameter.
+ """
+
+ def __init__(self, scale):
+ """
+ Args:
+ scale (float): Standard deviation of principal component weighting.
+ """
+ super().__init__()
+ self._init(locals())
+ self.eigen_vecs = np.array(
+ [[-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], [-0.5836, -0.6948, 0.4203]]
+ )
+ self.eigen_vals = np.array([0.2175, 0.0188, 0.0045])
+
+ def get_transform(self, image):
+ assert image.shape[-1] == 3, "RandomLighting only works on RGB images"
+ weights = np.random.normal(scale=self.scale, size=3)
+ return BlendTransform(
+ src_image=self.eigen_vecs.dot(weights * self.eigen_vals), src_weight=1.0, dst_weight=1.0
+ )
+
+
+class RandomResize(Augmentation):
+ """Randomly resize image to a target size in shape_list"""
+
+ def __init__(self, shape_list, interp=Image.BILINEAR):
+ """
+ Args:
+ shape_list: a list of shapes in (h, w)
+ interp: PIL interpolation method
+ """
+ self.shape_list = shape_list
+ self._init(locals())
+
+ def get_transform(self, image):
+ shape_idx = np.random.randint(low=0, high=len(self.shape_list))
+ h, w = self.shape_list[shape_idx]
+ return ResizeTransform(image.shape[0], image.shape[1], h, w, self.interp)
+
+
+class MinIoURandomCrop(Augmentation):
+ """Random crop the image & bboxes, the cropped patches have minimum IoU
+ requirement with original image & bboxes, the IoU threshold is randomly
+ selected from min_ious.
+
+ Args:
+ min_ious (tuple): minimum IoU threshold for all intersections with
+ bounding boxes
+ min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
+ where a >= min_crop_size)
+ mode_trials: number of trials for sampling an IoU threshold (mode)
+ crop_trials: number of trials for sampling a crop for each selected mode
+ """
+
+ def __init__(
+ self,
+ min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+ min_crop_size=0.3,
+ mode_trials=1000,
+ crop_trials=50,
+ ):
+ self.min_ious = min_ious
+ self.sample_mode = (1, *min_ious, 0)
+ self.min_crop_size = min_crop_size
+ self.mode_trials = mode_trials
+ self.crop_trials = crop_trials
+
+ def get_transform(self, image, boxes):
+ """Call function to crop images and bounding boxes with minimum IoU
+ constraint.
+
+ Args:
+ boxes: ground truth boxes in (x1, y1, x2, y2) format
+ """
+ if boxes is None:
+ return NoOpTransform()
+ h, w, c = image.shape
+ for _ in range(self.mode_trials):
+ mode = random.choice(self.sample_mode)
+ self.mode = mode
+ if mode == 1:
+ return NoOpTransform()
+
+ min_iou = mode
+ for _ in range(self.crop_trials):
+ new_w = random.uniform(self.min_crop_size * w, w)
+ new_h = random.uniform(self.min_crop_size * h, h)
+
+ # h / w in [0.5, 2]
+ if new_h / new_w < 0.5 or new_h / new_w > 2:
+ continue
+
+ left = random.uniform(w - new_w)
+ top = random.uniform(h - new_h)
+
+ patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h)))
+ # Line or point crop is not allowed
+ if patch[2] == patch[0] or patch[3] == patch[1]:
+ continue
+ overlaps = pairwise_iou(
+ Boxes(patch.reshape(-1, 4)), Boxes(boxes.reshape(-1, 4))
+ ).reshape(-1)
+ if len(overlaps) > 0 and overlaps.min() < min_iou:
+ continue
+
+ # center of boxes should inside the crop img
+ # only adjust boxes and instance masks when the gt is not empty
+ if len(overlaps) > 0:
+ # adjust boxes
+ def is_center_of_bboxes_in_patch(boxes, patch):
+ center = (boxes[:, :2] + boxes[:, 2:]) / 2
+ mask = (
+ (center[:, 0] > patch[0])
+ * (center[:, 1] > patch[1])
+ * (center[:, 0] < patch[2])
+ * (center[:, 1] < patch[3])
+ )
+ return mask
+
+ mask = is_center_of_bboxes_in_patch(boxes, patch)
+ if not mask.any():
+ continue
+ return CropTransform(int(left), int(top), int(new_w), int(new_h))
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/transform.py b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..eccb106bab5f5bb235da82d924572df4fc41b87d
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/data/transforms/transform.py
@@ -0,0 +1,351 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+See "Data Augmentation" tutorial for an overview of the system:
+https://detectron2.readthedocs.io/tutorials/augmentation.html
+"""
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fvcore.transforms.transform import (
+ CropTransform,
+ HFlipTransform,
+ NoOpTransform,
+ Transform,
+ TransformList,
+)
+from PIL import Image
+
+try:
+ import cv2 # noqa
+except ImportError:
+ # OpenCV is an optional dependency at the moment
+ pass
+
+__all__ = [
+ "ExtentTransform",
+ "ResizeTransform",
+ "RotationTransform",
+ "ColorTransform",
+ "PILColorTransform",
+]
+
+
+class ExtentTransform(Transform):
+ """
+ Extracts a subregion from the source image and scales it to the output size.
+
+ The fill color is used to map pixels from the source rect that fall outside
+ the source image.
+
+ See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform
+ """
+
+ def __init__(self, src_rect, output_size, interp=Image.BILINEAR, fill=0):
+ """
+ Args:
+ src_rect (x0, y0, x1, y1): src coordinates
+ output_size (h, w): dst image size
+ interp: PIL interpolation methods
+ fill: Fill color used when src_rect extends outside image
+ """
+ super().__init__()
+ self._set_attributes(locals())
+
+ def apply_image(self, img, interp=None):
+ h, w = self.output_size
+ if len(img.shape) > 2 and img.shape[2] == 1:
+ pil_image = Image.fromarray(img[:, :, 0], mode="L")
+ else:
+ pil_image = Image.fromarray(img)
+ pil_image = pil_image.transform(
+ size=(w, h),
+ method=Image.EXTENT,
+ data=self.src_rect,
+ resample=interp if interp else self.interp,
+ fill=self.fill,
+ )
+ ret = np.asarray(pil_image)
+ if len(img.shape) > 2 and img.shape[2] == 1:
+ ret = np.expand_dims(ret, -1)
+ return ret
+
+ def apply_coords(self, coords):
+ # Transform image center from source coordinates into output coordinates
+ # and then map the new origin to the corner of the output image.
+ h, w = self.output_size
+ x0, y0, x1, y1 = self.src_rect
+ new_coords = coords.astype(np.float32)
+ new_coords[:, 0] -= 0.5 * (x0 + x1)
+ new_coords[:, 1] -= 0.5 * (y0 + y1)
+ new_coords[:, 0] *= w / (x1 - x0)
+ new_coords[:, 1] *= h / (y1 - y0)
+ new_coords[:, 0] += 0.5 * w
+ new_coords[:, 1] += 0.5 * h
+ return new_coords
+
+ def apply_segmentation(self, segmentation):
+ segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+ return segmentation
+
+
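+# Illustrative sketch (added for documentation, not upstream detectron2 code): how
+# ExtentTransform crops-and-resizes a subregion and keeps coordinates consistent with
+# the output image. The shapes and the src_rect are example assumptions.
+def _example_extent_transform():
+    img = np.zeros((100, 200, 3), dtype=np.uint8)            # H=100, W=200
+    t = ExtentTransform(src_rect=(50, 25, 150, 75), output_size=(64, 128))
+    out = t.apply_image(img)                                  # -> shape (64, 128, 3)
+    pts = t.apply_coords(np.array([[100.0, 50.0]]))           # center of src_rect
+    return out.shape, pts                                     # pts ~= [[64., 32.]]
+
+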
+class ResizeTransform(Transform):
+ """
+ Resize the image to a target size.
+ """
+
+ def __init__(self, h, w, new_h, new_w, interp=None):
+ """
+ Args:
+ h, w (int): original image size
+ new_h, new_w (int): new image size
+ interp: PIL interpolation methods, defaults to bilinear.
+ """
+ # TODO decide on PIL vs opencv
+ super().__init__()
+ if interp is None:
+ interp = Image.BILINEAR
+ self._set_attributes(locals())
+
+ def apply_image(self, img, interp=None):
+ assert img.shape[:2] == (self.h, self.w)
+ assert len(img.shape) <= 4
+ interp_method = interp if interp is not None else self.interp
+
+ if img.dtype == np.uint8:
+ if len(img.shape) > 2 and img.shape[2] == 1:
+ pil_image = Image.fromarray(img[:, :, 0], mode="L")
+ else:
+ pil_image = Image.fromarray(img)
+ pil_image = pil_image.resize((self.new_w, self.new_h), interp_method)
+ ret = np.asarray(pil_image)
+ if len(img.shape) > 2 and img.shape[2] == 1:
+ ret = np.expand_dims(ret, -1)
+ else:
+ # PIL only supports uint8
+ if any(x < 0 for x in img.strides):
+ img = np.ascontiguousarray(img)
+ img = torch.from_numpy(img)
+ shape = list(img.shape)
+ shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
+ img = img.view(shape_4d).permute(2, 3, 0, 1) # hw(c) -> nchw
+ _PIL_RESIZE_TO_INTERPOLATE_MODE = {
+ Image.NEAREST: "nearest",
+ Image.BILINEAR: "bilinear",
+ Image.BICUBIC: "bicubic",
+ }
+ mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[interp_method]
+ align_corners = None if mode == "nearest" else False
+ img = F.interpolate(
+ img, (self.new_h, self.new_w), mode=mode, align_corners=align_corners
+ )
+ shape[:2] = (self.new_h, self.new_w)
+ ret = img.permute(2, 3, 0, 1).view(shape).numpy() # nchw -> hw(c)
+
+ return ret
+
+ def apply_coords(self, coords):
+ coords[:, 0] = coords[:, 0] * (self.new_w * 1.0 / self.w)
+ coords[:, 1] = coords[:, 1] * (self.new_h * 1.0 / self.h)
+ return coords
+
+ def apply_segmentation(self, segmentation):
+ segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+ return segmentation
+
+ def inverse(self):
+ return ResizeTransform(self.new_h, self.new_w, self.h, self.w, self.interp)
+
+
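+# Illustrative sketch (documentation only, not upstream code): resizing an image while
+# keeping point coordinates consistent, then undoing the resize. Shapes are assumptions.
+def _example_resize_transform():
+    img = np.zeros((100, 200, 3), dtype=np.uint8)                  # (h, w) = (100, 200)
+    t = ResizeTransform(h=100, w=200, new_h=50, new_w=100)         # 2x downscale
+    small = t.apply_image(img)                                     # -> (50, 100, 3)
+    pts = t.apply_coords(np.array([[200.0, 100.0]], dtype=np.float32))  # -> [[100., 50.]]
+    restored = t.inverse().apply_image(small)                      # back to (100, 200, 3)
+    return small.shape, pts, restored.shape
+
+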
+class RotationTransform(Transform):
+ """
+    Rotates the image by the given number of degrees counterclockwise
+    around its center.
+ """
+
+ def __init__(self, h, w, angle, expand=True, center=None, interp=None):
+ """
+ Args:
+ h, w (int): original image size
+ angle (float): degrees for rotation
+ expand (bool): choose if the image should be resized to fit the whole
+ rotated image (default), or simply cropped
+            center (tuple (width, height)): coordinates of the rotation center.
+                If left as None, the center of the image is used.
+                center has no effect if expand=True because it only affects shifting.
+ interp: cv2 interpolation method, default cv2.INTER_LINEAR
+ """
+ super().__init__()
+ image_center = np.array((w / 2, h / 2))
+ if center is None:
+ center = image_center
+ if interp is None:
+ interp = cv2.INTER_LINEAR
+ abs_cos, abs_sin = (abs(np.cos(np.deg2rad(angle))), abs(np.sin(np.deg2rad(angle))))
+ if expand:
+ # find the new width and height bounds
+ bound_w, bound_h = np.rint(
+ [h * abs_sin + w * abs_cos, h * abs_cos + w * abs_sin]
+ ).astype(int)
+ else:
+ bound_w, bound_h = w, h
+
+ self._set_attributes(locals())
+ self.rm_coords = self.create_rotation_matrix()
+ # Needed because of this problem https://github.com/opencv/opencv/issues/11784
+ self.rm_image = self.create_rotation_matrix(offset=-0.5)
+
+ def apply_image(self, img, interp=None):
+ """
+ img should be a numpy array, formatted as Height * Width * Nchannels
+ """
+ if len(img) == 0 or self.angle % 360 == 0:
+ return img
+ assert img.shape[:2] == (self.h, self.w)
+ interp = interp if interp is not None else self.interp
+ return cv2.warpAffine(img, self.rm_image, (self.bound_w, self.bound_h), flags=interp)
+
+ def apply_coords(self, coords):
+ """
+        coords should be an N * 2 array-like, containing N pairs of (x, y) points
+ """
+ coords = np.asarray(coords, dtype=float)
+ if len(coords) == 0 or self.angle % 360 == 0:
+ return coords
+ return cv2.transform(coords[:, np.newaxis, :], self.rm_coords)[:, 0, :]
+
+ def apply_segmentation(self, segmentation):
+ segmentation = self.apply_image(segmentation, interp=cv2.INTER_NEAREST)
+ return segmentation
+
+ def create_rotation_matrix(self, offset=0):
+ center = (self.center[0] + offset, self.center[1] + offset)
+ rm = cv2.getRotationMatrix2D(tuple(center), self.angle, 1)
+ if self.expand:
+ # Find the coordinates of the center of rotation in the new image
+ # The only point for which we know the future coordinates is the center of the image
+ rot_im_center = cv2.transform(self.image_center[None, None, :] + offset, rm)[0, 0, :]
+ new_center = np.array([self.bound_w / 2, self.bound_h / 2]) + offset - rot_im_center
+ # shift the rotation center to the new coordinates
+ rm[:, 2] += new_center
+ return rm
+
+ def inverse(self):
+ """
+ The inverse is to rotate it back with expand, and crop to get the original shape.
+ """
+ if not self.expand: # Not possible to inverse if a part of the image is lost
+ raise NotImplementedError()
+ rotation = RotationTransform(
+ self.bound_h, self.bound_w, -self.angle, True, None, self.interp
+ )
+ crop = CropTransform(
+ (rotation.bound_w - self.w) // 2, (rotation.bound_h - self.h) // 2, self.w, self.h
+ )
+ return TransformList([rotation, crop])
+
+
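+# Illustrative sketch (documentation only; needs the optional cv2 dependency): rotating
+# with expand=True and recovering the original image via inverse(). The 30 degree angle
+# and the shapes are example assumptions.
+def _example_rotation_transform():
+    img = np.zeros((100, 200, 3), dtype=np.uint8)
+    t = RotationTransform(h=100, w=200, angle=30, expand=True)
+    rotated = t.apply_image(img)                        # larger canvas (bound_h, bound_w, 3)
+    center = t.apply_coords(np.array([[100.0, 50.0]]))  # image center -> new canvas center
+    restored = t.inverse().apply_image(rotated)         # rotate back, crop to (100, 200, 3)
+    return rotated.shape, center, restored.shape
+
+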
+class ColorTransform(Transform):
+ """
+ Generic wrapper for any photometric transforms.
+ These transformations should only affect the color space and
+ not the coordinate space of the image (e.g. annotation
+ coordinates such as bounding boxes should not be changed)
+ """
+
+ def __init__(self, op):
+ """
+ Args:
+ op (Callable): operation to be applied to the image,
+ which takes in an ndarray and returns an ndarray.
+ """
+ if not callable(op):
+ raise ValueError("op parameter should be callable")
+ super().__init__()
+ self._set_attributes(locals())
+
+ def apply_image(self, img):
+ return self.op(img)
+
+ def apply_coords(self, coords):
+ return coords
+
+ def inverse(self):
+ return NoOpTransform()
+
+ def apply_segmentation(self, segmentation):
+ return segmentation
+
+
+class PILColorTransform(ColorTransform):
+ """
+ Generic wrapper for PIL Photometric image transforms,
+ which affect the color space and not the coordinate
+ space of the image
+ """
+
+ def __init__(self, op):
+ """
+ Args:
+ op (Callable): operation to be applied to the image,
+ which takes in a PIL Image and returns a transformed
+ PIL Image.
+ For reference on possible operations see:
+ - https://pillow.readthedocs.io/en/stable/
+ """
+ if not callable(op):
+ raise ValueError("op parameter should be callable")
+ super().__init__(op)
+
+ def apply_image(self, img):
+ img = Image.fromarray(img)
+ return np.asarray(super().apply_image(img))
+
+
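+# Illustrative sketch (documentation only): wrapping photometric ops. ColorTransform
+# expects an ndarray -> ndarray callable, PILColorTransform a PIL.Image -> PIL.Image
+# callable; neither changes coordinates, so boxes and masks stay valid.
+def _example_color_transforms():
+    from PIL import ImageEnhance  # local import, used only by this sketch
+
+    img = np.full((64, 64, 3), 128, dtype=np.uint8)
+    brighten = ColorTransform(lambda arr: np.clip(arr * 1.2, 0, 255).astype(np.uint8))
+    contrast = PILColorTransform(lambda im: ImageEnhance.Contrast(im).enhance(1.5))
+    return brighten.apply_image(img), contrast.apply_image(img)
+
+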
+def HFlip_rotated_box(transform, rotated_boxes):
+ """
+ Apply the horizontal flip transform on rotated boxes.
+
+ Args:
+ rotated_boxes (ndarray): Nx5 floating point array of
+ (x_center, y_center, width, height, angle_degrees) format
+ in absolute coordinates.
+ """
+ # Transform x_center
+ rotated_boxes[:, 0] = transform.width - rotated_boxes[:, 0]
+ # Transform angle
+ rotated_boxes[:, 4] = -rotated_boxes[:, 4]
+ return rotated_boxes
+
+
+def Resize_rotated_box(transform, rotated_boxes):
+ """
+ Apply the resizing transform on rotated boxes. For details of how these (approximation)
+ formulas are derived, please refer to :meth:`RotatedBoxes.scale`.
+
+ Args:
+ rotated_boxes (ndarray): Nx5 floating point array of
+ (x_center, y_center, width, height, angle_degrees) format
+ in absolute coordinates.
+ """
+ scale_factor_x = transform.new_w * 1.0 / transform.w
+ scale_factor_y = transform.new_h * 1.0 / transform.h
+ rotated_boxes[:, 0] *= scale_factor_x
+ rotated_boxes[:, 1] *= scale_factor_y
+ theta = rotated_boxes[:, 4] * np.pi / 180.0
+ c = np.cos(theta)
+ s = np.sin(theta)
+ rotated_boxes[:, 2] *= np.sqrt(np.square(scale_factor_x * c) + np.square(scale_factor_y * s))
+ rotated_boxes[:, 3] *= np.sqrt(np.square(scale_factor_x * s) + np.square(scale_factor_y * c))
+ rotated_boxes[:, 4] = np.arctan2(scale_factor_x * s, scale_factor_y * c) * 180 / np.pi
+
+ return rotated_boxes
+
+
+HFlipTransform.register_type("rotated_box", HFlip_rotated_box)
+ResizeTransform.register_type("rotated_box", Resize_rotated_box)
+
+# not necessary any more with latest fvcore
+NoOpTransform.register_type("rotated_box", lambda t, x: x)
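+
+
+# Illustrative sketch (documentation only): the register_type() calls above give these
+# transforms an ``apply_rotated_box`` method for (cx, cy, w, h, angle) boxes.
+def _example_apply_rotated_box():
+    flip = HFlipTransform(width=640)
+    boxes = np.array([[100.0, 80.0, 50.0, 30.0, 15.0]])
+    return flip.apply_rotated_box(boxes)    # cx -> 640 - 100 = 540, angle -> -15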
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/engine/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e10640da12f4cad9b6a6a247162240827a1944
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/engine/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .launch import *
+from .train_loop import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+
+
+# prefer to let hooks and defaults live in separate namespaces (therefore not in __all__)
+# but still make them available here
+from .hooks import *
+from .defaults import *
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/engine/defaults.py b/comfyui_controlnet_aux/src/custom_detectron2/engine/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e3b1905b732f86ab9817e3671dc9a5be702e3e7
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/engine/defaults.py
@@ -0,0 +1,715 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+This file contains components with some default boilerplate logic users may need
+in training / testing. They will not work for everyone, but many users may find them useful.
+
+The behavior of functions/classes in this file is subject to change,
+since they are meant to represent the "common default behavior" people need in their projects.
+"""
+
+import argparse
+import logging
+import os
+import sys
+import weakref
+from collections import OrderedDict
+from typing import Optional
+import torch
+from fvcore.nn.precise_bn import get_bn_modules
+from omegaconf import OmegaConf
+from torch.nn.parallel import DistributedDataParallel
+
+import custom_detectron2.data.transforms as T
+from custom_detectron2.checkpoint import DetectionCheckpointer
+from custom_detectron2.config import CfgNode, LazyConfig
+from custom_detectron2.data import (
+ MetadataCatalog,
+ build_detection_test_loader,
+ build_detection_train_loader,
+)
+from custom_detectron2.evaluation import (
+ DatasetEvaluator,
+ inference_on_dataset,
+ print_csv_format,
+ verify_results,
+)
+from custom_detectron2.modeling import build_model
+from custom_detectron2.solver import build_lr_scheduler, build_optimizer
+from custom_detectron2.utils import comm
+from custom_detectron2.utils.collect_env import collect_env_info
+from custom_detectron2.utils.env import seed_all_rng
+from custom_detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
+from custom_detectron2.utils.file_io import PathManager
+from custom_detectron2.utils.logger import setup_logger
+
+from . import hooks
+from .train_loop import AMPTrainer, SimpleTrainer, TrainerBase
+
+__all__ = [
+ "create_ddp_model",
+ "default_argument_parser",
+ "default_setup",
+ "default_writers",
+ "DefaultPredictor",
+ "DefaultTrainer",
+]
+
+
+def create_ddp_model(model, *, fp16_compression=False, **kwargs):
+ """
+    Create a DistributedDataParallel model if there is more than one process.
+
+ Args:
+ model: a torch.nn.Module
+ fp16_compression: add fp16 compression hooks to the ddp object.
+ See more at https://pytorch.org/docs/stable/ddp_comm_hooks.html#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook
+ kwargs: other arguments of :module:`torch.nn.parallel.DistributedDataParallel`.
+ """ # noqa
+ if comm.get_world_size() == 1:
+ return model
+ if "device_ids" not in kwargs:
+ kwargs["device_ids"] = [comm.get_local_rank()]
+ ddp = DistributedDataParallel(model, **kwargs)
+ if fp16_compression:
+ from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks
+
+ ddp.register_comm_hook(state=None, hook=comm_hooks.fp16_compress_hook)
+ return ddp
+
+
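+# Illustrative sketch (documentation only): typical use of create_ddp_model inside a
+# script started by ``custom_detectron2.engine.launch``. ``cfg`` is assumed to be a
+# fully populated config supplied by the caller.
+def _example_create_ddp_model(cfg):
+    model = build_model(cfg)
+    # No-op on a single process; otherwise wraps the model in DistributedDataParallel.
+    return create_ddp_model(model, broadcast_buffers=False, fp16_compression=True)
+
+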
+def default_argument_parser(epilog=None):
+ """
+ Create a parser with some common arguments used by detectron2 users.
+
+ Args:
+ epilog (str): epilog passed to ArgumentParser describing the usage.
+
+ Returns:
+ argparse.ArgumentParser:
+ """
+ parser = argparse.ArgumentParser(
+ epilog=epilog
+ or f"""
+Examples:
+
+Run on single machine:
+ $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml
+
+Change some config options:
+ $ {sys.argv[0]} --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth SOLVER.BASE_LR 0.001
+
+Run on multiple machines:
+ (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url [--other-flags]
+ (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url [--other-flags]
+""",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ )
+ parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
+ parser.add_argument(
+ "--resume",
+ action="store_true",
+ help="Whether to attempt to resume from the checkpoint directory. "
+ "See documentation of `DefaultTrainer.resume_or_load()` for what it means.",
+ )
+ parser.add_argument("--eval-only", action="store_true", help="perform evaluation only")
+ parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
+ parser.add_argument("--num-machines", type=int, default=1, help="total number of machines")
+ parser.add_argument(
+ "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)"
+ )
+
+ # PyTorch still may leave orphan processes in multi-gpu training.
+ # Therefore we use a deterministic way to obtain port,
+ # so that users are aware of orphan processes by seeing the port occupied.
+ port = 2**15 + 2**14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2**14
+ parser.add_argument(
+ "--dist-url",
+ default="tcp://127.0.0.1:{}".format(port),
+ help="initialization URL for pytorch distributed backend. See "
+ "https://pytorch.org/docs/stable/distributed.html for details.",
+ )
+ parser.add_argument(
+ "opts",
+ help="""
+Modify config options at the end of the command. For Yacs configs, use
+space-separated "PATH.KEY VALUE" pairs.
+For python-based LazyConfig, use "path.key=value".
+ """.strip(),
+ default=None,
+ nargs=argparse.REMAINDER,
+ )
+ return parser
+
+
+def _try_get_key(cfg, *keys, default=None):
+ """
+    Try to select keys from cfg until the first key that exists; otherwise return default.
+ """
+ if isinstance(cfg, CfgNode):
+ cfg = OmegaConf.create(cfg.dump())
+ for k in keys:
+ none = object()
+ p = OmegaConf.select(cfg, k, default=none)
+ if p is not none:
+ return p
+ return default
+
+
+def _highlight(code, filename):
+ try:
+ import pygments
+ except ImportError:
+ return code
+
+ from pygments.lexers import Python3Lexer, YamlLexer
+ from pygments.formatters import Terminal256Formatter
+
+ lexer = Python3Lexer() if filename.endswith(".py") else YamlLexer()
+ code = pygments.highlight(code, lexer, Terminal256Formatter(style="monokai"))
+ return code
+
+
+def default_setup(cfg, args):
+ """
+ Perform some basic common setups at the beginning of a job, including:
+
+ 1. Set up the detectron2 logger
+ 2. Log basic information about environment, cmdline arguments, and config
+ 3. Backup the config to the output directory
+
+ Args:
+ cfg (CfgNode or omegaconf.DictConfig): the full config to be used
+ args (argparse.NameSpace): the command line arguments to be logged
+ """
+ output_dir = _try_get_key(cfg, "OUTPUT_DIR", "output_dir", "train.output_dir")
+ if comm.is_main_process() and output_dir:
+ PathManager.mkdirs(output_dir)
+
+ rank = comm.get_rank()
+ setup_logger(output_dir, distributed_rank=rank, name="fvcore")
+ logger = setup_logger(output_dir, distributed_rank=rank)
+
+ logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
+ logger.info("Environment info:\n" + collect_env_info())
+
+ logger.info("Command line arguments: " + str(args))
+ if hasattr(args, "config_file") and args.config_file != "":
+ logger.info(
+ "Contents of args.config_file={}:\n{}".format(
+ args.config_file,
+ _highlight(PathManager.open(args.config_file, "r").read(), args.config_file),
+ )
+ )
+
+ if comm.is_main_process() and output_dir:
+ # Note: some of our scripts may expect the existence of
+ # config.yaml in output directory
+ path = os.path.join(output_dir, "config.yaml")
+ if isinstance(cfg, CfgNode):
+ logger.info("Running with full config:\n{}".format(_highlight(cfg.dump(), ".yaml")))
+ with PathManager.open(path, "w") as f:
+ f.write(cfg.dump())
+ else:
+ LazyConfig.save(cfg, path)
+ logger.info("Full config saved to {}".format(path))
+
+ # make sure each worker has a different, yet deterministic seed if specified
+ seed = _try_get_key(cfg, "SEED", "train.seed", default=-1)
+ seed_all_rng(None if seed < 0 else seed + rank)
+
+    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
+    # typical validation sets.
+ if not (hasattr(args, "eval_only") and args.eval_only):
+ torch.backends.cudnn.benchmark = _try_get_key(
+ cfg, "CUDNN_BENCHMARK", "train.cudnn_benchmark", default=False
+ )
+
+
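+# Illustrative sketch (documentation only): the usual wiring of default_argument_parser
+# and default_setup in a training script. ``get_cfg`` is assumed to be exposed by
+# custom_detectron2.config, mirroring upstream detectron2.
+def _example_setup_from_args(args):
+    from custom_detectron2.config import get_cfg  # local import for the sketch only
+
+    cfg = get_cfg()
+    if args.config_file:
+        cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)  # logging, config backup, RNG seeding, cudnn flag
+    return cfg
+
+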
+def default_writers(output_dir: str, max_iter: Optional[int] = None):
+ """
+ Build a list of :class:`EventWriter` to be used.
+ It now consists of a :class:`CommonMetricPrinter`,
+ :class:`TensorboardXWriter` and :class:`JSONWriter`.
+
+ Args:
+ output_dir: directory to store JSON metrics and tensorboard events
+ max_iter: the total number of iterations
+
+ Returns:
+ list[EventWriter]: a list of :class:`EventWriter` objects.
+ """
+ PathManager.mkdirs(output_dir)
+ return [
+ # It may not always print what you want to see, since it prints "common" metrics only.
+ CommonMetricPrinter(max_iter),
+ JSONWriter(os.path.join(output_dir, "metrics.json")),
+ TensorboardXWriter(output_dir),
+ ]
+
+
+class DefaultPredictor:
+ """
+ Create a simple end-to-end predictor with the given config that runs on
+    a single device for a single input image.
+
+ Compared to using the model directly, this class does the following additions:
+
+ 1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
+    2. Always take a BGR image as the input and apply the conversion defined by `cfg.INPUT.FORMAT`.
+ 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
+ 4. Take one input image and produce a single output, instead of a batch.
+
+ This is meant for simple demo purposes, so it does the above steps automatically.
+ This is not meant for benchmarks or running complicated inference logic.
+ If you'd like to do anything more complicated, please refer to its source code as
+ examples to build and use the model manually.
+
+ Attributes:
+ metadata (Metadata): the metadata of the underlying dataset, obtained from
+ cfg.DATASETS.TEST.
+
+ Examples:
+ ::
+ pred = DefaultPredictor(cfg)
+ inputs = cv2.imread("input.jpg")
+ outputs = pred(inputs)
+ """
+
+ def __init__(self, cfg):
+ self.cfg = cfg.clone() # cfg can be modified by model
+ self.model = build_model(self.cfg)
+ self.model.eval()
+ if len(cfg.DATASETS.TEST):
+ self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+
+ checkpointer = DetectionCheckpointer(self.model)
+ checkpointer.load(cfg.MODEL.WEIGHTS)
+
+ self.aug = T.ResizeShortestEdge(
+ [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
+ )
+
+ self.input_format = cfg.INPUT.FORMAT
+ assert self.input_format in ["RGB", "BGR"], self.input_format
+
+ def __call__(self, original_image):
+ """
+ Args:
+ original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+
+ Returns:
+ predictions (dict):
+ the output of the model for one image only.
+ See :doc:`/tutorials/models` for details about the format.
+ """
+ with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258
+ # Apply pre-processing to image.
+ if self.input_format == "RGB":
+ # whether the model expects BGR inputs or RGB
+ original_image = original_image[:, :, ::-1]
+ height, width = original_image.shape[:2]
+ image = self.aug.get_transform(original_image).apply_image(original_image)
+ image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+
+ inputs = {"image": image, "height": height, "width": width}
+ predictions = self.model([inputs])[0]
+ return predictions
+
+
+class DefaultTrainer(TrainerBase):
+ """
+ A trainer with default training logic. It does the following:
+
+ 1. Create a :class:`SimpleTrainer` using model, optimizer, dataloader
+ defined by the given config. Create a LR scheduler defined by the config.
+    2. Load the last checkpoint or `cfg.MODEL.WEIGHTS`, if it exists, when
+ `resume_or_load` is called.
+ 3. Register a few common hooks defined by the config.
+
+ It is created to simplify the **standard model training workflow** and reduce code boilerplate
+ for users who only need the standard training workflow, with standard features.
+ It means this class makes *many assumptions* about your training logic that
+    may easily become invalid in new research. In fact, any assumptions beyond those made in the
+ :class:`SimpleTrainer` are too much for research.
+
+ The code of this class has been annotated about restrictive assumptions it makes.
+ When they do not work for you, you're encouraged to:
+
+ 1. Overwrite methods of this class, OR:
+ 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
+ nothing else. You can then add your own hooks if needed. OR:
+ 3. Write your own training loop similar to `tools/plain_train_net.py`.
+
+ See the :doc:`/tutorials/training` tutorials for more details.
+
+ Note that the behavior of this class, like other functions/classes in
+ this file, is not stable, since it is meant to represent the "common default behavior".
+ It is only guaranteed to work well with the standard models and training workflow in detectron2.
+ To obtain more stable behavior, write your own training logic with other public APIs.
+
+ Examples:
+ ::
+ trainer = DefaultTrainer(cfg)
+ trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS
+ trainer.train()
+
+ Attributes:
+ scheduler:
+ checkpointer (DetectionCheckpointer):
+ cfg (CfgNode):
+ """
+
+ def __init__(self, cfg):
+ """
+ Args:
+ cfg (CfgNode):
+ """
+ super().__init__()
+ logger = logging.getLogger("detectron2")
+ if not logger.isEnabledFor(logging.INFO): # setup_logger is not called for d2
+ setup_logger()
+ cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
+
+ # Assume these objects must be constructed in this order.
+ model = self.build_model(cfg)
+ optimizer = self.build_optimizer(cfg, model)
+ data_loader = self.build_train_loader(cfg)
+
+ model = create_ddp_model(model, broadcast_buffers=False)
+ self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
+ model, data_loader, optimizer
+ )
+
+ self.scheduler = self.build_lr_scheduler(cfg, optimizer)
+ self.checkpointer = DetectionCheckpointer(
+ # Assume you want to save checkpoints together with logs/statistics
+ model,
+ cfg.OUTPUT_DIR,
+ trainer=weakref.proxy(self),
+ )
+ self.start_iter = 0
+ self.max_iter = cfg.SOLVER.MAX_ITER
+ self.cfg = cfg
+
+ self.register_hooks(self.build_hooks())
+
+ def resume_or_load(self, resume=True):
+ """
+ If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by
+ a `last_checkpoint` file), resume from the file. Resuming means loading all
+        available states (e.g. optimizer and scheduler) and updating the iteration counter
+ from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used.
+
+ Otherwise, this is considered as an independent training. The method will load model
+ weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start
+ from iteration 0.
+
+ Args:
+ resume (bool): whether to do resume or not
+ """
+ self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
+ if resume and self.checkpointer.has_checkpoint():
+ # The checkpoint stores the training iteration that just finished, thus we start
+ # at the next iteration
+ self.start_iter = self.iter + 1
+
+ def build_hooks(self):
+ """
+ Build a list of default hooks, including timing, evaluation,
+ checkpointing, lr scheduling, precise BN, writing events.
+
+ Returns:
+ list[HookBase]:
+ """
+ cfg = self.cfg.clone()
+ cfg.defrost()
+ cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN
+
+ ret = [
+ hooks.IterationTimer(),
+ hooks.LRScheduler(),
+ hooks.PreciseBN(
+ # Run at the same freq as (but before) evaluation.
+ cfg.TEST.EVAL_PERIOD,
+ self.model,
+ # Build a new data loader to not affect training
+ self.build_train_loader(cfg),
+ cfg.TEST.PRECISE_BN.NUM_ITER,
+ )
+ if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
+ else None,
+ ]
+
+        # Do PreciseBN before checkpointer, because it updates the model and needs to
+        # be saved by the checkpointer.
+ # This is not always the best: if checkpointing has a different frequency,
+ # some checkpoints may have more precise statistics than others.
+ if comm.is_main_process():
+ ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
+
+ def test_and_save_results():
+ self._last_eval_results = self.test(self.cfg, self.model)
+ return self._last_eval_results
+
+ # Do evaluation after checkpointer, because then if it fails,
+ # we can use the saved checkpoint to debug.
+ ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
+
+ if comm.is_main_process():
+ # Here the default print/log frequency of each writer is used.
+ # run writers in the end, so that evaluation metrics are written
+ ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
+ return ret
+
+ def build_writers(self):
+ """
+ Build a list of writers to be used using :func:`default_writers()`.
+ If you'd like a different list of writers, you can overwrite it in
+ your trainer.
+
+ Returns:
+ list[EventWriter]: a list of :class:`EventWriter` objects.
+ """
+ return default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
+
+ def train(self):
+ """
+ Run training.
+
+ Returns:
+ OrderedDict of results, if evaluation is enabled. Otherwise None.
+ """
+ super().train(self.start_iter, self.max_iter)
+ if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
+ assert hasattr(
+ self, "_last_eval_results"
+ ), "No evaluation results obtained during training!"
+ verify_results(self.cfg, self._last_eval_results)
+ return self._last_eval_results
+
+ def run_step(self):
+ self._trainer.iter = self.iter
+ self._trainer.run_step()
+
+ def state_dict(self):
+ ret = super().state_dict()
+ ret["_trainer"] = self._trainer.state_dict()
+ return ret
+
+ def load_state_dict(self, state_dict):
+ super().load_state_dict(state_dict)
+ self._trainer.load_state_dict(state_dict["_trainer"])
+
+ @classmethod
+ def build_model(cls, cfg):
+ """
+ Returns:
+ torch.nn.Module:
+
+ It now calls :func:`detectron2.modeling.build_model`.
+ Overwrite it if you'd like a different model.
+ """
+ model = build_model(cfg)
+ logger = logging.getLogger(__name__)
+ logger.info("Model:\n{}".format(model))
+ return model
+
+ @classmethod
+ def build_optimizer(cls, cfg, model):
+ """
+ Returns:
+ torch.optim.Optimizer:
+
+ It now calls :func:`detectron2.solver.build_optimizer`.
+ Overwrite it if you'd like a different optimizer.
+ """
+ return build_optimizer(cfg, model)
+
+ @classmethod
+ def build_lr_scheduler(cls, cfg, optimizer):
+ """
+ It now calls :func:`detectron2.solver.build_lr_scheduler`.
+ Overwrite it if you'd like a different scheduler.
+ """
+ return build_lr_scheduler(cfg, optimizer)
+
+ @classmethod
+ def build_train_loader(cls, cfg):
+ """
+ Returns:
+ iterable
+
+ It now calls :func:`detectron2.data.build_detection_train_loader`.
+ Overwrite it if you'd like a different data loader.
+ """
+ return build_detection_train_loader(cfg)
+
+ @classmethod
+ def build_test_loader(cls, cfg, dataset_name):
+ """
+ Returns:
+ iterable
+
+ It now calls :func:`detectron2.data.build_detection_test_loader`.
+ Overwrite it if you'd like a different data loader.
+ """
+ return build_detection_test_loader(cfg, dataset_name)
+
+ @classmethod
+ def build_evaluator(cls, cfg, dataset_name):
+ """
+ Returns:
+ DatasetEvaluator or None
+
+ It is not implemented by default.
+ """
+ raise NotImplementedError(
+ """
+If you want DefaultTrainer to automatically run evaluation,
+please implement `build_evaluator()` in subclasses (see train_net.py for example).
+Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example).
+"""
+ )
+
+ @classmethod
+ def test(cls, cfg, model, evaluators=None):
+ """
+ Evaluate the given model. The given model is expected to already contain
+ weights to evaluate.
+
+ Args:
+ cfg (CfgNode):
+ model (nn.Module):
+ evaluators (list[DatasetEvaluator] or None): if None, will call
+ :meth:`build_evaluator`. Otherwise, must have the same length as
+ ``cfg.DATASETS.TEST``.
+
+ Returns:
+ dict: a dict of result metrics
+ """
+ logger = logging.getLogger(__name__)
+ if isinstance(evaluators, DatasetEvaluator):
+ evaluators = [evaluators]
+ if evaluators is not None:
+ assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
+ len(cfg.DATASETS.TEST), len(evaluators)
+ )
+
+ results = OrderedDict()
+ for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+ data_loader = cls.build_test_loader(cfg, dataset_name)
+ # When evaluators are passed in as arguments,
+ # implicitly assume that evaluators can be created before data_loader.
+ if evaluators is not None:
+ evaluator = evaluators[idx]
+ else:
+ try:
+ evaluator = cls.build_evaluator(cfg, dataset_name)
+ except NotImplementedError:
+                    logger.warning(
+ "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
+ "or implement its `build_evaluator` method."
+ )
+ results[dataset_name] = {}
+ continue
+ results_i = inference_on_dataset(model, data_loader, evaluator)
+ results[dataset_name] = results_i
+ if comm.is_main_process():
+ assert isinstance(
+ results_i, dict
+ ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+ results_i
+ )
+ logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+ print_csv_format(results_i)
+
+ if len(results) == 1:
+ results = list(results.values())[0]
+ return results
+
+ @staticmethod
+ def auto_scale_workers(cfg, num_workers: int):
+ """
+        When the config is defined for a certain number of workers (according to
+ ``cfg.SOLVER.REFERENCE_WORLD_SIZE``) that's different from the number of
+ workers currently in use, returns a new cfg where the total batch size
+ is scaled so that the per-GPU batch size stays the same as the
+ original ``IMS_PER_BATCH // REFERENCE_WORLD_SIZE``.
+
+ Other config options are also scaled accordingly:
+        * training steps and warmup steps are scaled inversely proportionally.
+        * learning rate is scaled proportionally, following :paper:`ImageNet in 1h`.
+
+ For example, with the original config like the following:
+
+ .. code-block:: yaml
+
+ IMS_PER_BATCH: 16
+ BASE_LR: 0.1
+ REFERENCE_WORLD_SIZE: 8
+ MAX_ITER: 5000
+ STEPS: (4000,)
+ CHECKPOINT_PERIOD: 1000
+
+ When this config is used on 16 GPUs instead of the reference number 8,
+ calling this method will return a new config with:
+
+ .. code-block:: yaml
+
+ IMS_PER_BATCH: 32
+ BASE_LR: 0.2
+ REFERENCE_WORLD_SIZE: 16
+ MAX_ITER: 2500
+ STEPS: (2000,)
+ CHECKPOINT_PERIOD: 500
+
+ Note that both the original config and this new config can be trained on 16 GPUs.
+        It's up to the user whether to enable this feature (by setting ``REFERENCE_WORLD_SIZE``).
+
+ Returns:
+ CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``.
+ """
+ old_world_size = cfg.SOLVER.REFERENCE_WORLD_SIZE
+ if old_world_size == 0 or old_world_size == num_workers:
+ return cfg
+ cfg = cfg.clone()
+ frozen = cfg.is_frozen()
+ cfg.defrost()
+
+ assert (
+ cfg.SOLVER.IMS_PER_BATCH % old_world_size == 0
+ ), "Invalid REFERENCE_WORLD_SIZE in config!"
+ scale = num_workers / old_world_size
+ bs = cfg.SOLVER.IMS_PER_BATCH = int(round(cfg.SOLVER.IMS_PER_BATCH * scale))
+ lr = cfg.SOLVER.BASE_LR = cfg.SOLVER.BASE_LR * scale
+ max_iter = cfg.SOLVER.MAX_ITER = int(round(cfg.SOLVER.MAX_ITER / scale))
+ warmup_iter = cfg.SOLVER.WARMUP_ITERS = int(round(cfg.SOLVER.WARMUP_ITERS / scale))
+ cfg.SOLVER.STEPS = tuple(int(round(s / scale)) for s in cfg.SOLVER.STEPS)
+ cfg.TEST.EVAL_PERIOD = int(round(cfg.TEST.EVAL_PERIOD / scale))
+ cfg.SOLVER.CHECKPOINT_PERIOD = int(round(cfg.SOLVER.CHECKPOINT_PERIOD / scale))
+ cfg.SOLVER.REFERENCE_WORLD_SIZE = num_workers # maintain invariant
+ logger = logging.getLogger(__name__)
+ logger.info(
+ f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, "
+ f"max_iter={max_iter}, warmup={warmup_iter}."
+ )
+
+ if frozen:
+ cfg.freeze()
+ return cfg
+
+
+# Access basic attributes from the underlying trainer
+for _attr in ["model", "data_loader", "optimizer"]:
+ setattr(
+ DefaultTrainer,
+ _attr,
+ property(
+ # getter
+ lambda self, x=_attr: getattr(self._trainer, x),
+ # setter
+ lambda self, value, x=_attr: setattr(self._trainer, x, value),
+ ),
+ )
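+
+
+# Illustrative note (documentation only): after the loop above, ``trainer.model``,
+# ``trainer.data_loader`` and ``trainer.optimizer`` transparently read from and write to
+# the wrapped ``trainer._trainer`` (SimpleTrainer or AMPTrainer), so hooks that expect
+# these attributes on the trainer itself keep working.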
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/engine/hooks.py b/comfyui_controlnet_aux/src/custom_detectron2/engine/hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..67b75e264000df6bf93f173ac9eaf876ebe44c66
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/engine/hooks.py
@@ -0,0 +1,690 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import datetime
+import itertools
+import logging
+import math
+import operator
+import os
+import tempfile
+import time
+import warnings
+from collections import Counter
+import torch
+from fvcore.common.checkpoint import Checkpointer
+from fvcore.common.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
+from fvcore.common.param_scheduler import ParamScheduler
+from fvcore.common.timer import Timer
+from fvcore.nn.precise_bn import get_bn_modules, update_bn_stats
+
+import custom_detectron2.utils.comm as comm
+from custom_detectron2.evaluation.testing import flatten_results_dict
+from custom_detectron2.solver import LRMultiplier
+from custom_detectron2.solver import LRScheduler as _LRScheduler
+from custom_detectron2.utils.events import EventStorage, EventWriter
+from custom_detectron2.utils.file_io import PathManager
+
+from .train_loop import HookBase
+
+__all__ = [
+ "CallbackHook",
+ "IterationTimer",
+ "PeriodicWriter",
+ "PeriodicCheckpointer",
+ "BestCheckpointer",
+ "LRScheduler",
+ "AutogradProfiler",
+ "EvalHook",
+ "PreciseBN",
+ "TorchProfiler",
+ "TorchMemoryStats",
+]
+
+
+"""
+Implement some common hooks.
+"""
+
+
+class CallbackHook(HookBase):
+ """
+ Create a hook using callback functions provided by the user.
+ """
+
+ def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None):
+ """
+ Each argument is a function that takes one argument: the trainer.
+ """
+ self._before_train = before_train
+ self._before_step = before_step
+ self._after_step = after_step
+ self._after_train = after_train
+
+ def before_train(self):
+ if self._before_train:
+ self._before_train(self.trainer)
+
+ def after_train(self):
+ if self._after_train:
+ self._after_train(self.trainer)
+ # The functions may be closures that hold reference to the trainer
+ # Therefore, delete them to avoid circular reference.
+ del self._before_train, self._after_train
+ del self._before_step, self._after_step
+
+ def before_step(self):
+ if self._before_step:
+ self._before_step(self.trainer)
+
+ def after_step(self):
+ if self._after_step:
+ self._after_step(self.trainer)
+
+
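+# Illustrative sketch (documentation only): ad-hoc callbacks without subclassing
+# HookBase. ``trainer`` is assumed to be a TrainerBase provided by the caller.
+def _example_callback_hook(trainer):
+    log = logging.getLogger(__name__)
+    trainer.register_hooks(
+        [
+            CallbackHook(
+                before_train=lambda t: log.info("starting at iter %d", t.iter),
+                after_step=lambda t: t.storage.put_scalar("iter_mod_10", t.iter % 10),
+            )
+        ]
+    )
+
+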
+class IterationTimer(HookBase):
+ """
+ Track the time spent for each iteration (each run_step call in the trainer).
+    Print a summary at the end of training.
+
+ This hook uses the time between the call to its :meth:`before_step`
+ and :meth:`after_step` methods.
+ Under the convention that :meth:`before_step` of all hooks should only
+    take a negligible amount of time, the :class:`IterationTimer` hook should be
+ placed at the beginning of the list of hooks to obtain accurate timing.
+ """
+
+ def __init__(self, warmup_iter=3):
+ """
+ Args:
+ warmup_iter (int): the number of iterations at the beginning to exclude
+ from timing.
+ """
+ self._warmup_iter = warmup_iter
+ self._step_timer = Timer()
+ self._start_time = time.perf_counter()
+ self._total_timer = Timer()
+
+ def before_train(self):
+ self._start_time = time.perf_counter()
+ self._total_timer.reset()
+ self._total_timer.pause()
+
+ def after_train(self):
+ logger = logging.getLogger(__name__)
+ total_time = time.perf_counter() - self._start_time
+ total_time_minus_hooks = self._total_timer.seconds()
+ hook_time = total_time - total_time_minus_hooks
+
+ num_iter = self.trainer.storage.iter + 1 - self.trainer.start_iter - self._warmup_iter
+
+ if num_iter > 0 and total_time_minus_hooks > 0:
+ # Speed is meaningful only after warmup
+ # NOTE this format is parsed by grep in some scripts
+ logger.info(
+ "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
+ num_iter,
+ str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
+ total_time_minus_hooks / num_iter,
+ )
+ )
+
+ logger.info(
+ "Total training time: {} ({} on hooks)".format(
+ str(datetime.timedelta(seconds=int(total_time))),
+ str(datetime.timedelta(seconds=int(hook_time))),
+ )
+ )
+
+ def before_step(self):
+ self._step_timer.reset()
+ self._total_timer.resume()
+
+ def after_step(self):
+ # +1 because we're in after_step, the current step is done
+ # but not yet counted
+ iter_done = self.trainer.storage.iter - self.trainer.start_iter + 1
+ if iter_done >= self._warmup_iter:
+ sec = self._step_timer.seconds()
+ self.trainer.storage.put_scalars(time=sec)
+ else:
+ self._start_time = time.perf_counter()
+ self._total_timer.reset()
+
+ self._total_timer.pause()
+
+
+class PeriodicWriter(HookBase):
+ """
+ Write events to EventStorage (by calling ``writer.write()``) periodically.
+
+ It is executed every ``period`` iterations and after the last iteration.
+ Note that ``period`` does not affect how data is smoothed by each writer.
+ """
+
+ def __init__(self, writers, period=20):
+ """
+ Args:
+ writers (list[EventWriter]): a list of EventWriter objects
+ period (int):
+ """
+ self._writers = writers
+ for w in writers:
+ assert isinstance(w, EventWriter), w
+ self._period = period
+
+ def after_step(self):
+ if (self.trainer.iter + 1) % self._period == 0 or (
+ self.trainer.iter == self.trainer.max_iter - 1
+ ):
+ for writer in self._writers:
+ writer.write()
+
+ def after_train(self):
+ for writer in self._writers:
+ # If any new data is found (e.g. produced by other after_train),
+ # write them before closing
+ writer.write()
+ writer.close()
+
+
+class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
+ """
+ Same as :class:`detectron2.checkpoint.PeriodicCheckpointer`, but as a hook.
+
+ Note that when used as a hook,
+ it is unable to save additional data other than what's defined
+ by the given `checkpointer`.
+
+ It is executed every ``period`` iterations and after the last iteration.
+ """
+
+ def before_train(self):
+ self.max_iter = self.trainer.max_iter
+
+ def after_step(self):
+ # No way to use **kwargs
+ self.step(self.trainer.iter)
+
+
+class BestCheckpointer(HookBase):
+ """
+    Checkpoints the best weights based on a given metric.
+
+    This hook should be used in conjunction with, and executed after, the hook
+ that produces the metric, e.g. `EvalHook`.
+ """
+
+ def __init__(
+ self,
+ eval_period: int,
+ checkpointer: Checkpointer,
+ val_metric: str,
+ mode: str = "max",
+ file_prefix: str = "model_best",
+ ) -> None:
+ """
+ Args:
+ eval_period (int): the period `EvalHook` is set to run.
+ checkpointer: the checkpointer object used to save checkpoints.
+ val_metric (str): validation metric to track for best checkpoint, e.g. "bbox/AP50"
+ mode (str): one of {'max', 'min'}. controls whether the chosen val metric should be
+ maximized or minimized, e.g. for "bbox/AP50" it should be "max"
+ file_prefix (str): the prefix of checkpoint's filename, defaults to "model_best"
+ """
+ self._logger = logging.getLogger(__name__)
+ self._period = eval_period
+ self._val_metric = val_metric
+ assert mode in [
+ "max",
+ "min",
+ ], f'Mode "{mode}" to `BestCheckpointer` is unknown. It should be one of {"max", "min"}.'
+ if mode == "max":
+ self._compare = operator.gt
+ else:
+ self._compare = operator.lt
+ self._checkpointer = checkpointer
+ self._file_prefix = file_prefix
+ self.best_metric = None
+ self.best_iter = None
+
+ def _update_best(self, val, iteration):
+ if math.isnan(val) or math.isinf(val):
+ return False
+ self.best_metric = val
+ self.best_iter = iteration
+ return True
+
+ def _best_checking(self):
+ metric_tuple = self.trainer.storage.latest().get(self._val_metric)
+ if metric_tuple is None:
+ self._logger.warning(
+ f"Given val metric {self._val_metric} does not seem to be computed/stored."
+ "Will not be checkpointing based on it."
+ )
+ return
+ else:
+ latest_metric, metric_iter = metric_tuple
+
+ if self.best_metric is None:
+ if self._update_best(latest_metric, metric_iter):
+ additional_state = {"iteration": metric_iter}
+ self._checkpointer.save(f"{self._file_prefix}", **additional_state)
+ self._logger.info(
+ f"Saved first model at {self.best_metric:0.5f} @ {self.best_iter} steps"
+ )
+ elif self._compare(latest_metric, self.best_metric):
+ additional_state = {"iteration": metric_iter}
+ self._checkpointer.save(f"{self._file_prefix}", **additional_state)
+ self._logger.info(
+ f"Saved best model as latest eval score for {self._val_metric} is "
+ f"{latest_metric:0.5f}, better than last best score "
+ f"{self.best_metric:0.5f} @ iteration {self.best_iter}."
+ )
+ self._update_best(latest_metric, metric_iter)
+ else:
+ self._logger.info(
+ f"Not saving as latest eval score for {self._val_metric} is {latest_metric:0.5f}, "
+ f"not better than best score {self.best_metric:0.5f} @ iteration {self.best_iter}."
+ )
+
+ def after_step(self):
+ # same conditions as `EvalHook`
+ next_iter = self.trainer.iter + 1
+ if (
+ self._period > 0
+ and next_iter % self._period == 0
+ and next_iter != self.trainer.max_iter
+ ):
+ self._best_checking()
+
+ def after_train(self):
+ # same conditions as `EvalHook`
+ if self.trainer.iter + 1 >= self.trainer.max_iter:
+ self._best_checking()
+
+
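+# Illustrative sketch (documentation only): pairing BestCheckpointer with an EvalHook
+# inside a trainer's build_hooks(). "bbox/AP" is an example metric name and must match
+# a key produced by the evaluation function; ``trainer`` comes from the caller.
+def _example_best_checkpointer_hooks(trainer, eval_period, eval_fn):
+    return [
+        EvalHook(eval_period, eval_fn),
+        # Must run after the EvalHook so the metric is already in the trainer's storage.
+        BestCheckpointer(eval_period, trainer.checkpointer, "bbox/AP", mode="max"),
+    ]
+
+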
+class LRScheduler(HookBase):
+ """
+ A hook which executes a torch builtin LR scheduler and summarizes the LR.
+ It is executed after every iteration.
+ """
+
+ def __init__(self, optimizer=None, scheduler=None):
+ """
+ Args:
+ optimizer (torch.optim.Optimizer):
+ scheduler (torch.optim.LRScheduler or fvcore.common.param_scheduler.ParamScheduler):
+ if a :class:`ParamScheduler` object, it defines the multiplier over the base LR
+ in the optimizer.
+
+ If any argument is not given, will try to obtain it from the trainer.
+ """
+ self._optimizer = optimizer
+ self._scheduler = scheduler
+
+ def before_train(self):
+ self._optimizer = self._optimizer or self.trainer.optimizer
+ if isinstance(self.scheduler, ParamScheduler):
+ self._scheduler = LRMultiplier(
+ self._optimizer,
+ self.scheduler,
+ self.trainer.max_iter,
+ last_iter=self.trainer.iter - 1,
+ )
+ self._best_param_group_id = LRScheduler.get_best_param_group_id(self._optimizer)
+
+ @staticmethod
+ def get_best_param_group_id(optimizer):
+ # NOTE: some heuristics on what LR to summarize
+        # summarize the param group with the most parameters
+ largest_group = max(len(g["params"]) for g in optimizer.param_groups)
+
+ if largest_group == 1:
+ # If all groups have one parameter,
+ # then find the most common initial LR, and use it for summary
+ lr_count = Counter([g["lr"] for g in optimizer.param_groups])
+ lr = lr_count.most_common()[0][0]
+ for i, g in enumerate(optimizer.param_groups):
+ if g["lr"] == lr:
+ return i
+ else:
+ for i, g in enumerate(optimizer.param_groups):
+ if len(g["params"]) == largest_group:
+ return i
+
+ def after_step(self):
+ lr = self._optimizer.param_groups[self._best_param_group_id]["lr"]
+ self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
+ self.scheduler.step()
+
+ @property
+ def scheduler(self):
+ return self._scheduler or self.trainer.scheduler
+
+ def state_dict(self):
+ if isinstance(self.scheduler, _LRScheduler):
+ return self.scheduler.state_dict()
+ return {}
+
+ def load_state_dict(self, state_dict):
+ if isinstance(self.scheduler, _LRScheduler):
+ logger = logging.getLogger(__name__)
+ logger.info("Loading scheduler from state_dict ...")
+ self.scheduler.load_state_dict(state_dict)
+
+
+class TorchProfiler(HookBase):
+ """
+ A hook which runs `torch.profiler.profile`.
+
+ Examples:
+ ::
+ hooks.TorchProfiler(
+ lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR
+ )
+
+    The above example will run the profiler for iterations 10~20 and dump
+ results to ``OUTPUT_DIR``. We did not profile the first few iterations
+ because they are typically slower than the rest.
+    The result files can be loaded in the ``chrome://tracing`` page of the Chrome
+    browser, and the tensorboard visualizations can be viewed with
+    ``tensorboard --logdir OUTPUT_DIR/log``.
+ """
+
+ def __init__(self, enable_predicate, output_dir, *, activities=None, save_tensorboard=True):
+ """
+ Args:
+ enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
+ and returns whether to enable the profiler.
+ It will be called once every step, and can be used to select which steps to profile.
+ output_dir (str): the output directory to dump tracing files.
+ activities (iterable): same as in `torch.profiler.profile`.
+ save_tensorboard (bool): whether to save tensorboard visualizations at (output_dir)/log/
+ """
+ self._enable_predicate = enable_predicate
+ self._activities = activities
+ self._output_dir = output_dir
+ self._save_tensorboard = save_tensorboard
+
+ def before_step(self):
+ if self._enable_predicate(self.trainer):
+ if self._save_tensorboard:
+ on_trace_ready = torch.profiler.tensorboard_trace_handler(
+ os.path.join(
+ self._output_dir,
+ "log",
+ "profiler-tensorboard-iter{}".format(self.trainer.iter),
+ ),
+ f"worker{comm.get_rank()}",
+ )
+ else:
+ on_trace_ready = None
+ self._profiler = torch.profiler.profile(
+ activities=self._activities,
+ on_trace_ready=on_trace_ready,
+ record_shapes=True,
+ profile_memory=True,
+ with_stack=True,
+ with_flops=True,
+ )
+ self._profiler.__enter__()
+ else:
+ self._profiler = None
+
+ def after_step(self):
+ if self._profiler is None:
+ return
+ self._profiler.__exit__(None, None, None)
+ if not self._save_tensorboard:
+ PathManager.mkdirs(self._output_dir)
+ out_file = os.path.join(
+ self._output_dir, "profiler-trace-iter{}.json".format(self.trainer.iter)
+ )
+ if "://" not in out_file:
+ self._profiler.export_chrome_trace(out_file)
+ else:
+ # Support non-posix filesystems
+ with tempfile.TemporaryDirectory(prefix="detectron2_profiler") as d:
+ tmp_file = os.path.join(d, "tmp.json")
+ self._profiler.export_chrome_trace(tmp_file)
+ with open(tmp_file) as f:
+ content = f.read()
+ with PathManager.open(out_file, "w") as f:
+ f.write(content)
+
+
+class AutogradProfiler(TorchProfiler):
+ """
+ A hook which runs `torch.autograd.profiler.profile`.
+
+ Examples:
+ ::
+ hooks.AutogradProfiler(
+ lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR
+ )
+
+    The above example will run the profiler for iterations 10~20 and dump
+ results to ``OUTPUT_DIR``. We did not profile the first few iterations
+ because they are typically slower than the rest.
+    The result files can be loaded in the ``chrome://tracing`` page of the Chrome browser.
+
+ Note:
+        When used together with NCCL on older versions of GPUs,
+ autograd profiler may cause deadlock because it unnecessarily allocates
+ memory on every device it sees. The memory management calls, if
+ interleaved with NCCL calls, lead to deadlock on GPUs that do not
+ support ``cudaLaunchCooperativeKernelMultiDevice``.
+ """
+
+ def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
+ """
+ Args:
+ enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
+ and returns whether to enable the profiler.
+ It will be called once every step, and can be used to select which steps to profile.
+ output_dir (str): the output directory to dump tracing files.
+ use_cuda (bool): same as in `torch.autograd.profiler.profile`.
+ """
+ warnings.warn("AutogradProfiler has been deprecated in favor of TorchProfiler.")
+ self._enable_predicate = enable_predicate
+ self._use_cuda = use_cuda
+ self._output_dir = output_dir
+
+ def before_step(self):
+ if self._enable_predicate(self.trainer):
+ self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda)
+ self._profiler.__enter__()
+ else:
+ self._profiler = None
+
+
+class EvalHook(HookBase):
+ """
+ Run an evaluation function periodically, and at the end of training.
+
+ It is executed every ``eval_period`` iterations and after the last iteration.
+ """
+
+ def __init__(self, eval_period, eval_function, eval_after_train=True):
+ """
+ Args:
+ eval_period (int): the period to run `eval_function`. Set to 0 to
+ not evaluate periodically (but still evaluate after the last iteration
+ if `eval_after_train` is True).
+ eval_function (callable): a function which takes no arguments, and
+ returns a nested dict of evaluation metrics.
+ eval_after_train (bool): whether to evaluate after the last iteration
+
+ Note:
+            This hook must be enabled in either all or no workers.
+ If you would like only certain workers to perform evaluation,
+ give other workers a no-op function (`eval_function=lambda: None`).
+ """
+ self._period = eval_period
+ self._func = eval_function
+ self._eval_after_train = eval_after_train
+
+ def _do_eval(self):
+ results = self._func()
+
+ if results:
+ assert isinstance(
+ results, dict
+ ), "Eval function must return a dict. Got {} instead.".format(results)
+
+ flattened_results = flatten_results_dict(results)
+ for k, v in flattened_results.items():
+ try:
+ v = float(v)
+ except Exception as e:
+ raise ValueError(
+ "[EvalHook] eval_function should return a nested dict of float. "
+ "Got '{}: {}' instead.".format(k, v)
+ ) from e
+ self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
+
+        # Evaluation may take different amounts of time among workers.
+        # A barrier makes them start the next iteration together.
+ comm.synchronize()
+
+ def after_step(self):
+ next_iter = self.trainer.iter + 1
+ if self._period > 0 and next_iter % self._period == 0:
+ # do the last eval in after_train
+ if next_iter != self.trainer.max_iter:
+ self._do_eval()
+
+ def after_train(self):
+ # This condition is to prevent the eval from running after a failed training
+ if self._eval_after_train and self.trainer.iter + 1 >= self.trainer.max_iter:
+ self._do_eval()
+ # func is likely a closure that holds reference to the trainer
+ # therefore we clean it to avoid circular reference in the end
+ del self._func
+
+
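+# Illustrative sketch (documentation only): a minimal eval_function for EvalHook. The
+# returned nested dict of floats is flattened to keys such as "segm/AP" before being
+# written to storage; the metric names and values are example assumptions.
+def _example_eval_hook(period=5000):
+    def _evaluate():
+        # Run inference however the caller prefers and return nested dicts of floats.
+        return {"segm": {"AP": 41.2, "AP50": 62.8}}
+
+    return EvalHook(period, _evaluate)
+
+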
+class PreciseBN(HookBase):
+ """
+ The standard implementation of BatchNorm uses EMA in inference, which is
+ sometimes suboptimal.
+ This class computes the true average of statistics rather than the moving average,
+    and puts the true averages into every BN layer in the given model.
+
+ It is executed every ``period`` iterations and after the last iteration.
+ """
+
+ def __init__(self, period, model, data_loader, num_iter):
+ """
+ Args:
+ period (int): the period this hook is run, or 0 to not run during training.
+                The hook will always run at the end of training.
+            model (nn.Module): a module whose BN layers in training mode will be
+                updated by precise BN.
+                Note that the user is responsible for ensuring that the BN layers to be
+                updated are in training mode when this hook is triggered.
+ data_loader (iterable): it will produce data to be run by `model(data)`.
+ num_iter (int): number of iterations used to compute the precise
+ statistics.
+ """
+ self._logger = logging.getLogger(__name__)
+ if len(get_bn_modules(model)) == 0:
+ self._logger.info(
+ "PreciseBN is disabled because model does not contain BN layers in training mode."
+ )
+ self._disabled = True
+ return
+
+ self._model = model
+ self._data_loader = data_loader
+ self._num_iter = num_iter
+ self._period = period
+ self._disabled = False
+
+ self._data_iter = None
+
+ def after_step(self):
+ next_iter = self.trainer.iter + 1
+ is_final = next_iter == self.trainer.max_iter
+ if is_final or (self._period > 0 and next_iter % self._period == 0):
+ self.update_stats()
+
+ def update_stats(self):
+ """
+ Update the model with precise statistics. Users can manually call this method.
+ """
+ if self._disabled:
+ return
+
+ if self._data_iter is None:
+ self._data_iter = iter(self._data_loader)
+
+ def data_loader():
+ for num_iter in itertools.count(1):
+ if num_iter % 100 == 0:
+ self._logger.info(
+ "Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter)
+ )
+ # This way we can reuse the same iterator
+ yield next(self._data_iter)
+
+ with EventStorage(): # capture events in a new storage to discard them
+ self._logger.info(
+ "Running precise-BN for {} iterations... ".format(self._num_iter)
+ + "Note that this could produce different statistics every time."
+ )
+ update_bn_stats(self._model, data_loader(), self._num_iter)
+
+
+class TorchMemoryStats(HookBase):
+ """
+ Writes pytorch's cuda memory statistics periodically.
+ """
+
+ def __init__(self, period=20, max_runs=10):
+ """
+ Args:
+            period (int): Output stats every 'period' iterations
+            max_runs (int): Stop the logging after 'max_runs' outputs
+ """
+
+ self._logger = logging.getLogger(__name__)
+ self._period = period
+ self._max_runs = max_runs
+ self._runs = 0
+
+ def after_step(self):
+ if self._runs > self._max_runs:
+ return
+
+ if (self.trainer.iter + 1) % self._period == 0 or (
+ self.trainer.iter == self.trainer.max_iter - 1
+ ):
+ if torch.cuda.is_available():
+ max_reserved_mb = torch.cuda.max_memory_reserved() / 1024.0 / 1024.0
+ reserved_mb = torch.cuda.memory_reserved() / 1024.0 / 1024.0
+ max_allocated_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
+ allocated_mb = torch.cuda.memory_allocated() / 1024.0 / 1024.0
+
+ self._logger.info(
+ (
+ " iter: {} "
+ " max_reserved_mem: {:.0f}MB "
+ " reserved_mem: {:.0f}MB "
+ " max_allocated_mem: {:.0f}MB "
+ " allocated_mem: {:.0f}MB "
+ ).format(
+ self.trainer.iter,
+ max_reserved_mb,
+ reserved_mb,
+ max_allocated_mb,
+ allocated_mb,
+ )
+ )
+
+ self._runs += 1
+ if self._runs == self._max_runs:
+ mem_summary = torch.cuda.memory_summary()
+ self._logger.info("\n" + mem_summary)
+
+ torch.cuda.reset_peak_memory_stats()
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/engine/launch.py b/comfyui_controlnet_aux/src/custom_detectron2/engine/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e056d5151c4c9d3fd3914051fc234df9fed02086
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/engine/launch.py
@@ -0,0 +1,123 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from datetime import timedelta
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from custom_detectron2.utils import comm
+
+__all__ = ["DEFAULT_TIMEOUT", "launch"]
+
+DEFAULT_TIMEOUT = timedelta(minutes=30)
+
+
+def _find_free_port():
+ import socket
+
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ # Binding to port 0 will cause the OS to find an available port for us
+ sock.bind(("", 0))
+ port = sock.getsockname()[1]
+ sock.close()
+ # NOTE: there is still a chance the port could be taken by other processes.
+ return port
+
+
+def launch(
+ main_func,
+ # Should be num_processes_per_machine, but kept for compatibility.
+ num_gpus_per_machine,
+ num_machines=1,
+ machine_rank=0,
+ dist_url=None,
+ args=(),
+ timeout=DEFAULT_TIMEOUT,
+):
+ """
+ Launch multi-process or distributed training.
+ This function must be called on all machines involved in the training.
+ It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.
+
+ Args:
+ main_func: a function that will be called by `main_func(*args)`
+ num_gpus_per_machine (int): number of processes per machine. When
+ using GPUs, this should be the number of GPUs.
+ num_machines (int): the total number of machines
+ machine_rank (int): the rank of this machine
+ dist_url (str): url to connect to for distributed jobs, including protocol
+ e.g. "tcp://127.0.0.1:8686".
+ Can be set to "auto" to automatically select a free port on localhost
+ timeout (timedelta): timeout of the distributed workers
+ args (tuple): arguments passed to main_func
+ """
+ world_size = num_machines * num_gpus_per_machine
+ if world_size > 1:
+ # https://github.com/pytorch/pytorch/pull/14391
+ # TODO prctl in spawned processes
+
+ if dist_url == "auto":
+ assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs."
+ port = _find_free_port()
+ dist_url = f"tcp://127.0.0.1:{port}"
+ if num_machines > 1 and dist_url.startswith("file://"):
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ "file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://"
+ )
+
+ mp.start_processes(
+ _distributed_worker,
+ nprocs=num_gpus_per_machine,
+ args=(
+ main_func,
+ world_size,
+ num_gpus_per_machine,
+ machine_rank,
+ dist_url,
+ args,
+ timeout,
+ ),
+ daemon=False,
+ )
+ else:
+ main_func(*args)
+
+
+def _distributed_worker(
+ local_rank,
+ main_func,
+ world_size,
+ num_gpus_per_machine,
+ machine_rank,
+ dist_url,
+ args,
+ timeout=DEFAULT_TIMEOUT,
+):
+ has_gpu = torch.cuda.is_available()
+ if has_gpu:
+ assert num_gpus_per_machine <= torch.cuda.device_count()
+ global_rank = machine_rank * num_gpus_per_machine + local_rank
+ try:
+ dist.init_process_group(
+ backend="NCCL" if has_gpu else "GLOO",
+ init_method=dist_url,
+ world_size=world_size,
+ rank=global_rank,
+ timeout=timeout,
+ )
+ except Exception as e:
+ logger = logging.getLogger(__name__)
+ logger.error("Process group URL: {}".format(dist_url))
+ raise e
+
+ # Setup the local process group.
+ comm.create_local_process_group(num_gpus_per_machine)
+ if has_gpu:
+ torch.cuda.set_device(local_rank)
+
+ # synchronize is needed here to prevent a possible timeout after calling init_process_group
+ # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
+ comm.synchronize()
+
+ main_func(*args)
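+
+
+# Illustrative usage sketch, mirroring how detectron2-style training scripts
+# drive ``launch``; ``main`` and ``args`` are hypothetical placeholders.
+#
+#   def main(args):
+#       ...  # build a trainer and run training here
+#
+#   launch(
+#       main,
+#       num_gpus_per_machine=8,
+#       num_machines=1,
+#       machine_rank=0,
+#       dist_url="auto",
+#       args=(args,),
+#   )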
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/engine/train_loop.py b/comfyui_controlnet_aux/src/custom_detectron2/engine/train_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..28a214968d220578b34e3cda641a63a2cf140dd5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/engine/train_loop.py
@@ -0,0 +1,469 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+import numpy as np
+import time
+import weakref
+from typing import List, Mapping, Optional
+import torch
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+
+import custom_detectron2.utils.comm as comm
+from custom_detectron2.utils.events import EventStorage, get_event_storage
+from custom_detectron2.utils.logger import _log_api_usage
+
+__all__ = ["HookBase", "TrainerBase", "SimpleTrainer", "AMPTrainer"]
+
+
+class HookBase:
+ """
+ Base class for hooks that can be registered with :class:`TrainerBase`.
+
+ Each hook can implement 4 methods. The way they are called is demonstrated
+ in the following snippet:
+ ::
+ hook.before_train()
+ for iter in range(start_iter, max_iter):
+ hook.before_step()
+ trainer.run_step()
+ hook.after_step()
+ iter += 1
+ hook.after_train()
+
+ Notes:
+ 1. In the hook method, users can access ``self.trainer`` to access more
+ properties about the context (e.g., model, current iteration, or config
+ if using :class:`DefaultTrainer`).
+
+ 2. A hook that does something in :meth:`before_step` can often be
+ implemented equivalently in :meth:`after_step`.
+ If the hook takes non-trivial time, it is strongly recommended to
+ implement the hook in :meth:`after_step` instead of :meth:`before_step`.
+ The convention is that :meth:`before_step` should only take negligible time.
+
+ Following this convention will allow hooks that do care about the difference
+ between :meth:`before_step` and :meth:`after_step` (e.g., timer) to
+ function properly.
+
+ """
+
+ trainer: "TrainerBase" = None
+ """
+ A weak reference to the trainer object. Set by the trainer when the hook is registered.
+ """
+
+ def before_train(self):
+ """
+ Called before the first iteration.
+ """
+ pass
+
+ def after_train(self):
+ """
+ Called after the last iteration.
+ """
+ pass
+
+ def before_step(self):
+ """
+ Called before each iteration.
+ """
+ pass
+
+ def after_backward(self):
+ """
+ Called after the backward pass of each iteration.
+ """
+ pass
+
+ def after_step(self):
+ """
+ Called after each iteration.
+ """
+ pass
+
+ def state_dict(self):
+ """
+ Hooks are stateless by default, but can be made checkpointable by
+ implementing `state_dict` and `load_state_dict`.
+ """
+ return {}
+
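+# Illustrative sketch of the contract documented above (hypothetical hook, not
+# part of this module): time each step from ``before_step`` to ``after_step``
+# and log it through the trainer's ``EventStorage``.
+#
+#   class StepTimerHook(HookBase):
+#       def before_step(self):
+#           self._step_start = time.perf_counter()
+#
+#       def after_step(self):
+#           # ``self.trainer`` is the weak proxy set by ``register_hooks``.
+#           elapsed = time.perf_counter() - self._step_start
+#           self.trainer.storage.put_scalar("step_time", elapsed)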
+
+class TrainerBase:
+ """
+ Base class for iterative trainer with hooks.
+
+    The only assumption we make here is that the training runs in a loop.
+    A subclass can implement what the loop is.
+    We make no assumptions about the existence of a dataloader, optimizer, model, etc.
+
+ Attributes:
+ iter(int): the current iteration.
+
+ start_iter(int): The iteration to start with.
+ By convention the minimum possible value is 0.
+
+ max_iter(int): The iteration to end training.
+
+ storage(EventStorage): An EventStorage that's opened during the course of training.
+ """
+
+ def __init__(self) -> None:
+ self._hooks: List[HookBase] = []
+ self.iter: int = 0
+ self.start_iter: int = 0
+ self.max_iter: int
+ self.storage: EventStorage
+ _log_api_usage("trainer." + self.__class__.__name__)
+
+ def register_hooks(self, hooks: List[Optional[HookBase]]) -> None:
+ """
+ Register hooks to the trainer. The hooks are executed in the order
+ they are registered.
+
+ Args:
+ hooks (list[Optional[HookBase]]): list of hooks
+ """
+ hooks = [h for h in hooks if h is not None]
+ for h in hooks:
+ assert isinstance(h, HookBase)
+ # To avoid circular reference, hooks and trainer cannot own each other.
+ # This normally does not matter, but will cause memory leak if the
+ # involved objects contain __del__:
+ # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
+ h.trainer = weakref.proxy(self)
+ self._hooks.extend(hooks)
+
+ def train(self, start_iter: int, max_iter: int):
+ """
+ Args:
+ start_iter, max_iter (int): See docs above
+ """
+ logger = logging.getLogger(__name__)
+ logger.info("Starting training from iteration {}".format(start_iter))
+
+ self.iter = self.start_iter = start_iter
+ self.max_iter = max_iter
+
+ with EventStorage(start_iter) as self.storage:
+ try:
+ self.before_train()
+ for self.iter in range(start_iter, max_iter):
+ self.before_step()
+ self.run_step()
+ self.after_step()
+ # self.iter == max_iter can be used by `after_train` to
+ # tell whether the training successfully finished or failed
+ # due to exceptions.
+ self.iter += 1
+ except Exception:
+ logger.exception("Exception during training:")
+ raise
+ finally:
+ self.after_train()
+
+ def before_train(self):
+ for h in self._hooks:
+ h.before_train()
+
+ def after_train(self):
+ self.storage.iter = self.iter
+ for h in self._hooks:
+ h.after_train()
+
+ def before_step(self):
+ # Maintain the invariant that storage.iter == trainer.iter
+ # for the entire execution of each step
+ self.storage.iter = self.iter
+
+ for h in self._hooks:
+ h.before_step()
+
+ def after_backward(self):
+ for h in self._hooks:
+ h.after_backward()
+
+ def after_step(self):
+ for h in self._hooks:
+ h.after_step()
+
+ def run_step(self):
+ raise NotImplementedError
+
+ def state_dict(self):
+ ret = {"iteration": self.iter}
+ hooks_state = {}
+ for h in self._hooks:
+ sd = h.state_dict()
+ if sd:
+ name = type(h).__qualname__
+ if name in hooks_state:
+ # TODO handle repetitive stateful hooks
+ continue
+ hooks_state[name] = sd
+ if hooks_state:
+ ret["hooks"] = hooks_state
+ return ret
+
+ def load_state_dict(self, state_dict):
+ logger = logging.getLogger(__name__)
+ self.iter = state_dict["iteration"]
+ for key, value in state_dict.get("hooks", {}).items():
+ for h in self._hooks:
+ try:
+ name = type(h).__qualname__
+ except AttributeError:
+ continue
+ if name == key:
+ h.load_state_dict(value)
+ break
+ else:
+ logger.warning(f"Cannot find the hook '{key}', its state_dict is ignored.")
+
+
+class SimpleTrainer(TrainerBase):
+ """
+ A simple trainer for the most common type of task:
+ single-cost single-optimizer single-data-source iterative optimization,
+ optionally using data-parallelism.
+ It assumes that every step, you:
+
+    1. Compute the loss with data from the data_loader.
+ 2. Compute the gradients with the above loss.
+ 3. Update the model with the optimizer.
+
+ All other tasks during training (checkpointing, logging, evaluation, LR schedule)
+ are maintained by hooks, which can be registered by :meth:`TrainerBase.register_hooks`.
+
+ If you want to do anything fancier than this,
+ either subclass TrainerBase and implement your own `run_step`,
+ or write your own training loop.
+ """
+
+ def __init__(self, model, data_loader, optimizer, gather_metric_period=1):
+ """
+ Args:
+ model: a torch Module. Takes a data from data_loader and returns a
+ dict of losses.
+ data_loader: an iterable. Contains data to be used to call model.
+ optimizer: a torch optimizer.
+ gather_metric_period: an int. Every gather_metric_period iterations
+ the metrics are gathered from all the ranks to rank 0 and logged.
+ """
+ super().__init__()
+
+ """
+ We set the model to training mode in the trainer.
+    However, it's valid to train a model that's in eval mode.
+ If you want your model (or a submodule of it) to behave
+ like evaluation during training, you can overwrite its train() method.
+ """
+ model.train()
+
+ self.model = model
+ self.data_loader = data_loader
+ # to access the data loader iterator, call `self._data_loader_iter`
+ self._data_loader_iter_obj = None
+ self.optimizer = optimizer
+ self.gather_metric_period = gather_metric_period
+
+ def run_step(self):
+ """
+ Implement the standard training logic described above.
+ """
+ assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
+ start = time.perf_counter()
+ """
+ If you want to do something with the data, you can wrap the dataloader.
+ """
+ data = next(self._data_loader_iter)
+ data_time = time.perf_counter() - start
+
+ """
+ If you want to do something with the losses, you can wrap the model.
+ """
+ loss_dict = self.model(data)
+ if isinstance(loss_dict, torch.Tensor):
+ losses = loss_dict
+ loss_dict = {"total_loss": loss_dict}
+ else:
+ losses = sum(loss_dict.values())
+
+ """
+ If you need to accumulate gradients or do something similar, you can
+ wrap the optimizer with your custom `zero_grad()` method.
+ """
+ self.optimizer.zero_grad()
+ losses.backward()
+
+ self.after_backward()
+
+ self._write_metrics(loss_dict, data_time)
+
+ """
+ If you need gradient clipping/scaling or other processing, you can
+ wrap the optimizer with your custom `step()` method. But it is
+ suboptimal as explained in https://arxiv.org/abs/2006.15704 Sec 3.2.4
+ """
+ self.optimizer.step()
+
+ @property
+ def _data_loader_iter(self):
+ # only create the data loader iterator when it is used
+ if self._data_loader_iter_obj is None:
+ self._data_loader_iter_obj = iter(self.data_loader)
+ return self._data_loader_iter_obj
+
+ def reset_data_loader(self, data_loader_builder):
+ """
+ Delete and replace the current data loader with a new one, which will be created
+ by calling `data_loader_builder` (without argument).
+ """
+ del self.data_loader
+ data_loader = data_loader_builder()
+ self.data_loader = data_loader
+ self._data_loader_iter_obj = None
+
+ def _write_metrics(
+ self,
+ loss_dict: Mapping[str, torch.Tensor],
+ data_time: float,
+ prefix: str = "",
+ ) -> None:
+ if (self.iter + 1) % self.gather_metric_period == 0:
+ SimpleTrainer.write_metrics(loss_dict, data_time, prefix)
+
+ @staticmethod
+ def write_metrics(
+ loss_dict: Mapping[str, torch.Tensor],
+ data_time: float,
+ prefix: str = "",
+ ) -> None:
+ """
+ Args:
+ loss_dict (dict): dict of scalar losses
+ data_time (float): time taken by the dataloader iteration
+ prefix (str): prefix for logging keys
+ """
+ metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
+ metrics_dict["data_time"] = data_time
+
+ # Gather metrics among all workers for logging
+ # This assumes we do DDP-style training, which is currently the only
+ # supported method in detectron2.
+ all_metrics_dict = comm.gather(metrics_dict)
+
+ if comm.is_main_process():
+ storage = get_event_storage()
+
+ # data_time among workers can have high variance. The actual latency
+ # caused by data_time is the maximum among workers.
+ data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
+ storage.put_scalar("data_time", data_time)
+
+ # average the rest metrics
+ metrics_dict = {
+ k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
+ }
+ total_losses_reduced = sum(metrics_dict.values())
+ if not np.isfinite(total_losses_reduced):
+ raise FloatingPointError(
+ f"Loss became infinite or NaN at iteration={storage.iter}!\n"
+ f"loss_dict = {metrics_dict}"
+ )
+
+ storage.put_scalar("{}total_loss".format(prefix), total_losses_reduced)
+ if len(metrics_dict) > 1:
+ storage.put_scalars(**metrics_dict)
+
+ def state_dict(self):
+ ret = super().state_dict()
+ ret["optimizer"] = self.optimizer.state_dict()
+ return ret
+
+ def load_state_dict(self, state_dict):
+ super().load_state_dict(state_dict)
+ self.optimizer.load_state_dict(state_dict["optimizer"])
+
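+# Minimal illustrative sketch (hypothetical ``build_model``/``build_loader``
+# helpers): the model's forward must return a dict of scalar loss tensors.
+#
+#   model = build_model()
+#   optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+#   trainer = SimpleTrainer(model, build_loader(), optimizer)
+#   trainer.train(start_iter=0, max_iter=90000)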
+
+class AMPTrainer(SimpleTrainer):
+ """
+ Like :class:`SimpleTrainer`, but uses PyTorch's native automatic mixed precision
+ in the training loop.
+ """
+
+ def __init__(
+ self,
+ model,
+ data_loader,
+ optimizer,
+ gather_metric_period=1,
+ grad_scaler=None,
+ precision: torch.dtype = torch.float16,
+ log_grad_scaler: bool = False,
+ ):
+ """
+ Args:
+ model, data_loader, optimizer, gather_metric_period: same as in :class:`SimpleTrainer`.
+ grad_scaler: torch GradScaler to automatically scale gradients.
+ precision: torch.dtype as the target precision to cast to in computations
+ """
+ unsupported = "AMPTrainer does not support single-process multi-device training!"
+ if isinstance(model, DistributedDataParallel):
+ assert not (model.device_ids and len(model.device_ids) > 1), unsupported
+ assert not isinstance(model, DataParallel), unsupported
+
+ super().__init__(model, data_loader, optimizer, gather_metric_period)
+
+ if grad_scaler is None:
+ from torch.cuda.amp import GradScaler
+
+ grad_scaler = GradScaler()
+ self.grad_scaler = grad_scaler
+ self.precision = precision
+ self.log_grad_scaler = log_grad_scaler
+
+ def run_step(self):
+ """
+ Implement the AMP training logic.
+ """
+ assert self.model.training, "[AMPTrainer] model was changed to eval mode!"
+ assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!"
+ from torch.cuda.amp import autocast
+
+ start = time.perf_counter()
+ data = next(self._data_loader_iter)
+ data_time = time.perf_counter() - start
+
+ with autocast(dtype=self.precision):
+ loss_dict = self.model(data)
+ if isinstance(loss_dict, torch.Tensor):
+ losses = loss_dict
+ loss_dict = {"total_loss": loss_dict}
+ else:
+ losses = sum(loss_dict.values())
+
+ self.optimizer.zero_grad()
+ self.grad_scaler.scale(losses).backward()
+
+ if self.log_grad_scaler:
+ storage = get_event_storage()
+ storage.put_scalar("[metric]grad_scaler", self.grad_scaler.get_scale())
+
+ self.after_backward()
+
+ self._write_metrics(loss_dict, data_time)
+
+ self.grad_scaler.step(self.optimizer)
+ self.grad_scaler.update()
+
+ def state_dict(self):
+ ret = super().state_dict()
+ ret["grad_scaler"] = self.grad_scaler.state_dict()
+ return ret
+
+ def load_state_dict(self, state_dict):
+ super().load_state_dict(state_dict)
+ self.grad_scaler.load_state_dict(state_dict["grad_scaler"])
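+
+
+# Illustrative sketch: ``AMPTrainer`` is a drop-in replacement for
+# ``SimpleTrainer`` when mixed precision is wanted. ``model``, ``data_loader``
+# and ``optimizer`` are assumed to exist; the scaler argument is optional and
+# shown only to make the assumption explicit.
+#
+#   scaler = torch.cuda.amp.GradScaler(init_scale=2.0**14)
+#   trainer = AMPTrainer(model, data_loader, optimizer, grad_scaler=scaler)
+#   trainer.train(start_iter=0, max_iter=90000)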
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..306b4b1ab358312b1eee6697379acf7f7a4874af
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator
+from .coco_evaluation import COCOEvaluator
+from .rotated_coco_evaluation import RotatedCOCOEvaluator
+from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset
+from .lvis_evaluation import LVISEvaluator
+from .panoptic_evaluation import COCOPanopticEvaluator
+from .pascal_voc_evaluation import PascalVOCDetectionEvaluator
+from .sem_seg_evaluation import SemSegEvaluator
+from .testing import print_csv_format, verify_results
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/cityscapes_evaluation.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/cityscapes_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..869f12385ed0fb8ee10397b0581cb484be7f8cfc
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/cityscapes_evaluation.py
@@ -0,0 +1,197 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import glob
+import logging
+import numpy as np
+import os
+import tempfile
+from collections import OrderedDict
+import torch
+from PIL import Image
+
+from custom_detectron2.data import MetadataCatalog
+from custom_detectron2.utils import comm
+from custom_detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+
+class CityscapesEvaluator(DatasetEvaluator):
+ """
+ Base class for evaluation using cityscapes API.
+ """
+
+ def __init__(self, dataset_name):
+ """
+ Args:
+ dataset_name (str): the name of the dataset.
+ It must have the following metadata associated with it:
+ "thing_classes", "gt_dir".
+ """
+ self._metadata = MetadataCatalog.get(dataset_name)
+ self._cpu_device = torch.device("cpu")
+ self._logger = logging.getLogger(__name__)
+
+ def reset(self):
+ self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
+ self._temp_dir = self._working_dir.name
+ # All workers will write to the same results directory
+ # TODO this does not work in distributed training
+ assert (
+ comm.get_local_size() == comm.get_world_size()
+        ), "CityscapesEvaluator currently does not work with multiple machines."
+ self._temp_dir = comm.all_gather(self._temp_dir)[0]
+ if self._temp_dir != self._working_dir.name:
+ self._working_dir.cleanup()
+ self._logger.info(
+ "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir)
+ )
+
+
+class CityscapesInstanceEvaluator(CityscapesEvaluator):
+ """
+ Evaluate instance segmentation results on cityscapes dataset using cityscapes API.
+
+ Note:
+ * It does not work in multi-machine distributed training.
+ * It contains a synchronization, therefore has to be used on all ranks.
+ * Only the main process runs evaluation.
+ """
+
+ def process(self, inputs, outputs):
+ from cityscapesscripts.helpers.labels import name2label
+
+ for input, output in zip(inputs, outputs):
+ file_name = input["file_name"]
+ basename = os.path.splitext(os.path.basename(file_name))[0]
+ pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt")
+
+ if "instances" in output:
+ output = output["instances"].to(self._cpu_device)
+ num_instances = len(output)
+ with open(pred_txt, "w") as fout:
+ for i in range(num_instances):
+ pred_class = output.pred_classes[i]
+ classes = self._metadata.thing_classes[pred_class]
+ class_id = name2label[classes].id
+ score = output.scores[i]
+ mask = output.pred_masks[i].numpy().astype("uint8")
+ png_filename = os.path.join(
+ self._temp_dir, basename + "_{}_{}.png".format(i, classes)
+ )
+
+ Image.fromarray(mask * 255).save(png_filename)
+ fout.write(
+ "{} {} {}\n".format(os.path.basename(png_filename), class_id, score)
+ )
+ else:
+ # Cityscapes requires a prediction file for every ground truth image.
+ with open(pred_txt, "w") as fout:
+ pass
+
+ def evaluate(self):
+ """
+ Returns:
+ dict: has a key "segm", whose value is a dict of "AP" and "AP50".
+ """
+ comm.synchronize()
+ if comm.get_rank() > 0:
+ return
+ import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval
+
+ self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+
+ # set some global states in cityscapes evaluation API, before evaluating
+ cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
+ cityscapes_eval.args.predictionWalk = None
+ cityscapes_eval.args.JSONOutput = False
+ cityscapes_eval.args.colorized = False
+ cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")
+
+ # These lines are adopted from
+ # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
+ gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
+ groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png"))
+ assert len(
+ groundTruthImgList
+ ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+ cityscapes_eval.args.groundTruthSearch
+ )
+ predictionImgList = []
+ for gt in groundTruthImgList:
+ predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
+ results = cityscapes_eval.evaluateImgLists(
+ predictionImgList, groundTruthImgList, cityscapes_eval.args
+ )["averages"]
+
+ ret = OrderedDict()
+ ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
+ self._working_dir.cleanup()
+ return ret
+
+
+class CityscapesSemSegEvaluator(CityscapesEvaluator):
+ """
+ Evaluate semantic segmentation results on cityscapes dataset using cityscapes API.
+
+ Note:
+ * It does not work in multi-machine distributed training.
+ * It contains a synchronization, therefore has to be used on all ranks.
+ * Only the main process runs evaluation.
+ """
+
+ def process(self, inputs, outputs):
+ from cityscapesscripts.helpers.labels import trainId2label
+
+ for input, output in zip(inputs, outputs):
+ file_name = input["file_name"]
+ basename = os.path.splitext(os.path.basename(file_name))[0]
+ pred_filename = os.path.join(self._temp_dir, basename + "_pred.png")
+
+ output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy()
+ pred = 255 * np.ones(output.shape, dtype=np.uint8)
+ for train_id, label in trainId2label.items():
+ if label.ignoreInEval:
+ continue
+ pred[output == train_id] = label.id
+ Image.fromarray(pred).save(pred_filename)
+
+ def evaluate(self):
+ comm.synchronize()
+ if comm.get_rank() > 0:
+ return
+ # Load the Cityscapes eval script *after* setting the required env var,
+ # since the script reads CITYSCAPES_DATASET into global variables at load time.
+ import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval
+
+ self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+
+ # set some global states in cityscapes evaluation API, before evaluating
+ cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
+ cityscapes_eval.args.predictionWalk = None
+ cityscapes_eval.args.JSONOutput = False
+ cityscapes_eval.args.colorized = False
+
+ # These lines are adopted from
+ # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa
+ gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
+ groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png"))
+ assert len(
+ groundTruthImgList
+ ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+ cityscapes_eval.args.groundTruthSearch
+ )
+ predictionImgList = []
+ for gt in groundTruthImgList:
+ predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt))
+ results = cityscapes_eval.evaluateImgLists(
+ predictionImgList, groundTruthImgList, cityscapes_eval.args
+ )
+ ret = OrderedDict()
+ ret["sem_seg"] = {
+ "IoU": 100.0 * results["averageScoreClasses"],
+ "iIoU": 100.0 * results["averageScoreInstClasses"],
+ "IoU_sup": 100.0 * results["averageScoreCategories"],
+ "iIoU_sup": 100.0 * results["averageScoreInstCategories"],
+ }
+ self._working_dir.cleanup()
+ return ret
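+
+
+# Illustrative sketch (hypothetical ``model``/``val_loader``, and assuming the
+# dataset name below is registered in MetadataCatalog): because ``evaluate()``
+# begins with a synchronization, the evaluator must be driven on every rank,
+# e.g. through ``inference_on_dataset`` from ``.evaluator``.
+#
+#   evaluator = CityscapesSemSegEvaluator("cityscapes_fine_sem_seg_val")
+#   results = inference_on_dataset(model, val_loader, evaluator)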
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/coco_evaluation.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..92f6a44e485b9816e23a1b9094e3edf8d4e2e35a
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/coco_evaluation.py
@@ -0,0 +1,722 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import copy
+import io
+import itertools
+import json
+import logging
+import numpy as np
+import os
+import pickle
+from collections import OrderedDict
+import custom_pycocotools.mask as mask_util
+import torch
+from custom_pycocotools.coco import COCO
+from custom_pycocotools.cocoeval import COCOeval
+from tabulate import tabulate
+
+import custom_detectron2.utils.comm as comm
+from custom_detectron2.config import CfgNode
+from custom_detectron2.data import MetadataCatalog
+from custom_detectron2.data.datasets.coco import convert_to_coco_json
+from custom_detectron2.structures import Boxes, BoxMode, pairwise_iou
+from custom_detectron2.utils.file_io import PathManager
+from custom_detectron2.utils.logger import create_small_table
+
+from .evaluator import DatasetEvaluator
+
+try:
+ from custom_detectron2.evaluation.fast_eval_api import COCOeval_opt
+except ImportError:
+ COCOeval_opt = COCOeval
+
+
+class COCOEvaluator(DatasetEvaluator):
+ """
+ Evaluate AR for object proposals, AP for instance detection/segmentation, AP
+ for keypoint detection outputs using COCO's metrics.
+ See http://cocodataset.org/#detection-eval and
+ http://cocodataset.org/#keypoints-eval to understand its metrics.
+ The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
+ the metric cannot be computed (e.g. due to no predictions made).
+
+ In addition to COCO, this evaluator is able to support any bounding box detection,
+ instance segmentation, or keypoint detection dataset.
+ """
+
+ def __init__(
+ self,
+ dataset_name,
+ tasks=None,
+ distributed=True,
+ output_dir=None,
+ *,
+ max_dets_per_image=None,
+ use_fast_impl=True,
+ kpt_oks_sigmas=(),
+ allow_cached_coco=True,
+ ):
+ """
+ Args:
+ dataset_name (str): name of the dataset to be evaluated.
+ It must have either the following corresponding metadata:
+
+ "json_file": the path to the COCO format annotation
+
+ Or it must be in detectron2's standard dataset format
+ so it can be converted to COCO format automatically.
+ tasks (tuple[str]): tasks that can be evaluated under the given
+ configuration. A task is one of "bbox", "segm", "keypoints".
+ By default, will infer this automatically from predictions.
+            distributed (bool): if True, will collect results from all ranks and run evaluation
+ in the main process.
+ Otherwise, will only evaluate the results in the current process.
+ output_dir (str): optional, an output directory to dump all
+ results predicted on the dataset. The dump contains two files:
+
+ 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
+ contains all the results in the format they are produced by the model.
+ 2. "coco_instances_results.json" a json file in COCO's result format.
+ max_dets_per_image (int): limit on the maximum number of detections per image.
+                By default in COCO, this limit is 100, but this can be customized
+ to be greater, as is needed in evaluation metrics AP fixed and AP pool
+ (see https://arxiv.org/pdf/2102.01066.pdf)
+ This doesn't affect keypoint evaluation.
+ use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
+ Although the results should be very close to the official implementation in COCO
+ API, it is still recommended to compute results with the official API for use in
+ papers. The faster implementation also uses more RAM.
+ kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS.
+ See http://cocodataset.org/#keypoints-eval
+ When empty, it will use the defaults in COCO.
+ Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
+ allow_cached_coco (bool): Whether to use cached coco json from previous validation
+ runs. You should set this to False if you need to use different validation data.
+ Defaults to True.
+ """
+ self._logger = logging.getLogger(__name__)
+ self._distributed = distributed
+ self._output_dir = output_dir
+
+ if use_fast_impl and (COCOeval_opt is COCOeval):
+ self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.")
+ use_fast_impl = False
+ self._use_fast_impl = use_fast_impl
+
+ # COCOeval requires the limit on the number of detections per image (maxDets) to be a list
+ # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the
+ # 3rd element (100) is used as the limit on the number of detections per image when
+ # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval,
+ # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults.
+ if max_dets_per_image is None:
+ max_dets_per_image = [1, 10, 100]
+ else:
+ max_dets_per_image = [1, 10, max_dets_per_image]
+ self._max_dets_per_image = max_dets_per_image
+
+ if tasks is not None and isinstance(tasks, CfgNode):
+ kpt_oks_sigmas = (
+ tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas
+ )
+ self._logger.warn(
+ "COCO Evaluator instantiated using config, this is deprecated behavior."
+ " Please pass in explicit arguments instead."
+ )
+            self._tasks = None  # Inferring it from predictions should be better
+ else:
+ self._tasks = tasks
+
+ self._cpu_device = torch.device("cpu")
+
+ self._metadata = MetadataCatalog.get(dataset_name)
+ if not hasattr(self._metadata, "json_file"):
+ if output_dir is None:
+ raise ValueError(
+ "output_dir must be provided to COCOEvaluator "
+ "for datasets not in COCO format."
+ )
+ self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...")
+
+ cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
+ self._metadata.json_file = cache_path
+ convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco)
+
+ json_file = PathManager.get_local_path(self._metadata.json_file)
+ with contextlib.redirect_stdout(io.StringIO()):
+ self._coco_api = COCO(json_file)
+
+ # Test set json files do not contain annotations (evaluation must be
+ # performed using the COCO evaluation server).
+ self._do_evaluation = "annotations" in self._coco_api.dataset
+ if self._do_evaluation:
+ self._kpt_oks_sigmas = kpt_oks_sigmas
+
+ def reset(self):
+ self._predictions = []
+
+ def process(self, inputs, outputs):
+ """
+ Args:
+ inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+ It is a list of dict. Each dict corresponds to an image and
+ contains keys like "height", "width", "file_name", "image_id".
+ outputs: the outputs of a COCO model. It is a list of dicts with key
+ "instances" that contains :class:`Instances`.
+ """
+ for input, output in zip(inputs, outputs):
+ prediction = {"image_id": input["image_id"]}
+
+ if "instances" in output:
+ instances = output["instances"].to(self._cpu_device)
+ prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+ if "proposals" in output:
+ prediction["proposals"] = output["proposals"].to(self._cpu_device)
+ if len(prediction) > 1:
+ self._predictions.append(prediction)
+
+ def evaluate(self, img_ids=None):
+ """
+ Args:
+ img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
+ """
+ if self._distributed:
+ comm.synchronize()
+ predictions = comm.gather(self._predictions, dst=0)
+ predictions = list(itertools.chain(*predictions))
+
+ if not comm.is_main_process():
+ return {}
+ else:
+ predictions = self._predictions
+
+ if len(predictions) == 0:
+ self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
+ return {}
+
+ if self._output_dir:
+ PathManager.mkdirs(self._output_dir)
+ file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+ with PathManager.open(file_path, "wb") as f:
+ torch.save(predictions, f)
+
+ self._results = OrderedDict()
+ if "proposals" in predictions[0]:
+ self._eval_box_proposals(predictions)
+ if "instances" in predictions[0]:
+ self._eval_predictions(predictions, img_ids=img_ids)
+ # Copy so the caller can do whatever with results
+ return copy.deepcopy(self._results)
+
+ def _tasks_from_predictions(self, predictions):
+ """
+ Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions.
+ """
+ tasks = {"bbox"}
+ for pred in predictions:
+ if "segmentation" in pred:
+ tasks.add("segm")
+ if "keypoints" in pred:
+ tasks.add("keypoints")
+ return sorted(tasks)
+
+ def _eval_predictions(self, predictions, img_ids=None):
+ """
+ Evaluate predictions. Fill self._results with the metrics of the tasks.
+ """
+ self._logger.info("Preparing results for COCO format ...")
+ coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+ tasks = self._tasks or self._tasks_from_predictions(coco_results)
+
+ # unmap the category ids for COCO
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+ dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
+ all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
+ num_classes = len(all_contiguous_ids)
+ assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
+
+ reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
+ for result in coco_results:
+ category_id = result["category_id"]
+ assert category_id < num_classes, (
+ f"A prediction has class={category_id}, "
+ f"but the dataset only has {num_classes} classes and "
+ f"predicted class id should be in [0, {num_classes - 1}]."
+ )
+ result["category_id"] = reverse_id_mapping[category_id]
+
+ if self._output_dir:
+ file_path = os.path.join(self._output_dir, "coco_instances_results.json")
+ self._logger.info("Saving results to {}".format(file_path))
+ with PathManager.open(file_path, "w") as f:
+ f.write(json.dumps(coco_results))
+ f.flush()
+
+ if not self._do_evaluation:
+ self._logger.info("Annotations are not available for evaluation.")
+ return
+
+ self._logger.info(
+ "Evaluating predictions with {} COCO API...".format(
+ "unofficial" if self._use_fast_impl else "official"
+ )
+ )
+ for task in sorted(tasks):
+ assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
+ coco_eval = (
+ _evaluate_predictions_on_coco(
+ self._coco_api,
+ coco_results,
+ task,
+ kpt_oks_sigmas=self._kpt_oks_sigmas,
+ cocoeval_fn=COCOeval_opt if self._use_fast_impl else COCOeval,
+ img_ids=img_ids,
+ max_dets_per_image=self._max_dets_per_image,
+ )
+ if len(coco_results) > 0
+ else None # cocoapi does not handle empty results very well
+ )
+
+ res = self._derive_coco_results(
+ coco_eval, task, class_names=self._metadata.get("thing_classes")
+ )
+ self._results[task] = res
+
+ def _eval_box_proposals(self, predictions):
+ """
+ Evaluate the box proposals in predictions.
+ Fill self._results with the metrics for "box_proposals" task.
+ """
+ if self._output_dir:
+ # Saving generated box proposals to file.
+ # Predicted box_proposals are in XYXY_ABS mode.
+ bbox_mode = BoxMode.XYXY_ABS.value
+ ids, boxes, objectness_logits = [], [], []
+ for prediction in predictions:
+ ids.append(prediction["image_id"])
+ boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+ objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+ proposal_data = {
+ "boxes": boxes,
+ "objectness_logits": objectness_logits,
+ "ids": ids,
+ "bbox_mode": bbox_mode,
+ }
+ with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+ pickle.dump(proposal_data, f)
+
+ if not self._do_evaluation:
+ self._logger.info("Annotations are not available for evaluation.")
+ return
+
+ self._logger.info("Evaluating bbox proposals ...")
+ res = {}
+ areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+ for limit in [100, 1000]:
+ for area, suffix in areas.items():
+ stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
+ key = "AR{}@{:d}".format(suffix, limit)
+ res[key] = float(stats["ar"].item() * 100)
+ self._logger.info("Proposal metrics: \n" + create_small_table(res))
+ self._results["box_proposals"] = res
+
+ def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
+ """
+ Derive the desired score numbers from summarized COCOeval.
+
+ Args:
+ coco_eval (None or COCOEval): None represents no predictions from model.
+ iou_type (str):
+            class_names (None or list[str]): if provided, will use it to compute
+                per-category AP.
+
+ Returns:
+ a dict of {metric name: score}
+ """
+
+ metrics = {
+ "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+ "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+ "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
+ }[iou_type]
+
+ if coco_eval is None:
+ self._logger.warn("No predictions from the model!")
+ return {metric: float("nan") for metric in metrics}
+
+ # the standard metrics
+ results = {
+ metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
+ for idx, metric in enumerate(metrics)
+ }
+ self._logger.info(
+ "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
+ )
+ if not np.isfinite(sum(results.values())):
+            self._logger.info("Some metrics cannot be computed and are shown as NaN.")
+
+ if class_names is None or len(class_names) <= 1:
+ return results
+ # Compute per-category AP
+ # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
+ precisions = coco_eval.eval["precision"]
+ # precision has dims (iou, recall, cls, area range, max dets)
+ assert len(class_names) == precisions.shape[2]
+
+ results_per_category = []
+ for idx, name in enumerate(class_names):
+ # area range index 0: all area ranges
+ # max dets index -1: typically 100 per image
+ precision = precisions[:, :, idx, 0, -1]
+ precision = precision[precision > -1]
+ ap = np.mean(precision) if precision.size else float("nan")
+ results_per_category.append(("{}".format(name), float(ap * 100)))
+
+ # tabulate it
+ N_COLS = min(6, len(results_per_category) * 2)
+ results_flatten = list(itertools.chain(*results_per_category))
+ results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
+ table = tabulate(
+ results_2d,
+ tablefmt="pipe",
+ floatfmt=".3f",
+ headers=["category", "AP"] * (N_COLS // 2),
+ numalign="left",
+ )
+ self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
+
+ results.update({"AP-" + name: ap for name, ap in results_per_category})
+ return results
+
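+# Illustrative sketch (hypothetical ``model``/``val_loader``, and assuming the
+# dataset name is registered in MetadataCatalog): pair the evaluator with
+# ``inference_on_dataset`` from ``.evaluator``.
+#
+#   evaluator = COCOEvaluator("coco_2017_val", output_dir="./output")
+#   results = inference_on_dataset(model, val_loader, evaluator)
+#   print(results["bbox"]["AP"])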
+
+def instances_to_coco_json(instances, img_id):
+ """
+ Dump an "Instances" object to a COCO-format json that's used for evaluation.
+
+ Args:
+ instances (Instances):
+ img_id (int): the image id
+
+ Returns:
+ list[dict]: list of json annotations in COCO format.
+ """
+ num_instance = len(instances)
+ if num_instance == 0:
+ return []
+
+ boxes = instances.pred_boxes.tensor.numpy()
+ boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ boxes = boxes.tolist()
+ scores = instances.scores.tolist()
+ classes = instances.pred_classes.tolist()
+
+ has_mask = instances.has("pred_masks")
+ if has_mask:
+        # use RLE to encode the masks, because they are too large and take too much memory
+ # since this evaluator stores outputs of the entire dataset
+ rles = [
+ mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
+ for mask in instances.pred_masks
+ ]
+ for rle in rles:
+ # "counts" is an array encoded by mask_util as a byte-stream. Python3's
+ # json writer which always produces strings cannot serialize a bytestream
+ # unless you decode it. Thankfully, utf-8 works out (which is also what
+ # the custom_pycocotools/_mask.pyx does).
+ rle["counts"] = rle["counts"].decode("utf-8")
+
+ has_keypoints = instances.has("pred_keypoints")
+ if has_keypoints:
+ keypoints = instances.pred_keypoints
+
+ results = []
+ for k in range(num_instance):
+ result = {
+ "image_id": img_id,
+ "category_id": classes[k],
+ "bbox": boxes[k],
+ "score": scores[k],
+ }
+ if has_mask:
+ result["segmentation"] = rles[k]
+ if has_keypoints:
+ # In COCO annotations,
+ # keypoints coordinates are pixel indices.
+ # However our predictions are floating point coordinates.
+ # Therefore we subtract 0.5 to be consistent with the annotation format.
+ # This is the inverse of data loading logic in `datasets/coco.py`.
+ keypoints[k][:, :2] -= 0.5
+ result["keypoints"] = keypoints[k].flatten().tolist()
+ results.append(result)
+ return results
+
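+# For reference, each returned record is a plain dict in COCO "results" format,
+# roughly (values below are illustrative only):
+#
+#   {"image_id": 42, "category_id": 17, "bbox": [x, y, w, h], "score": 0.98,
+#    "segmentation": {"size": [h, w], "counts": "..."}}  # mask present only if predicted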
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
+ """
+ Evaluate detection proposal recall metrics. This function is a much
+ faster alternative to the official COCO API recall evaluation code. However,
+ it produces slightly different results.
+ """
+ # Record max overlap value for each gt box
+ # Return vector of overlap values
+ areas = {
+ "all": 0,
+ "small": 1,
+ "medium": 2,
+ "large": 3,
+ "96-128": 4,
+ "128-256": 5,
+ "256-512": 6,
+ "512-inf": 7,
+ }
+ area_ranges = [
+ [0**2, 1e5**2], # all
+ [0**2, 32**2], # small
+ [32**2, 96**2], # medium
+ [96**2, 1e5**2], # large
+ [96**2, 128**2], # 96-128
+ [128**2, 256**2], # 128-256
+ [256**2, 512**2], # 256-512
+ [512**2, 1e5**2],
+ ] # 512-inf
+ assert area in areas, "Unknown area range: {}".format(area)
+ area_range = area_ranges[areas[area]]
+ gt_overlaps = []
+ num_pos = 0
+
+ for prediction_dict in dataset_predictions:
+ predictions = prediction_dict["proposals"]
+
+ # sort predictions in descending order
+ # TODO maybe remove this and make it explicit in the documentation
+ inds = predictions.objectness_logits.sort(descending=True)[1]
+ predictions = predictions[inds]
+
+ ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
+ anno = coco_api.loadAnns(ann_ids)
+ gt_boxes = [
+ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+ for obj in anno
+ if obj["iscrowd"] == 0
+ ]
+ gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes
+ gt_boxes = Boxes(gt_boxes)
+ gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
+
+ if len(gt_boxes) == 0 or len(predictions) == 0:
+ continue
+
+ valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+ gt_boxes = gt_boxes[valid_gt_inds]
+
+ num_pos += len(gt_boxes)
+
+ if len(gt_boxes) == 0:
+ continue
+
+ if limit is not None and len(predictions) > limit:
+ predictions = predictions[:limit]
+
+ overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+ _gt_overlaps = torch.zeros(len(gt_boxes))
+ for j in range(min(len(predictions), len(gt_boxes))):
+ # find which proposal box maximally covers each gt box
+ # and get the iou amount of coverage for each gt box
+ max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+ # find which gt box is 'best' covered (i.e. 'best' = most iou)
+ gt_ovr, gt_ind = max_overlaps.max(dim=0)
+ assert gt_ovr >= 0
+ # find the proposal box that covers the best covered gt box
+ box_ind = argmax_overlaps[gt_ind]
+ # record the iou coverage of this gt box
+ _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+ assert _gt_overlaps[j] == gt_ovr
+ # mark the proposal box and the gt box as used
+ overlaps[box_ind, :] = -1
+ overlaps[:, gt_ind] = -1
+
+ # append recorded iou coverage level
+ gt_overlaps.append(_gt_overlaps)
+ gt_overlaps = (
+ torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
+ )
+ gt_overlaps, _ = torch.sort(gt_overlaps)
+
+ if thresholds is None:
+ step = 0.05
+ thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+ recalls = torch.zeros_like(thresholds)
+ # compute recall for each iou threshold
+ for i, t in enumerate(thresholds):
+ recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+ # ar = 2 * np.trapz(recalls, thresholds)
+ ar = recalls.mean()
+ return {
+ "ar": ar,
+ "recalls": recalls,
+ "thresholds": thresholds,
+ "gt_overlaps": gt_overlaps,
+ "num_pos": num_pos,
+ }
+
+
+def _evaluate_predictions_on_coco(
+ coco_gt,
+ coco_results,
+ iou_type,
+ kpt_oks_sigmas=None,
+ cocoeval_fn=COCOeval_opt,
+ img_ids=None,
+ max_dets_per_image=None,
+):
+ """
+ Evaluate the coco results using COCOEval API.
+ """
+ assert len(coco_results) > 0
+
+ if iou_type == "segm":
+ coco_results = copy.deepcopy(coco_results)
+ # When evaluating mask AP, if the results contain bbox, cocoapi will
+ # use the box area as the area of the instance, instead of the mask area.
+ # This leads to a different definition of small/medium/large.
+ # We remove the bbox field to let mask AP use mask area.
+ for c in coco_results:
+ c.pop("bbox", None)
+
+ coco_dt = coco_gt.loadRes(coco_results)
+ coco_eval = cocoeval_fn(coco_gt, coco_dt, iou_type)
+ # For COCO, the default max_dets_per_image is [1, 10, 100].
+ if max_dets_per_image is None:
+ max_dets_per_image = [1, 10, 100] # Default from COCOEval
+ else:
+ assert (
+ len(max_dets_per_image) >= 3
+ ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3"
+ # In the case that user supplies a custom input for max_dets_per_image,
+ # apply COCOevalMaxDets to evaluate AP with the custom input.
+ if max_dets_per_image[2] != 100:
+ coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type)
+ if iou_type != "keypoints":
+ coco_eval.params.maxDets = max_dets_per_image
+
+ if img_ids is not None:
+ coco_eval.params.imgIds = img_ids
+
+ if iou_type == "keypoints":
+ # Use the COCO default keypoint OKS sigmas unless overrides are specified
+ if kpt_oks_sigmas:
+ assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "custom_pycocotools is too old!"
+ coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
+ # COCOAPI requires every detection and every gt to have keypoints, so
+ # we just take the first entry from both
+ num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3
+ num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3
+ num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas)
+ assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, (
+ f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. "
+ f"Ground truth contains {num_keypoints_gt} keypoints. "
+ f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. "
+ "They have to agree with each other. For meaning of OKS, please refer to "
+ "http://cocodataset.org/#keypoints-eval."
+ )
+
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
+
+ return coco_eval
+
+
+class COCOevalMaxDets(COCOeval):
+ """
+ Modified version of COCOeval for evaluating AP with a custom
+ maxDets (by default for COCO, maxDets is 100)
+ """
+
+ def summarize(self):
+ """
+ Compute and display summary metrics for evaluation results given
+ a custom value for max_dets_per_image
+ """
+
+ def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
+ p = self.params
+ iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
+ titleStr = "Average Precision" if ap == 1 else "Average Recall"
+ typeStr = "(AP)" if ap == 1 else "(AR)"
+ iouStr = (
+ "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
+ if iouThr is None
+ else "{:0.2f}".format(iouThr)
+ )
+
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+ if ap == 1:
+ # dimension of precision: [TxRxKxAxM]
+ s = self.eval["precision"]
+ # IoU
+ if iouThr is not None:
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+ s = s[:, :, :, aind, mind]
+ else:
+ # dimension of recall: [TxKxAxM]
+ s = self.eval["recall"]
+ if iouThr is not None:
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+ s = s[:, :, aind, mind]
+ if len(s[s > -1]) == 0:
+ mean_s = -1
+ else:
+ mean_s = np.mean(s[s > -1])
+ print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
+ return mean_s
+
+ def _summarizeDets():
+ stats = np.zeros((12,))
+ # Evaluate AP using the custom limit on maximum detections per image
+ stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
+ stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
+ stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
+ stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
+ stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
+ stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
+ stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
+ stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
+ stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
+ stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
+ stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
+ stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
+ return stats
+
+ def _summarizeKps():
+ stats = np.zeros((10,))
+ stats[0] = _summarize(1, maxDets=20)
+ stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
+ stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
+ stats[3] = _summarize(1, maxDets=20, areaRng="medium")
+ stats[4] = _summarize(1, maxDets=20, areaRng="large")
+ stats[5] = _summarize(0, maxDets=20)
+ stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
+ stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
+ stats[8] = _summarize(0, maxDets=20, areaRng="medium")
+ stats[9] = _summarize(0, maxDets=20, areaRng="large")
+ return stats
+
+ if not self.eval:
+ raise Exception("Please run accumulate() first")
+ iouType = self.params.iouType
+ if iouType == "segm" or iouType == "bbox":
+ summarize = _summarizeDets
+ elif iouType == "keypoints":
+ summarize = _summarizeKps
+ self.stats = summarize()
+
+ def __str__(self):
+ self.summarize()
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/evaluator.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad67f0ea00623ad27d4e319ed9fc641e91c26fc0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/evaluator.py
@@ -0,0 +1,224 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import datetime
+import logging
+import time
+from collections import OrderedDict, abc
+from contextlib import ExitStack, contextmanager
+from typing import List, Union
+import torch
+from torch import nn
+
+from custom_detectron2.utils.comm import get_world_size, is_main_process
+from custom_detectron2.utils.logger import log_every_n_seconds
+
+
+class DatasetEvaluator:
+ """
+ Base class for a dataset evaluator.
+
+ The function :func:`inference_on_dataset` runs the model over
+    all samples in the dataset, and uses a DatasetEvaluator to process the inputs/outputs.
+
+ This class will accumulate information of the inputs/outputs (by :meth:`process`),
+ and produce evaluation results in the end (by :meth:`evaluate`).
+ """
+
+ def reset(self):
+ """
+ Preparation for a new round of evaluation.
+ Should be called before starting a round of evaluation.
+ """
+ pass
+
+ def process(self, inputs, outputs):
+ """
+ Process the pair of inputs and outputs.
+ If they contain batches, the pairs can be consumed one-by-one using `zip`:
+
+ .. code-block:: python
+
+ for input_, output in zip(inputs, outputs):
+ # do evaluation on single input/output pair
+ ...
+
+ Args:
+            inputs (list): the inputs that are used to call the model.
+ outputs (list): the return value of `model(inputs)`
+ """
+ pass
+
+ def evaluate(self):
+ """
+ Evaluate/summarize the performance, after processing all input/output pairs.
+
+ Returns:
+ dict:
+ A new evaluator class can return a dict of arbitrary format
+ as long as the user can process the results.
+ In our train_net.py, we expect the following format:
+
+ * key: the name of the task (e.g., bbox)
+ * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
+ """
+ pass
+
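+# Illustrative sketch of the protocol above (hypothetical evaluator, not part
+# of detectron2): count predicted instances across the dataset.
+#
+#   class InstanceCounter(DatasetEvaluator):
+#       def reset(self):
+#           self._count = 0
+#
+#       def process(self, inputs, outputs):
+#           for output in outputs:
+#               self._count += len(output["instances"])
+#
+#       def evaluate(self):
+#           return {"counting": {"num_instances": self._count}}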
+
+class DatasetEvaluators(DatasetEvaluator):
+ """
+ Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
+
+ This class dispatches every evaluation call to
+ all of its :class:`DatasetEvaluator`.
+ """
+
+ def __init__(self, evaluators):
+ """
+ Args:
+ evaluators (list): the evaluators to combine.
+ """
+ super().__init__()
+ self._evaluators = evaluators
+
+ def reset(self):
+ for evaluator in self._evaluators:
+ evaluator.reset()
+
+ def process(self, inputs, outputs):
+ for evaluator in self._evaluators:
+ evaluator.process(inputs, outputs)
+
+ def evaluate(self):
+ results = OrderedDict()
+ for evaluator in self._evaluators:
+ result = evaluator.evaluate()
+ if is_main_process() and result is not None:
+ for k, v in result.items():
+ assert (
+ k not in results
+ ), "Different evaluators produce results with the same key {}".format(k)
+ results[k] = v
+ return results
+
+
+def inference_on_dataset(
+ model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None]
+):
+ """
+ Run model on the data_loader and evaluate the metrics with evaluator.
+ Also benchmark the inference speed of `model.__call__` accurately.
+ The model will be used in eval mode.
+
+ Args:
+ model (callable): a callable which takes an object from
+ `data_loader` and returns some outputs.
+
+ If it's an nn.Module, it will be temporarily set to `eval` mode.
+ If you wish to evaluate a model in `training` mode instead, you can
+ wrap the given model and override its behavior of `.eval()` and `.train()`.
+ data_loader: an iterable object with a length.
+ The elements it generates will be the inputs to the model.
+ evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark,
+ but don't want to do any evaluation.
+
+ Returns:
+ The return value of `evaluator.evaluate()`
+ """
+ num_devices = get_world_size()
+ logger = logging.getLogger(__name__)
+ logger.info("Start inference on {} batches".format(len(data_loader)))
+
+ total = len(data_loader) # inference data loader must have a fixed length
+ if evaluator is None:
+ # create a no-op evaluator
+ evaluator = DatasetEvaluators([])
+ if isinstance(evaluator, abc.MutableSequence):
+ evaluator = DatasetEvaluators(evaluator)
+ evaluator.reset()
+
+ num_warmup = min(5, total - 1)
+ start_time = time.perf_counter()
+ total_data_time = 0
+ total_compute_time = 0
+ total_eval_time = 0
+ with ExitStack() as stack:
+ if isinstance(model, nn.Module):
+ stack.enter_context(inference_context(model))
+ stack.enter_context(torch.no_grad())
+
+ start_data_time = time.perf_counter()
+ for idx, inputs in enumerate(data_loader):
+ total_data_time += time.perf_counter() - start_data_time
+ if idx == num_warmup:
+ start_time = time.perf_counter()
+ total_data_time = 0
+ total_compute_time = 0
+ total_eval_time = 0
+
+ start_compute_time = time.perf_counter()
+ outputs = model(inputs)
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ total_compute_time += time.perf_counter() - start_compute_time
+
+ start_eval_time = time.perf_counter()
+ evaluator.process(inputs, outputs)
+ total_eval_time += time.perf_counter() - start_eval_time
+
+ iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
+ data_seconds_per_iter = total_data_time / iters_after_start
+ compute_seconds_per_iter = total_compute_time / iters_after_start
+ eval_seconds_per_iter = total_eval_time / iters_after_start
+ total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
+ if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
+ eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
+ log_every_n_seconds(
+ logging.INFO,
+ (
+ f"Inference done {idx + 1}/{total}. "
+ f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
+ f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
+ f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
+ f"Total: {total_seconds_per_iter:.4f} s/iter. "
+ f"ETA={eta}"
+ ),
+ n=5,
+ )
+ start_data_time = time.perf_counter()
+
+ # Measure the time only for this worker (before the synchronization barrier)
+ total_time = time.perf_counter() - start_time
+ total_time_str = str(datetime.timedelta(seconds=total_time))
+ # NOTE this format is parsed by grep
+ logger.info(
+ "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
+ total_time_str, total_time / (total - num_warmup), num_devices
+ )
+ )
+ total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
+ logger.info(
+ "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
+ total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
+ )
+ )
+
+ results = evaluator.evaluate()
+ # An evaluator may return None when not in main process.
+ # Replace it by an empty dict instead to make it easier for downstream code to handle
+ if results is None:
+ results = {}
+ return results
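+
+# Illustrative usage of inference_on_dataset (a minimal sketch; `model`, `test_loader` and
+# `MyEvaluator` are hypothetical stand-ins for user-provided objects):
+#   results = inference_on_dataset(model, test_loader, MyEvaluator())
+#   timings_only = inference_on_dataset(model, test_loader, None)  # benchmark without evaluation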
+
+
+@contextmanager
+def inference_context(model):
+ """
+ A context where the model is temporarily changed to eval mode,
+ and restored to previous mode afterwards.
+
+ Args:
+ model: a torch Module
+ """
+ training_mode = model.training
+ model.eval()
+ yield
+ model.train(training_mode)
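+
+
+# Illustrative usage of inference_context (a minimal sketch; `model` and `batch` are hypothetical):
+#   with inference_context(model), torch.no_grad():
+#       outputs = model(batch)  # model is temporarily in eval mode here
+#   # the model's previous training/eval mode is restored on exit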
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/fast_eval_api.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/fast_eval_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..df49c449cee1eafb3107583e919423a3f7ed42ba
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/fast_eval_api.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import numpy as np
+import time
+from custom_pycocotools.cocoeval import COCOeval
+
+from custom_detectron2 import _C
+
+logger = logging.getLogger(__name__)
+
+
+class COCOeval_opt(COCOeval):
+ """
+ This is a slightly modified version of the original COCO API, where the functions evaluateImg()
+    and accumulate() are implemented in C++ to speed up evaluation
+ """
+
+ def evaluate(self):
+ """
+        Run per-image evaluation on the given images and store the results in self.evalImgs_cpp, a
+        data structure that isn't readable from Python but is used by a C++ implementation of
+        accumulate(). Unlike the original COCO PythonAPI, we don't populate the data structure
+        self.evalImgs because this data structure is a computational bottleneck.
+ :return: None
+ """
+ tic = time.time()
+
+ p = self.params
+ # add backward compatibility if useSegm is specified in params
+ if p.useSegm is not None:
+ p.iouType = "segm" if p.useSegm == 1 else "bbox"
+ logger.info("Evaluate annotation type *{}*".format(p.iouType))
+ p.imgIds = list(np.unique(p.imgIds))
+ if p.useCats:
+ p.catIds = list(np.unique(p.catIds))
+ p.maxDets = sorted(p.maxDets)
+ self.params = p
+
+ self._prepare() # bottleneck
+
+ # loop through images, area range, max detection number
+ catIds = p.catIds if p.useCats else [-1]
+
+ if p.iouType == "segm" or p.iouType == "bbox":
+ computeIoU = self.computeIoU
+ elif p.iouType == "keypoints":
+ computeIoU = self.computeOks
+ self.ious = {
+ (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
+ } # bottleneck
+
+ maxDet = p.maxDets[-1]
+
+ # <<<< Beginning of code differences with original COCO API
+ def convert_instances_to_cpp(instances, is_det=False):
+ # Convert annotations for a list of instances in an image to a format that's fast
+ # to access in C++
+ instances_cpp = []
+ for instance in instances:
+ instance_cpp = _C.InstanceAnnotation(
+ int(instance["id"]),
+ instance["score"] if is_det else instance.get("score", 0.0),
+ instance["area"],
+ bool(instance.get("iscrowd", 0)),
+ bool(instance.get("ignore", 0)),
+ )
+ instances_cpp.append(instance_cpp)
+ return instances_cpp
+
+ # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
+ ground_truth_instances = [
+ [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
+ for imgId in p.imgIds
+ ]
+ detected_instances = [
+ [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds]
+ for imgId in p.imgIds
+ ]
+ ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
+
+ if not p.useCats:
+ # For each image, flatten per-category lists into a single list
+ ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances]
+ detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
+
+ # Call C++ implementation of self.evaluateImgs()
+ self._evalImgs_cpp = _C.COCOevalEvaluateImages(
+ p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
+ )
+ self._evalImgs = None
+
+ self._paramsEval = copy.deepcopy(self.params)
+ toc = time.time()
+ logger.info("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
+ # >>>> End of code differences with original COCO API
+
+ def accumulate(self):
+ """
+ Accumulate per image evaluation results and store the result in self.eval. Does not
+ support changing parameter settings from those used by self.evaluate()
+ """
+ logger.info("Accumulating evaluation results...")
+ tic = time.time()
+ assert hasattr(
+ self, "_evalImgs_cpp"
+ ), "evaluate() must be called before accmulate() is called."
+
+ self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
+
+ # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
+ self.eval["recall"] = np.array(self.eval["recall"]).reshape(
+ self.eval["counts"][:1] + self.eval["counts"][2:]
+ )
+
+ # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
+ # num_area_ranges X num_max_detections
+ self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"])
+ self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
+ toc = time.time()
+ logger.info("COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic))
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/lvis_evaluation.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/lvis_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..10707e074e1ccd51fbc9128c9d25af05273eba61
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/lvis_evaluation.py
@@ -0,0 +1,380 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import itertools
+import json
+import logging
+import os
+import pickle
+from collections import OrderedDict
+import torch
+
+import custom_detectron2.utils.comm as comm
+from custom_detectron2.config import CfgNode
+from custom_detectron2.data import MetadataCatalog
+from custom_detectron2.structures import Boxes, BoxMode, pairwise_iou
+from custom_detectron2.utils.file_io import PathManager
+from custom_detectron2.utils.logger import create_small_table
+
+from .coco_evaluation import instances_to_coco_json
+from .evaluator import DatasetEvaluator
+
+
+class LVISEvaluator(DatasetEvaluator):
+ """
+ Evaluate object proposal and instance detection/segmentation outputs using
+ LVIS's metrics and evaluation API.
+ """
+
+ def __init__(
+ self,
+ dataset_name,
+ tasks=None,
+ distributed=True,
+ output_dir=None,
+ *,
+ max_dets_per_image=None,
+ ):
+ """
+ Args:
+ dataset_name (str): name of the dataset to be evaluated.
+ It must have the following corresponding metadata:
+ "json_file": the path to the LVIS format annotation
+ tasks (tuple[str]): tasks that can be evaluated under the given
+ configuration. A task is one of "bbox", "segm".
+ By default, will infer this automatically from predictions.
+            distributed (bool): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            output_dir (str): optional, an output directory to dump results.
+            max_dets_per_image (None or int): limit on the maximum number of detections per image
+                when evaluating AP. The default for the LVIS dataset is 300.
+ """
+ from lvis import LVIS
+
+ self._logger = logging.getLogger(__name__)
+
+ if tasks is not None and isinstance(tasks, CfgNode):
+ self._logger.warn(
+ "COCO Evaluator instantiated using config, this is deprecated behavior."
+ " Please pass in explicit arguments instead."
+ )
+            self._tasks = None  # Inferring it from predictions should be better
+ else:
+ self._tasks = tasks
+
+ self._distributed = distributed
+ self._output_dir = output_dir
+ self._max_dets_per_image = max_dets_per_image
+
+ self._cpu_device = torch.device("cpu")
+
+ self._metadata = MetadataCatalog.get(dataset_name)
+ json_file = PathManager.get_local_path(self._metadata.json_file)
+ self._lvis_api = LVIS(json_file)
+ # Test set json files do not contain annotations (evaluation must be
+ # performed using the LVIS evaluation server).
+ self._do_evaluation = len(self._lvis_api.get_ann_ids()) > 0
+
+ def reset(self):
+ self._predictions = []
+
+ def process(self, inputs, outputs):
+ """
+ Args:
+ inputs: the inputs to a LVIS model (e.g., GeneralizedRCNN).
+ It is a list of dict. Each dict corresponds to an image and
+ contains keys like "height", "width", "file_name", "image_id".
+ outputs: the outputs of a LVIS model. It is a list of dicts with key
+ "instances" that contains :class:`Instances`.
+ """
+ for input, output in zip(inputs, outputs):
+ prediction = {"image_id": input["image_id"]}
+
+ if "instances" in output:
+ instances = output["instances"].to(self._cpu_device)
+ prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+ if "proposals" in output:
+ prediction["proposals"] = output["proposals"].to(self._cpu_device)
+ self._predictions.append(prediction)
+
+ def evaluate(self):
+ if self._distributed:
+ comm.synchronize()
+ predictions = comm.gather(self._predictions, dst=0)
+ predictions = list(itertools.chain(*predictions))
+
+ if not comm.is_main_process():
+ return
+ else:
+ predictions = self._predictions
+
+ if len(predictions) == 0:
+ self._logger.warning("[LVISEvaluator] Did not receive valid predictions.")
+ return {}
+
+ if self._output_dir:
+ PathManager.mkdirs(self._output_dir)
+ file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+ with PathManager.open(file_path, "wb") as f:
+ torch.save(predictions, f)
+
+ self._results = OrderedDict()
+ if "proposals" in predictions[0]:
+ self._eval_box_proposals(predictions)
+ if "instances" in predictions[0]:
+ self._eval_predictions(predictions)
+ # Copy so the caller can do whatever with results
+ return copy.deepcopy(self._results)
+
+ def _tasks_from_predictions(self, predictions):
+ for pred in predictions:
+ if "segmentation" in pred:
+ return ("bbox", "segm")
+ return ("bbox",)
+
+ def _eval_predictions(self, predictions):
+ """
+ Evaluate predictions. Fill self._results with the metrics of the tasks.
+
+ Args:
+ predictions (list[dict]): list of outputs from the model
+ """
+ self._logger.info("Preparing results in the LVIS format ...")
+ lvis_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+ tasks = self._tasks or self._tasks_from_predictions(lvis_results)
+
+        # The LVIS evaluator can also be used to evaluate results for COCO dataset categories.
+        # In this case, the `_metadata` variable will have a field with a COCO-specific category mapping.
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+ reverse_id_mapping = {
+ v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+ }
+ for result in lvis_results:
+ result["category_id"] = reverse_id_mapping[result["category_id"]]
+ else:
+ # unmap the category ids for LVIS (from 0-indexed to 1-indexed)
+ for result in lvis_results:
+ result["category_id"] += 1
+
+ if self._output_dir:
+ file_path = os.path.join(self._output_dir, "lvis_instances_results.json")
+ self._logger.info("Saving results to {}".format(file_path))
+ with PathManager.open(file_path, "w") as f:
+ f.write(json.dumps(lvis_results))
+ f.flush()
+
+ if not self._do_evaluation:
+ self._logger.info("Annotations are not available for evaluation.")
+ return
+
+ self._logger.info("Evaluating predictions ...")
+ for task in sorted(tasks):
+ res = _evaluate_predictions_on_lvis(
+ self._lvis_api,
+ lvis_results,
+ task,
+ max_dets_per_image=self._max_dets_per_image,
+ class_names=self._metadata.get("thing_classes"),
+ )
+ self._results[task] = res
+
+ def _eval_box_proposals(self, predictions):
+ """
+ Evaluate the box proposals in predictions.
+ Fill self._results with the metrics for "box_proposals" task.
+ """
+ if self._output_dir:
+ # Saving generated box proposals to file.
+ # Predicted box_proposals are in XYXY_ABS mode.
+ bbox_mode = BoxMode.XYXY_ABS.value
+ ids, boxes, objectness_logits = [], [], []
+ for prediction in predictions:
+ ids.append(prediction["image_id"])
+ boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+ objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+ proposal_data = {
+ "boxes": boxes,
+ "objectness_logits": objectness_logits,
+ "ids": ids,
+ "bbox_mode": bbox_mode,
+ }
+ with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+ pickle.dump(proposal_data, f)
+
+ if not self._do_evaluation:
+ self._logger.info("Annotations are not available for evaluation.")
+ return
+
+ self._logger.info("Evaluating bbox proposals ...")
+ res = {}
+ areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+ for limit in [100, 1000]:
+ for area, suffix in areas.items():
+ stats = _evaluate_box_proposals(predictions, self._lvis_api, area=area, limit=limit)
+ key = "AR{}@{:d}".format(suffix, limit)
+ res[key] = float(stats["ar"].item() * 100)
+ self._logger.info("Proposal metrics: \n" + create_small_table(res))
+ self._results["box_proposals"] = res
+
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None):
+ """
+ Evaluate detection proposal recall metrics. This function is a much
+ faster alternative to the official LVIS API recall evaluation code. However,
+ it produces slightly different results.
+ """
+ # Record max overlap value for each gt box
+ # Return vector of overlap values
+ areas = {
+ "all": 0,
+ "small": 1,
+ "medium": 2,
+ "large": 3,
+ "96-128": 4,
+ "128-256": 5,
+ "256-512": 6,
+ "512-inf": 7,
+ }
+ area_ranges = [
+ [0**2, 1e5**2], # all
+ [0**2, 32**2], # small
+ [32**2, 96**2], # medium
+ [96**2, 1e5**2], # large
+ [96**2, 128**2], # 96-128
+ [128**2, 256**2], # 128-256
+ [256**2, 512**2], # 256-512
+ [512**2, 1e5**2],
+ ] # 512-inf
+ assert area in areas, "Unknown area range: {}".format(area)
+ area_range = area_ranges[areas[area]]
+ gt_overlaps = []
+ num_pos = 0
+
+ for prediction_dict in dataset_predictions:
+ predictions = prediction_dict["proposals"]
+
+ # sort predictions in descending order
+ # TODO maybe remove this and make it explicit in the documentation
+ inds = predictions.objectness_logits.sort(descending=True)[1]
+ predictions = predictions[inds]
+
+ ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]])
+ anno = lvis_api.load_anns(ann_ids)
+ gt_boxes = [
+ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno
+ ]
+ gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes
+ gt_boxes = Boxes(gt_boxes)
+ gt_areas = torch.as_tensor([obj["area"] for obj in anno])
+
+ if len(gt_boxes) == 0 or len(predictions) == 0:
+ continue
+
+ valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+ gt_boxes = gt_boxes[valid_gt_inds]
+
+ num_pos += len(gt_boxes)
+
+ if len(gt_boxes) == 0:
+ continue
+
+ if limit is not None and len(predictions) > limit:
+ predictions = predictions[:limit]
+
+ overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+ _gt_overlaps = torch.zeros(len(gt_boxes))
+ for j in range(min(len(predictions), len(gt_boxes))):
+ # find which proposal box maximally covers each gt box
+ # and get the iou amount of coverage for each gt box
+ max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+ # find which gt box is 'best' covered (i.e. 'best' = most iou)
+ gt_ovr, gt_ind = max_overlaps.max(dim=0)
+ assert gt_ovr >= 0
+ # find the proposal box that covers the best covered gt box
+ box_ind = argmax_overlaps[gt_ind]
+ # record the iou coverage of this gt box
+ _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+ assert _gt_overlaps[j] == gt_ovr
+ # mark the proposal box and the gt box as used
+ overlaps[box_ind, :] = -1
+ overlaps[:, gt_ind] = -1
+
+ # append recorded iou coverage level
+ gt_overlaps.append(_gt_overlaps)
+ gt_overlaps = (
+ torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
+ )
+ gt_overlaps, _ = torch.sort(gt_overlaps)
+
+ if thresholds is None:
+ step = 0.05
+ thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+ recalls = torch.zeros_like(thresholds)
+ # compute recall for each iou threshold
+ for i, t in enumerate(thresholds):
+ recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+ # ar = 2 * np.trapz(recalls, thresholds)
+ ar = recalls.mean()
+ return {
+ "ar": ar,
+ "recalls": recalls,
+ "thresholds": thresholds,
+ "gt_overlaps": gt_overlaps,
+ "num_pos": num_pos,
+ }
+
+
+def _evaluate_predictions_on_lvis(
+ lvis_gt, lvis_results, iou_type, max_dets_per_image=None, class_names=None
+):
+ """
+ Args:
+ iou_type (str):
+        max_dets_per_image (None or int): limit on the maximum number of detections per image
+            when evaluating AP. The default for the LVIS dataset is 300.
+ class_names (None or list[str]): if provided, will use it to predict
+ per-category AP.
+
+ Returns:
+ a dict of {metric name: score}
+ """
+ metrics = {
+ "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
+ "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
+ }[iou_type]
+
+ logger = logging.getLogger(__name__)
+
+ if len(lvis_results) == 0: # TODO: check if needed
+ logger.warn("No predictions from the model!")
+ return {metric: float("nan") for metric in metrics}
+
+ if iou_type == "segm":
+ lvis_results = copy.deepcopy(lvis_results)
+ # When evaluating mask AP, if the results contain bbox, LVIS API will
+ # use the box area as the area of the instance, instead of the mask area.
+ # This leads to a different definition of small/medium/large.
+ # We remove the bbox field to let mask AP use mask area.
+ for c in lvis_results:
+ c.pop("bbox", None)
+
+ if max_dets_per_image is None:
+ max_dets_per_image = 300 # Default for LVIS dataset
+
+ from lvis import LVISEval, LVISResults
+
+ logger.info(f"Evaluating with max detections per image = {max_dets_per_image}")
+ lvis_results = LVISResults(lvis_gt, lvis_results, max_dets=max_dets_per_image)
+ lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type)
+ lvis_eval.run()
+ lvis_eval.print_results()
+
+ # Pull the standard metrics from the LVIS results
+ results = lvis_eval.get_results()
+ results = {metric: float(results[metric] * 100) for metric in metrics}
+ logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results))
+ return results
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/panoptic_evaluation.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/panoptic_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..783a3e0657a0428d2e22a7e58d57f8d1c278e5e0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/panoptic_evaluation.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import io
+import itertools
+import json
+import logging
+import numpy as np
+import os
+import tempfile
+from collections import OrderedDict
+from typing import Optional
+from PIL import Image
+from tabulate import tabulate
+
+from custom_detectron2.data import MetadataCatalog
+from custom_detectron2.utils import comm
+from custom_detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class COCOPanopticEvaluator(DatasetEvaluator):
+ """
+ Evaluate Panoptic Quality metrics on COCO using PanopticAPI.
+    It saves panoptic segmentation predictions in `output_dir`.
+
+ It contains a synchronize call and has to be called from all workers.
+ """
+
+ def __init__(self, dataset_name: str, output_dir: Optional[str] = None):
+ """
+ Args:
+ dataset_name: name of the dataset
+ output_dir: output directory to save results for evaluation.
+ """
+ self._metadata = MetadataCatalog.get(dataset_name)
+ self._thing_contiguous_id_to_dataset_id = {
+ v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+ }
+ self._stuff_contiguous_id_to_dataset_id = {
+ v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items()
+ }
+
+ self._output_dir = output_dir
+ if self._output_dir is not None:
+ PathManager.mkdirs(self._output_dir)
+
+ def reset(self):
+ self._predictions = []
+
+ def _convert_category_id(self, segment_info):
+ isthing = segment_info.pop("isthing", None)
+ if isthing is None:
+ # the model produces panoptic category id directly. No more conversion needed
+ return segment_info
+ if isthing is True:
+ segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[
+ segment_info["category_id"]
+ ]
+ else:
+ segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[
+ segment_info["category_id"]
+ ]
+ return segment_info
+
+ def process(self, inputs, outputs):
+ from panopticapi.utils import id2rgb
+
+ for input, output in zip(inputs, outputs):
+ panoptic_img, segments_info = output["panoptic_seg"]
+ panoptic_img = panoptic_img.cpu().numpy()
+ if segments_info is None:
+ # If "segments_info" is None, we assume "panoptic_img" is a
+ # H*W int32 image storing the panoptic_id in the format of
+ # category_id * label_divisor + instance_id. We reserve -1 for
+ # VOID label, and add 1 to panoptic_img since the official
+ # evaluation script uses 0 for VOID label.
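+                # For example, with label_divisor=1000, panoptic id 23017 decodes to
+                # category_id 23 and instance_id 17 (and -1 marks VOID before the +1 shift).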
+ label_divisor = self._metadata.label_divisor
+ segments_info = []
+ for panoptic_label in np.unique(panoptic_img):
+ if panoptic_label == -1:
+ # VOID region.
+ continue
+ pred_class = panoptic_label // label_divisor
+ isthing = (
+ pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values()
+ )
+ segments_info.append(
+ {
+ "id": int(panoptic_label) + 1,
+ "category_id": int(pred_class),
+ "isthing": bool(isthing),
+ }
+ )
+ # Official evaluation script uses 0 for VOID label.
+ panoptic_img += 1
+
+ file_name = os.path.basename(input["file_name"])
+ file_name_png = os.path.splitext(file_name)[0] + ".png"
+ with io.BytesIO() as out:
+ Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
+ segments_info = [self._convert_category_id(x) for x in segments_info]
+ self._predictions.append(
+ {
+ "image_id": input["image_id"],
+ "file_name": file_name_png,
+ "png_string": out.getvalue(),
+ "segments_info": segments_info,
+ }
+ )
+
+ def evaluate(self):
+ comm.synchronize()
+
+ self._predictions = comm.gather(self._predictions)
+ self._predictions = list(itertools.chain(*self._predictions))
+ if not comm.is_main_process():
+ return
+
+ # PanopticApi requires local files
+ gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
+ gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)
+
+ with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
+ logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
+ for p in self._predictions:
+ with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
+ f.write(p.pop("png_string"))
+
+ with open(gt_json, "r") as f:
+ json_data = json.load(f)
+ json_data["annotations"] = self._predictions
+
+ output_dir = self._output_dir or pred_dir
+ predictions_json = os.path.join(output_dir, "predictions.json")
+ with PathManager.open(predictions_json, "w") as f:
+ f.write(json.dumps(json_data))
+
+ from panopticapi.evaluation import pq_compute
+
+ with contextlib.redirect_stdout(io.StringIO()):
+ pq_res = pq_compute(
+ gt_json,
+ PathManager.get_local_path(predictions_json),
+ gt_folder=gt_folder,
+ pred_folder=pred_dir,
+ )
+
+ res = {}
+ res["PQ"] = 100 * pq_res["All"]["pq"]
+ res["SQ"] = 100 * pq_res["All"]["sq"]
+ res["RQ"] = 100 * pq_res["All"]["rq"]
+ res["PQ_th"] = 100 * pq_res["Things"]["pq"]
+ res["SQ_th"] = 100 * pq_res["Things"]["sq"]
+ res["RQ_th"] = 100 * pq_res["Things"]["rq"]
+ res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
+ res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
+ res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]
+
+ results = OrderedDict({"panoptic_seg": res})
+ _print_panoptic_results(pq_res)
+
+ return results
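+
+
+# Illustrative usage (a minimal sketch; `model` and `panoptic_loader` are hypothetical, and the
+# dataset metadata must provide "panoptic_json" and "panoptic_root"):
+#   evaluator = COCOPanopticEvaluator("my_panoptic_val", output_dir="./panoptic_eval")
+#   results = inference_on_dataset(model, panoptic_loader, evaluator)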
+
+
+def _print_panoptic_results(pq_res):
+ headers = ["", "PQ", "SQ", "RQ", "#categories"]
+ data = []
+ for name in ["All", "Things", "Stuff"]:
+ row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]]
+ data.append(row)
+ table = tabulate(
+ data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center"
+ )
+ logger.info("Panoptic Evaluation Results:\n" + table)
+
+
+if __name__ == "__main__":
+ from custom_detectron2.utils.logger import setup_logger
+
+ logger = setup_logger()
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--gt-json")
+ parser.add_argument("--gt-dir")
+ parser.add_argument("--pred-json")
+ parser.add_argument("--pred-dir")
+ args = parser.parse_args()
+
+ from panopticapi.evaluation import pq_compute
+
+ with contextlib.redirect_stdout(io.StringIO()):
+ pq_res = pq_compute(
+ args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir
+ )
+ _print_panoptic_results(pq_res)
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/pascal_voc_evaluation.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/pascal_voc_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5f647e9a260ba4dbe66ba4d1495b0051a177e22
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/pascal_voc_evaluation.py
@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+import numpy as np
+import os
+import tempfile
+import xml.etree.ElementTree as ET
+from collections import OrderedDict, defaultdict
+from functools import lru_cache
+import torch
+
+from custom_detectron2.data import MetadataCatalog
+from custom_detectron2.utils import comm
+from custom_detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+
+class PascalVOCDetectionEvaluator(DatasetEvaluator):
+ """
+ Evaluate Pascal VOC style AP for Pascal VOC dataset.
+ It contains a synchronization, therefore has to be called from all ranks.
+
+ Note that the concept of AP can be implemented in different ways and may not
+ produce identical results. This class mimics the implementation of the official
+ Pascal VOC Matlab API, and should produce similar but not identical results to the
+ official API.
+ """
+
+ def __init__(self, dataset_name):
+ """
+ Args:
+ dataset_name (str): name of the dataset, e.g., "voc_2007_test"
+ """
+ self._dataset_name = dataset_name
+ meta = MetadataCatalog.get(dataset_name)
+
+ # Too many tiny files, download all to local for speed.
+ annotation_dir_local = PathManager.get_local_path(
+ os.path.join(meta.dirname, "Annotations/")
+ )
+ self._anno_file_template = os.path.join(annotation_dir_local, "{}.xml")
+ self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt")
+ self._class_names = meta.thing_classes
+ assert meta.year in [2007, 2012], meta.year
+ self._is_2007 = meta.year == 2007
+ self._cpu_device = torch.device("cpu")
+ self._logger = logging.getLogger(__name__)
+
+ def reset(self):
+ self._predictions = defaultdict(list) # class name -> list of prediction strings
+
+ def process(self, inputs, outputs):
+ for input, output in zip(inputs, outputs):
+ image_id = input["image_id"]
+ instances = output["instances"].to(self._cpu_device)
+ boxes = instances.pred_boxes.tensor.numpy()
+ scores = instances.scores.tolist()
+ classes = instances.pred_classes.tolist()
+ for box, score, cls in zip(boxes, scores, classes):
+ xmin, ymin, xmax, ymax = box
+ # The inverse of data loading logic in `datasets/pascal_voc.py`
+ xmin += 1
+ ymin += 1
+ self._predictions[cls].append(
+ f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}"
+ )
+
+ def evaluate(self):
+ """
+ Returns:
+            dict: has a key "bbox", whose value is a dict of "AP", "AP50", and "AP75".
+ """
+ all_predictions = comm.gather(self._predictions, dst=0)
+ if not comm.is_main_process():
+ return
+ predictions = defaultdict(list)
+ for predictions_per_rank in all_predictions:
+ for clsid, lines in predictions_per_rank.items():
+ predictions[clsid].extend(lines)
+ del all_predictions
+
+ self._logger.info(
+ "Evaluating {} using {} metric. "
+ "Note that results do not use the official Matlab API.".format(
+ self._dataset_name, 2007 if self._is_2007 else 2012
+ )
+ )
+
+ with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
+ res_file_template = os.path.join(dirname, "{}.txt")
+
+ aps = defaultdict(list) # iou -> ap per class
+ for cls_id, cls_name in enumerate(self._class_names):
+ lines = predictions.get(cls_id, [""])
+
+ with open(res_file_template.format(cls_name), "w") as f:
+ f.write("\n".join(lines))
+
+ for thresh in range(50, 100, 5):
+ rec, prec, ap = voc_eval(
+ res_file_template,
+ self._anno_file_template,
+ self._image_set_path,
+ cls_name,
+ ovthresh=thresh / 100.0,
+ use_07_metric=self._is_2007,
+ )
+ aps[thresh].append(ap * 100)
+
+ ret = OrderedDict()
+ mAP = {iou: np.mean(x) for iou, x in aps.items()}
+ ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]}
+ return ret
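+
+
+# Illustrative usage (a minimal sketch; `model` and `voc_loader` are hypothetical, and the dataset
+# metadata must provide `dirname`, `split`, `year`, and `thing_classes`):
+#   evaluator = PascalVOCDetectionEvaluator("voc_2007_test")
+#   results = inference_on_dataset(model, voc_loader, evaluator)  # {"bbox": {"AP", "AP50", "AP75"}}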
+
+
+##############################################################################
+#
+# Below code is modified from
+# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+
+"""Python implementation of the PASCAL VOC devkit's AP evaluation code."""
+
+
+@lru_cache(maxsize=None)
+def parse_rec(filename):
+ """Parse a PASCAL VOC xml file."""
+ with PathManager.open(filename) as f:
+ tree = ET.parse(f)
+ objects = []
+ for obj in tree.findall("object"):
+ obj_struct = {}
+ obj_struct["name"] = obj.find("name").text
+ obj_struct["pose"] = obj.find("pose").text
+ obj_struct["truncated"] = int(obj.find("truncated").text)
+ obj_struct["difficult"] = int(obj.find("difficult").text)
+ bbox = obj.find("bndbox")
+ obj_struct["bbox"] = [
+ int(bbox.find("xmin").text),
+ int(bbox.find("ymin").text),
+ int(bbox.find("xmax").text),
+ int(bbox.find("ymax").text),
+ ]
+ objects.append(obj_struct)
+
+ return objects
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+ """Compute VOC AP given precision and recall. If use_07_metric is true, uses
+ the VOC 07 11-point method (default:False).
+ """
+ if use_07_metric:
+ # 11 point metric
+ ap = 0.0
+ for t in np.arange(0.0, 1.1, 0.1):
+ if np.sum(rec >= t) == 0:
+ p = 0
+ else:
+ p = np.max(prec[rec >= t])
+ ap = ap + p / 11.0
+ else:
+ # correct AP calculation
+ # first append sentinel values at the end
+ mrec = np.concatenate(([0.0], rec, [1.0]))
+ mpre = np.concatenate(([0.0], prec, [0.0]))
+
+ # compute the precision envelope
+ for i in range(mpre.size - 1, 0, -1):
+ mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+ # to calculate area under PR curve, look for points
+ # where X axis (recall) changes value
+ i = np.where(mrec[1:] != mrec[:-1])[0]
+
+ # and sum (\Delta recall) * prec
+ ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+ return ap
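+
+
+# Worked example for voc_ap (an illustrative sanity check, not part of the original code):
+#   rec, prec = np.array([0.5, 1.0]), np.array([1.0, 0.5])
+#   voc_ap(rec, prec, use_07_metric=False)  # -> 0.5 * 1.0 + 0.5 * 0.5 = 0.75 (area under the PR envelope)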
+
+
+def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False):
+ """rec, prec, ap = voc_eval(detpath,
+ annopath,
+ imagesetfile,
+ classname,
+ [ovthresh],
+ [use_07_metric])
+
+ Top level function that does the PASCAL VOC evaluation.
+
+ detpath: Path to detections
+ detpath.format(classname) should produce the detection results file.
+ annopath: Path to annotations
+ annopath.format(imagename) should be the xml annotations file.
+ imagesetfile: Text file containing the list of images, one image per line.
+ classname: Category name (duh)
+ [ovthresh]: Overlap threshold (default = 0.5)
+ [use_07_metric]: Whether to use VOC07's 11 point AP computation
+ (default False)
+ """
+ # assumes detections are in detpath.format(classname)
+ # assumes annotations are in annopath.format(imagename)
+ # assumes imagesetfile is a text file with each line an image name
+
+ # first load gt
+ # read list of images
+ with PathManager.open(imagesetfile, "r") as f:
+ lines = f.readlines()
+ imagenames = [x.strip() for x in lines]
+
+ # load annots
+ recs = {}
+ for imagename in imagenames:
+ recs[imagename] = parse_rec(annopath.format(imagename))
+
+ # extract gt objects for this class
+ class_recs = {}
+ npos = 0
+ for imagename in imagenames:
+ R = [obj for obj in recs[imagename] if obj["name"] == classname]
+ bbox = np.array([x["bbox"] for x in R])
+ difficult = np.array([x["difficult"] for x in R]).astype(bool)
+ # difficult = np.array([False for x in R]).astype(bool) # treat all "difficult" as GT
+ det = [False] * len(R)
+ npos = npos + sum(~difficult)
+ class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}
+
+ # read dets
+ detfile = detpath.format(classname)
+ with open(detfile, "r") as f:
+ lines = f.readlines()
+
+ splitlines = [x.strip().split(" ") for x in lines]
+ image_ids = [x[0] for x in splitlines]
+ confidence = np.array([float(x[1]) for x in splitlines])
+ BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4)
+
+ # sort by confidence
+ sorted_ind = np.argsort(-confidence)
+ BB = BB[sorted_ind, :]
+ image_ids = [image_ids[x] for x in sorted_ind]
+
+ # go down dets and mark TPs and FPs
+ nd = len(image_ids)
+ tp = np.zeros(nd)
+ fp = np.zeros(nd)
+ for d in range(nd):
+ R = class_recs[image_ids[d]]
+ bb = BB[d, :].astype(float)
+ ovmax = -np.inf
+ BBGT = R["bbox"].astype(float)
+
+ if BBGT.size > 0:
+ # compute overlaps
+ # intersection
+ ixmin = np.maximum(BBGT[:, 0], bb[0])
+ iymin = np.maximum(BBGT[:, 1], bb[1])
+ ixmax = np.minimum(BBGT[:, 2], bb[2])
+ iymax = np.minimum(BBGT[:, 3], bb[3])
+ iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
+ ih = np.maximum(iymax - iymin + 1.0, 0.0)
+ inters = iw * ih
+
+ # union
+ uni = (
+ (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+ + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
+ - inters
+ )
+
+ overlaps = inters / uni
+ ovmax = np.max(overlaps)
+ jmax = np.argmax(overlaps)
+
+ if ovmax > ovthresh:
+ if not R["difficult"][jmax]:
+ if not R["det"][jmax]:
+ tp[d] = 1.0
+ R["det"][jmax] = 1
+ else:
+ fp[d] = 1.0
+ else:
+ fp[d] = 1.0
+
+ # compute precision recall
+ fp = np.cumsum(fp)
+ tp = np.cumsum(tp)
+ rec = tp / float(npos)
+ # avoid divide by zero in case the first detection matches a difficult
+ # ground truth
+ prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+ ap = voc_ap(rec, prec, use_07_metric)
+
+ return rec, prec, ap
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/rotated_coco_evaluation.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/rotated_coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..3da22b9813aa3e0965f022f8fd72ead3a43a8d47
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/rotated_coco_evaluation.py
@@ -0,0 +1,207 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import json
+import numpy as np
+import os
+import torch
+from custom_pycocotools.cocoeval import COCOeval, maskUtils
+
+from custom_detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated
+from custom_detectron2.utils.file_io import PathManager
+
+from .coco_evaluation import COCOEvaluator
+
+
+class RotatedCOCOeval(COCOeval):
+ @staticmethod
+ def is_rotated(box_list):
+ if type(box_list) == np.ndarray:
+ return box_list.shape[1] == 5
+ elif type(box_list) == list:
+ if box_list == []: # cannot decide the box_dim
+ return False
+ return np.all(
+ np.array(
+ [
+ (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray))
+ for obj in box_list
+ ]
+ )
+ )
+ return False
+
+ @staticmethod
+ def boxlist_to_tensor(boxlist, output_box_dim):
+ if type(boxlist) == np.ndarray:
+ box_tensor = torch.from_numpy(boxlist)
+ elif type(boxlist) == list:
+ if boxlist == []:
+ return torch.zeros((0, output_box_dim), dtype=torch.float32)
+ else:
+ box_tensor = torch.FloatTensor(boxlist)
+ else:
+ raise Exception("Unrecognized boxlist type")
+
+ input_box_dim = box_tensor.shape[1]
+ if input_box_dim != output_box_dim:
+ if input_box_dim == 4 and output_box_dim == 5:
+ box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
+ else:
+ raise Exception(
+ "Unable to convert from {}-dim box to {}-dim box".format(
+ input_box_dim, output_box_dim
+ )
+ )
+ return box_tensor
+
+ def compute_iou_dt_gt(self, dt, gt, is_crowd):
+ if self.is_rotated(dt) or self.is_rotated(gt):
+ # TODO: take is_crowd into consideration
+ assert all(c == 0 for c in is_crowd)
+ dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5))
+ gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5))
+ return pairwise_iou_rotated(dt, gt)
+ else:
+ # This is the same as the classical COCO evaluation
+ return maskUtils.iou(dt, gt, is_crowd)
+
+ def computeIoU(self, imgId, catId):
+ p = self.params
+ if p.useCats:
+ gt = self._gts[imgId, catId]
+ dt = self._dts[imgId, catId]
+ else:
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+ if len(gt) == 0 and len(dt) == 0:
+ return []
+ inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+ dt = [dt[i] for i in inds]
+ if len(dt) > p.maxDets[-1]:
+ dt = dt[0 : p.maxDets[-1]]
+
+ assert p.iouType == "bbox", "unsupported iouType for iou computation"
+
+ g = [g["bbox"] for g in gt]
+ d = [d["bbox"] for d in dt]
+
+ # compute iou between each dt and gt region
+ iscrowd = [int(o["iscrowd"]) for o in gt]
+
+ # Note: this function is copied from cocoeval.py in cocoapi
+ # and the major difference is here.
+ ious = self.compute_iou_dt_gt(d, g, iscrowd)
+ return ious
+
+
+class RotatedCOCOEvaluator(COCOEvaluator):
+ """
+ Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs,
+ with rotated boxes support.
+ Note: this uses IOU only and does not consider angle differences.
+ """
+
+ def process(self, inputs, outputs):
+ """
+ Args:
+ inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+ It is a list of dict. Each dict corresponds to an image and
+ contains keys like "height", "width", "file_name", "image_id".
+ outputs: the outputs of a COCO model. It is a list of dicts with key
+ "instances" that contains :class:`Instances`.
+ """
+ for input, output in zip(inputs, outputs):
+ prediction = {"image_id": input["image_id"]}
+
+ if "instances" in output:
+ instances = output["instances"].to(self._cpu_device)
+
+ prediction["instances"] = self.instances_to_json(instances, input["image_id"])
+ if "proposals" in output:
+ prediction["proposals"] = output["proposals"].to(self._cpu_device)
+ self._predictions.append(prediction)
+
+ def instances_to_json(self, instances, img_id):
+ num_instance = len(instances)
+ if num_instance == 0:
+ return []
+
+ boxes = instances.pred_boxes.tensor.numpy()
+ if boxes.shape[1] == 4:
+ boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ boxes = boxes.tolist()
+ scores = instances.scores.tolist()
+ classes = instances.pred_classes.tolist()
+
+ results = []
+ for k in range(num_instance):
+ result = {
+ "image_id": img_id,
+ "category_id": classes[k],
+ "bbox": boxes[k],
+ "score": scores[k],
+ }
+
+ results.append(result)
+ return results
+
+ def _eval_predictions(self, predictions, img_ids=None): # img_ids: unused
+ """
+ Evaluate predictions on the given tasks.
+ Fill self._results with the metrics of the tasks.
+ """
+ self._logger.info("Preparing results for COCO format ...")
+ coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+
+ # unmap the category ids for COCO
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+ reverse_id_mapping = {
+ v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+ }
+ for result in coco_results:
+ result["category_id"] = reverse_id_mapping[result["category_id"]]
+
+ if self._output_dir:
+ file_path = os.path.join(self._output_dir, "coco_instances_results.json")
+ self._logger.info("Saving results to {}".format(file_path))
+ with PathManager.open(file_path, "w") as f:
+ f.write(json.dumps(coco_results))
+ f.flush()
+
+ if not self._do_evaluation:
+ self._logger.info("Annotations are not available for evaluation.")
+ return
+
+ self._logger.info("Evaluating predictions ...")
+
+ assert self._tasks is None or set(self._tasks) == {
+ "bbox"
+ }, "[RotatedCOCOEvaluator] Only bbox evaluation is supported"
+ coco_eval = (
+ self._evaluate_predictions_on_coco(self._coco_api, coco_results)
+ if len(coco_results) > 0
+ else None # cocoapi does not handle empty results very well
+ )
+
+ task = "bbox"
+ res = self._derive_coco_results(
+ coco_eval, task, class_names=self._metadata.get("thing_classes")
+ )
+ self._results[task] = res
+
+ def _evaluate_predictions_on_coco(self, coco_gt, coco_results):
+ """
+ Evaluate the coco results using COCOEval API.
+ """
+ assert len(coco_results) > 0
+
+ coco_dt = coco_gt.loadRes(coco_results)
+
+ # Only bbox is supported for now
+ coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox")
+
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
+
+ return coco_eval
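+
+
+# Illustrative usage (a minimal sketch; `model` and `rotated_loader` are hypothetical). Predictions
+# are expected to carry rotated (XYWHA, 5-value) boxes in `instances.pred_boxes`:
+#   evaluator = RotatedCOCOEvaluator("my_rotated_dataset", output_dir="./rotated_eval")
+#   results = inference_on_dataset(model, rotated_loader, evaluator)  # bbox AP only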
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/sem_seg_evaluation.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/sem_seg_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef629c2f840f484d5d87e42ff9f2d7f4cc6e33b5
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/sem_seg_evaluation.py
@@ -0,0 +1,265 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import json
+import logging
+import numpy as np
+import os
+from collections import OrderedDict
+from typing import Optional, Union
+import custom_pycocotools.mask as mask_util
+import torch
+from PIL import Image
+
+from custom_detectron2.data import DatasetCatalog, MetadataCatalog
+from custom_detectron2.utils.comm import all_gather, is_main_process, synchronize
+from custom_detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+_CV2_IMPORTED = True
+try:
+ import cv2 # noqa
+except ImportError:
+ # OpenCV is an optional dependency at the moment
+ _CV2_IMPORTED = False
+
+
+def load_image_into_numpy_array(
+ filename: str,
+ copy: bool = False,
+ dtype: Optional[Union[np.dtype, str]] = None,
+) -> np.ndarray:
+ with PathManager.open(filename, "rb") as f:
+ array = np.array(Image.open(f), copy=copy, dtype=dtype)
+ return array
+
+
+class SemSegEvaluator(DatasetEvaluator):
+ """
+ Evaluate semantic segmentation metrics.
+ """
+
+ def __init__(
+ self,
+ dataset_name,
+ distributed=True,
+ output_dir=None,
+ *,
+ sem_seg_loading_fn=load_image_into_numpy_array,
+ num_classes=None,
+ ignore_label=None,
+ ):
+ """
+ Args:
+ dataset_name (str): name of the dataset to be evaluated.
+ distributed (bool): if True, will collect results from all ranks for evaluation.
+ Otherwise, will evaluate the results in the current process.
+ output_dir (str): an output directory to dump results.
+ sem_seg_loading_fn: function to read sem seg file and load into numpy array.
+ Default provided, but projects can customize.
+ num_classes, ignore_label: deprecated argument
+ """
+ self._logger = logging.getLogger(__name__)
+ if num_classes is not None:
+ self._logger.warn(
+ "SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata."
+ )
+ if ignore_label is not None:
+ self._logger.warn(
+ "SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata."
+ )
+ self._dataset_name = dataset_name
+ self._distributed = distributed
+ self._output_dir = output_dir
+
+ self._cpu_device = torch.device("cpu")
+
+ self.input_file_to_gt_file = {
+ dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
+ for dataset_record in DatasetCatalog.get(dataset_name)
+ }
+
+ meta = MetadataCatalog.get(dataset_name)
+ # Dict that maps contiguous training ids to COCO category ids
+ try:
+ c2d = meta.stuff_dataset_id_to_contiguous_id
+ self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()}
+ except AttributeError:
+ self._contiguous_id_to_dataset_id = None
+ self._class_names = meta.stuff_classes
+ self.sem_seg_loading_fn = sem_seg_loading_fn
+ self._num_classes = len(meta.stuff_classes)
+ if num_classes is not None:
+ assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}"
+ self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label
+
+        # cv2.erode does not work on int arrays, only on uint8, which limits Boundary IoU support.
+ self._compute_boundary_iou = True
+ if not _CV2_IMPORTED:
+ self._compute_boundary_iou = False
+ self._logger.warn(
+ """Boundary IoU calculation requires OpenCV. B-IoU metrics are
+ not going to be computed because OpenCV is not available to import."""
+ )
+ if self._num_classes >= np.iinfo(np.uint8).max:
+ self._compute_boundary_iou = False
+ self._logger.warn(
+ f"""SemSegEvaluator(num_classes) is more than supported value for Boundary IoU calculation!
+ B-IoU metrics are not going to be computed. Max allowed value (exclusive)
+ for num_classes for calculating Boundary IoU is {np.iinfo(np.uint8).max}.
+ The number of classes of dataset {self._dataset_name} is {self._num_classes}"""
+ )
+
+ def reset(self):
+ self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64)
+ self._b_conf_matrix = np.zeros(
+ (self._num_classes + 1, self._num_classes + 1), dtype=np.int64
+ )
+ self._predictions = []
+
+ def process(self, inputs, outputs):
+ """
+ Args:
+ inputs: the inputs to a model.
+ It is a list of dicts. Each dict corresponds to an image and
+ contains keys like "height", "width", "file_name".
+ outputs: the outputs of a model. It is either list of semantic segmentation predictions
+ (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
+ segmentation prediction in the same format.
+ """
+ for input, output in zip(inputs, outputs):
+ output = output["sem_seg"].argmax(dim=0).to(self._cpu_device)
+            pred = np.array(output, dtype=int)
+ gt_filename = self.input_file_to_gt_file[input["file_name"]]
+            gt = self.sem_seg_loading_fn(gt_filename, dtype=int)
+
+ gt[gt == self._ignore_label] = self._num_classes
+
+ self._conf_matrix += np.bincount(
+ (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
+ minlength=self._conf_matrix.size,
+ ).reshape(self._conf_matrix.shape)
+
+ if self._compute_boundary_iou:
+ b_gt = self._mask_to_boundary(gt.astype(np.uint8))
+ b_pred = self._mask_to_boundary(pred.astype(np.uint8))
+
+ self._b_conf_matrix += np.bincount(
+ (self._num_classes + 1) * b_pred.reshape(-1) + b_gt.reshape(-1),
+ minlength=self._conf_matrix.size,
+ ).reshape(self._conf_matrix.shape)
+
+ self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))
+
+ def evaluate(self):
+ """
+ Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):
+
+ * Mean intersection-over-union averaged across classes (mIoU)
+ * Frequency Weighted IoU (fwIoU)
+ * Mean pixel accuracy averaged across classes (mACC)
+ * Pixel Accuracy (pACC)
+ """
+ if self._distributed:
+ synchronize()
+ conf_matrix_list = all_gather(self._conf_matrix)
+ b_conf_matrix_list = all_gather(self._b_conf_matrix)
+ self._predictions = all_gather(self._predictions)
+ self._predictions = list(itertools.chain(*self._predictions))
+ if not is_main_process():
+ return
+
+ self._conf_matrix = np.zeros_like(self._conf_matrix)
+ for conf_matrix in conf_matrix_list:
+ self._conf_matrix += conf_matrix
+
+ self._b_conf_matrix = np.zeros_like(self._b_conf_matrix)
+ for b_conf_matrix in b_conf_matrix_list:
+ self._b_conf_matrix += b_conf_matrix
+
+ if self._output_dir:
+ PathManager.mkdirs(self._output_dir)
+ file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
+ with PathManager.open(file_path, "w") as f:
+ f.write(json.dumps(self._predictions))
+
+        acc = np.full(self._num_classes, np.nan, dtype=float)
+        iou = np.full(self._num_classes, np.nan, dtype=float)
+        tp = self._conf_matrix.diagonal()[:-1].astype(float)
+        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
+        class_weights = pos_gt / np.sum(pos_gt)
+        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
+ acc_valid = pos_gt > 0
+ acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
+ union = pos_gt + pos_pred - tp
+ iou_valid = np.logical_and(acc_valid, union > 0)
+ iou[iou_valid] = tp[iou_valid] / union[iou_valid]
+ macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
+ miou = np.sum(iou[iou_valid]) / np.sum(iou_valid)
+ fiou = np.sum(iou[iou_valid] * class_weights[iou_valid])
+ pacc = np.sum(tp) / np.sum(pos_gt)
+
+ if self._compute_boundary_iou:
+            b_iou = np.full(self._num_classes, np.nan, dtype=float)
+            b_tp = self._b_conf_matrix.diagonal()[:-1].astype(float)
+            b_pos_gt = np.sum(self._b_conf_matrix[:-1, :-1], axis=0).astype(float)
+            b_pos_pred = np.sum(self._b_conf_matrix[:-1, :-1], axis=1).astype(float)
+ b_union = b_pos_gt + b_pos_pred - b_tp
+ b_iou_valid = b_union > 0
+ b_iou[b_iou_valid] = b_tp[b_iou_valid] / b_union[b_iou_valid]
+
+ res = {}
+ res["mIoU"] = 100 * miou
+ res["fwIoU"] = 100 * fiou
+ for i, name in enumerate(self._class_names):
+ res[f"IoU-{name}"] = 100 * iou[i]
+ if self._compute_boundary_iou:
+ res[f"BoundaryIoU-{name}"] = 100 * b_iou[i]
+ res[f"min(IoU, B-Iou)-{name}"] = 100 * min(iou[i], b_iou[i])
+ res["mACC"] = 100 * macc
+ res["pACC"] = 100 * pacc
+ for i, name in enumerate(self._class_names):
+ res[f"ACC-{name}"] = 100 * acc[i]
+
+ if self._output_dir:
+ file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
+ with PathManager.open(file_path, "wb") as f:
+ torch.save(res, f)
+ results = OrderedDict({"sem_seg": res})
+ self._logger.info(results)
+ return results
+
+ def encode_json_sem_seg(self, sem_seg, input_file_name):
+ """
+ Convert semantic segmentation to COCO stuff format with segments encoded as RLEs.
+ See http://cocodataset.org/#format-results
+ """
+ json_list = []
+ for label in np.unique(sem_seg):
+ if self._contiguous_id_to_dataset_id is not None:
+ assert (
+ label in self._contiguous_id_to_dataset_id
+ ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name)
+ dataset_id = self._contiguous_id_to_dataset_id[label]
+ else:
+ dataset_id = int(label)
+ mask = (sem_seg == label).astype(np.uint8)
+ mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0]
+ mask_rle["counts"] = mask_rle["counts"].decode("utf-8")
+ json_list.append(
+ {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle}
+ )
+ return json_list
+
+ def _mask_to_boundary(self, mask: np.ndarray, dilation_ratio=0.02):
+ assert mask.ndim == 2, "mask_to_boundary expects a 2-dimensional image"
+ h, w = mask.shape
+ diag_len = np.sqrt(h**2 + w**2)
+ dilation = max(1, int(round(dilation_ratio * diag_len)))
+ kernel = np.ones((3, 3), dtype=np.uint8)
+
+ padded_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
+ eroded_mask_with_padding = cv2.erode(padded_mask, kernel, iterations=dilation)
+ eroded_mask = eroded_mask_with_padding[1:-1, 1:-1]
+ boundary = mask - eroded_mask
+ return boundary
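+
+
+# Illustrative usage (a minimal sketch; `model` and `sem_seg_loader` are hypothetical, and the dataset
+# records need "sem_seg_file_name" plus `stuff_classes` / `ignore_label` metadata):
+#   evaluator = SemSegEvaluator("my_sem_seg_val", output_dir="./sem_seg_eval")
+#   results = inference_on_dataset(model, sem_seg_loader, evaluator)  # mIoU, fwIoU, mACC, pACC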
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/evaluation/testing.py b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/testing.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3869139b64a6ed2e28e356e1cd6fe22349345ca
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/evaluation/testing.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+import pprint
+import sys
+from collections.abc import Mapping
+
+
+def print_csv_format(results):
+ """
+ Print main metrics in a format similar to Detectron,
+ so that they are easy to copypaste into a spreadsheet.
+
+ Args:
+ results (OrderedDict[dict]): task_name -> {metric -> score}
+ unordered dict can also be printed, but in arbitrary order
+ """
+ assert isinstance(results, Mapping) or not len(results), results
+ logger = logging.getLogger(__name__)
+ for task, res in results.items():
+ if isinstance(res, Mapping):
+ # Don't print "AP-category" metrics since they are usually not tracked.
+ important_res = [(k, v) for k, v in res.items() if "-" not in k]
+ logger.info("copypaste: Task: {}".format(task))
+ logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
+ logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
+ else:
+ logger.info(f"copypaste: {task}={res}")
+
+
+def verify_results(cfg, results):
+ """
+ Args:
+ results (OrderedDict[dict]): task_name -> {metric -> score}
+
+ Returns:
+ bool: whether the verification succeeds or not
+ """
+ expected_results = cfg.TEST.EXPECTED_RESULTS
+ if not len(expected_results):
+ return True
+
+ ok = True
+ for task, metric, expected, tolerance in expected_results:
+ actual = results[task].get(metric, None)
+ if actual is None:
+ ok = False
+ continue
+ if not np.isfinite(actual):
+ ok = False
+ continue
+ diff = abs(actual - expected)
+ if diff > tolerance:
+ ok = False
+
+ logger = logging.getLogger(__name__)
+ if not ok:
+ logger.error("Result verification failed!")
+ logger.error("Expected Results: " + str(expected_results))
+ logger.error("Actual Results: " + pprint.pformat(results))
+
+ sys.exit(1)
+ else:
+ logger.info("Results verification passed.")
+ return ok
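+
+
+# Illustrative config sketch (hypothetical values): each entry is (task, metric, expected, tolerance).
+#   cfg.TEST.EXPECTED_RESULTS = [["bbox", "AP", 38.5, 0.2], ["segm", "AP", 35.0, 0.2]]
+#   verify_results(cfg, results)  # exits the process if any metric deviates beyond its tolerance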
+
+
+def flatten_results_dict(results):
+ """
+ Expand a hierarchical dict of scalars into a flat dict of scalars.
+ If results[k1][k2][k3] = v, the returned dict will have the entry
+ {"k1/k2/k3": v}.
+
+ Args:
+ results (dict):
+ """
+ r = {}
+ for k, v in results.items():
+ if isinstance(v, Mapping):
+ v = flatten_results_dict(v)
+ for kk, vv in v.items():
+ r[k + "/" + kk] = vv
+ else:
+ r[k] = v
+ return r
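+
+
+# Example (illustrative):
+#   flatten_results_dict({"bbox": {"AP": 40.0, "AP50": 60.0}})
+#   # -> {"bbox/AP": 40.0, "bbox/AP50": 60.0}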
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/README.md b/comfyui_controlnet_aux/src/custom_detectron2/export/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..23d26ae2140fc20fc3b16b80b7c2954838ea3ea6
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/README.md
@@ -0,0 +1,15 @@
+
+This directory contains code to prepare a detectron2 model for deployment.
+Currently it supports exporting a detectron2 model to TorchScript, ONNX, or (deprecated) Caffe2 format.
+
+Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage.
+
+
+### Acknowledgements
+
+Thanks to the Mobile Vision team at Facebook for developing the Caffe2 conversion tools.
+
+Thanks to the Computing Platform Department - PAI team at Alibaba Group (@bddpqq, @chenbohua3), who
+helped export Detectron2 models to TorchScript.
+
+Thanks to the ONNX Converter team at Microsoft, who helped export Detectron2 models to ONNX.
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/export/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9a5f2e8562f2e4966c1e27ddafafa9abc934ede
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/__init__.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+
+import warnings
+
+from .flatten import TracingAdapter
+from .torchscript import dump_torchscript_IR, scripting_with_instances
+
+try:
+ from caffe2.proto import caffe2_pb2 as _tmp
+ from caffe2.python import core
+
+ # caffe2 is optional
+except ImportError:
+ pass
+else:
+ from .api import *
+
+
+# TODO: Update ONNX Opset version and run tests when a newer PyTorch is supported
+STABLE_ONNX_OPSET_VERSION = 11
+
+
+def add_export_config(cfg):
+ warnings.warn(
+        "add_export_config has been deprecated and behaves as a no-op function.", DeprecationWarning
+ )
+ return cfg
+
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
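+
+
+# Usage sketch (illustrative): STABLE_ONNX_OPSET_VERSION above is typically
+# passed as the `opset_version` argument when exporting with torch.onnx.export, e.g.
+#
+#   torch.onnx.export(traceable_model, (inputs,), "model.onnx",
+#                     opset_version=STABLE_ONNX_OPSET_VERSION)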
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/api.py b/comfyui_controlnet_aux/src/custom_detectron2/export/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2006e68cbe98df26e13fd141776cd2eef434cf9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/api.py
@@ -0,0 +1,230 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import os
+import torch
+from caffe2.proto import caffe2_pb2
+from torch import nn
+
+from custom_detectron2.config import CfgNode
+from custom_detectron2.utils.file_io import PathManager
+
+from .caffe2_inference import ProtobufDetectionModel
+from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
+from .shared import get_pb_arg_vali, get_pb_arg_vals, save_graph
+
+__all__ = [
+ "Caffe2Model",
+ "Caffe2Tracer",
+]
+
+
+class Caffe2Tracer:
+ """
+ Make a detectron2 model traceable with Caffe2 operators.
+ This class creates a traceable version of a detectron2 model which:
+
+    1. Rewrites parts of the model using ops in Caffe2. Note that some ops do
+       not have GPU implementations in Caffe2.
+    2. Removes post-processing and only produces raw layer outputs.
+
+    After making a traceable model, the class provides methods to export such a
+    model to different deployment formats.
+    The exported graph produced by this class takes two input tensors:
+
+    1. (1, C, H, W) float "data" which is an image (usually in [0, 255]).
+       (H, W) often has to be padded to a multiple of 32 (depending on the model
+       architecture).
+    2. 1x3 float "im_info", each row of which is (height, width, 1.0).
+       Height and width are true image shapes before padding.
+
+ The class currently only supports models using builtin meta architectures.
+ Batch inference is not supported, and contributions are welcome.
+ """
+
+ def __init__(self, cfg: CfgNode, model: nn.Module, inputs):
+ """
+ Args:
+ cfg (CfgNode): a detectron2 config used to construct caffe2-compatible model.
+ model (nn.Module): An original pytorch model. Must be among a few official models
+ in detectron2 that can be converted to become caffe2-compatible automatically.
+ Weights have to be already loaded to this model.
+ inputs: sample inputs that the given model takes for inference.
+ Will be used to trace the model. For most models, random inputs with
+ no detected objects will not work as they lead to wrong traces.
+ """
+ assert isinstance(cfg, CfgNode), cfg
+ assert isinstance(model, torch.nn.Module), type(model)
+
+ # TODO make it support custom models, by passing in c2 model directly
+ C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
+ self.traceable_model = C2MetaArch(cfg, copy.deepcopy(model))
+ self.inputs = inputs
+ self.traceable_inputs = self.traceable_model.get_caffe2_inputs(inputs)
+
+ def export_caffe2(self):
+ """
+ Export the model to Caffe2's protobuf format.
+ The returned object can be saved with its :meth:`.save_protobuf()` method.
+ The result can be loaded and executed using Caffe2 runtime.
+
+ Returns:
+ :class:`Caffe2Model`
+ """
+ from .caffe2_export import export_caffe2_detection_model
+
+ predict_net, init_net = export_caffe2_detection_model(
+ self.traceable_model, self.traceable_inputs
+ )
+ return Caffe2Model(predict_net, init_net)
+
+ def export_onnx(self):
+ """
+ Export the model to ONNX format.
+ Note that the exported model contains custom ops only available in caffe2, therefore it
+        cannot be directly executed by other runtimes (such as onnxruntime or TensorRT).
+ Post-processing or transformation passes may be applied on the model to accommodate
+ different runtimes, but we currently do not provide support for them.
+
+ Returns:
+ onnx.ModelProto: an onnx model.
+ """
+ from .caffe2_export import export_onnx_model as export_onnx_model_impl
+
+ return export_onnx_model_impl(self.traceable_model, (self.traceable_inputs,))
+
+ def export_torchscript(self):
+ """
+ Export the model to a ``torch.jit.TracedModule`` by tracing.
+ The returned object can be saved to a file by ``.save()``.
+
+ Returns:
+ torch.jit.TracedModule: a torch TracedModule
+ """
+ logger = logging.getLogger(__name__)
+ logger.info("Tracing the model with torch.jit.trace ...")
+ with torch.no_grad():
+ return torch.jit.trace(self.traceable_model, (self.traceable_inputs,))
+
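+# A minimal usage sketch (assuming `cfg`, `torch_model`, and sample `inputs`
+# prepared as described in Caffe2Tracer.__init__ above):
+#
+#   tracer = Caffe2Tracer(cfg, torch_model, inputs)
+#   ts_model = tracer.export_torchscript()  # torch.jit.TracedModule
+#   ts_model.save("model.ts")
+#   c2_model = tracer.export_caffe2()       # Caffe2Model, defined below
+#   c2_model.save_protobuf("./deploy_dir")  # "./deploy_dir" is a placeholder
+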
+
+class Caffe2Model(nn.Module):
+ """
+ A wrapper around the traced model in Caffe2's protobuf format.
+ The exported graph has different inputs/outputs from the original Pytorch
+ model, as explained in :class:`Caffe2Tracer`. This class wraps around the
+ exported graph to simulate the same interface as the original Pytorch model.
+    It also provides functions to save/load models in Caffe2's format.
+
+ Examples:
+ ::
+ c2_model = Caffe2Tracer(cfg, torch_model, inputs).export_caffe2()
+ inputs = [{"image": img_tensor_CHW}]
+ outputs = c2_model(inputs)
+ orig_outputs = torch_model(inputs)
+ """
+
+ def __init__(self, predict_net, init_net):
+ super().__init__()
+ self.eval() # always in eval mode
+ self._predict_net = predict_net
+ self._init_net = init_net
+ self._predictor = None
+
+ __init__.__HIDE_SPHINX_DOC__ = True
+
+ @property
+ def predict_net(self):
+ """
+ caffe2.core.Net: the underlying caffe2 predict net
+ """
+ return self._predict_net
+
+ @property
+ def init_net(self):
+ """
+ caffe2.core.Net: the underlying caffe2 init net
+ """
+ return self._init_net
+
+ def save_protobuf(self, output_dir):
+ """
+ Save the model as caffe2's protobuf format.
+ It saves the following files:
+
+ * "model.pb": definition of the graph. Can be visualized with
+          tools like netron.
+ * "model_init.pb": model parameters
+ * "model.pbtxt": human-readable definition of the graph. Not
+ needed for deployment.
+
+ Args:
+ output_dir (str): the output directory to save protobuf files.
+ """
+ logger = logging.getLogger(__name__)
+ logger.info("Saving model to {} ...".format(output_dir))
+ if not PathManager.exists(output_dir):
+ PathManager.mkdirs(output_dir)
+
+ with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f:
+ f.write(self._predict_net.SerializeToString())
+ with PathManager.open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
+ f.write(str(self._predict_net))
+ with PathManager.open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
+ f.write(self._init_net.SerializeToString())
+
+ def save_graph(self, output_file, inputs=None):
+ """
+ Save the graph as SVG format.
+
+ Args:
+ output_file (str): a SVG file
+ inputs: optional inputs given to the model.
+ If given, the inputs will be used to run the graph to record
+ shape of every tensor. The shape information will be
+ saved together with the graph.
+ """
+ from .caffe2_export import run_and_save_graph
+
+ if inputs is None:
+ save_graph(self._predict_net, output_file, op_only=False)
+ else:
+ size_divisibility = get_pb_arg_vali(self._predict_net, "size_divisibility", 0)
+ device = get_pb_arg_vals(self._predict_net, "device", b"cpu").decode("ascii")
+ inputs = convert_batched_inputs_to_c2_format(inputs, size_divisibility, device)
+ inputs = [x.cpu().numpy() for x in inputs]
+ run_and_save_graph(self._predict_net, self._init_net, inputs, output_file)
+
+ @staticmethod
+ def load_protobuf(dir):
+ """
+ Args:
+ dir (str): a directory used to save Caffe2Model with
+ :meth:`save_protobuf`.
+ The files "model.pb" and "model_init.pb" are needed.
+
+ Returns:
+ Caffe2Model: the caffe2 model loaded from this directory.
+ """
+ predict_net = caffe2_pb2.NetDef()
+ with PathManager.open(os.path.join(dir, "model.pb"), "rb") as f:
+ predict_net.ParseFromString(f.read())
+
+ init_net = caffe2_pb2.NetDef()
+ with PathManager.open(os.path.join(dir, "model_init.pb"), "rb") as f:
+ init_net.ParseFromString(f.read())
+
+ return Caffe2Model(predict_net, init_net)
+
+ def __call__(self, inputs):
+ """
+ An interface that wraps around a Caffe2 model and mimics detectron2's models'
+ input/output format. See details about the format at :doc:`/tutorials/models`.
+ This is used to compare the outputs of caffe2 model with its original torch model.
+
+        Due to the extra conversion between Pytorch/Caffe2, this method is not meant for
+        benchmarking. Because of the conversion, it also depends on detectron2 in order
+        to convert to detectron2's output format.
+ """
+ if self._predictor is None:
+ self._predictor = ProtobufDetectionModel(self._predict_net, self._init_net)
+ return self._predictor(inputs)
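+
+
+# Usage sketch (illustrative): reload an exported model and run it on
+# detectron2-format inputs, e.g. to sanity-check against the original torch model:
+#
+#   c2_model = Caffe2Model.load_protobuf("./deploy_dir")
+#   outputs = c2_model([{"image": img_tensor_CHW}])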
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/c10.py b/comfyui_controlnet_aux/src/custom_detectron2/export/c10.py
new file mode 100644
index 0000000000000000000000000000000000000000..49a9f38f2a02f92665f852fa178f8b1eabff51cb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/c10.py
@@ -0,0 +1,557 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import math
+from typing import Dict
+import torch
+import torch.nn.functional as F
+
+from custom_detectron2.layers import ShapeSpec, cat
+from custom_detectron2.layers.roi_align_rotated import ROIAlignRotated
+from custom_detectron2.modeling import poolers
+from custom_detectron2.modeling.proposal_generator import rpn
+from custom_detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference
+from custom_detectron2.structures import Boxes, ImageList, Instances, Keypoints, RotatedBoxes
+
+from .shared import alias, to_device
+
+
+"""
+This file contains caffe2-compatible implementation of several detectron2 components.
+"""
+
+
+class Caffe2Boxes(Boxes):
+ """
+    Representing a list of detectron2.structures.Boxes from a minibatch, where each box
+    is represented by a 5d vector (batch index + 4 coordinates), or a 6d vector
+    (batch index + 5 coordinates) for RotatedBoxes.
+ """
+
+ def __init__(self, tensor):
+ assert isinstance(tensor, torch.Tensor)
+ assert tensor.dim() == 2 and tensor.size(-1) in [4, 5, 6], tensor.size()
+ # TODO: make tensor immutable when dim is Nx5 for Boxes,
+ # and Nx6 for RotatedBoxes?
+ self.tensor = tensor
+
+
+# TODO clean up this class, maybe just extend Instances
+class InstancesList(object):
+ """
+ Tensor representation of a list of Instances object for a batch of images.
+
+    When dealing with a batch of images with Caffe2 ops, a list of bboxes
+    (instances) is usually represented by a single Tensor with size
+    (sigma(Ni), 5) or (sigma(Ni), 4), plus a batch split Tensor. This class
+    provides common functions to convert between these two representations.
+ """
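+    # Illustration (hypothetical numbers): a batch of 2 images with 3 and 2 boxes
+    # is stored as one (5, 5) tensor (batch index + 4 coordinates) inside
+    # `batch_extra_fields`, with `indices == [0, 0, 0, 1, 1]` recording which
+    # image each row belongs to.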
+
+ def __init__(self, im_info, indices, extra_fields=None):
+ # [N, 3] -> (H, W, Scale)
+ self.im_info = im_info
+        # [N,] -> index of the image in the batch to which the instance belongs
+ self.indices = indices
+ # [N, ...]
+ self.batch_extra_fields = extra_fields or {}
+
+ self.image_size = self.im_info
+
+ def get_fields(self):
+        """like `get_fields` in the Instances object,
+        but returns each field in its tensor representation"""
+ ret = {}
+ for k, v in self.batch_extra_fields.items():
+ # if isinstance(v, torch.Tensor):
+ # tensor_rep = v
+ # elif isinstance(v, (Boxes, Keypoints)):
+ # tensor_rep = v.tensor
+ # else:
+ # raise ValueError("Can't find tensor representation for: {}".format())
+ ret[k] = v
+ return ret
+
+ def has(self, name):
+ return name in self.batch_extra_fields
+
+ def set(self, name, value):
+ # len(tensor) is a bad practice that generates ONNX constants during tracing.
+        # Although not a problem for the `assert` statement below, the torch ONNX exporter
+        # still raises a misleading warning, as it does not know this call comes from `assert`.
+ if isinstance(value, Boxes):
+ data_len = value.tensor.shape[0]
+ elif isinstance(value, torch.Tensor):
+ data_len = value.shape[0]
+ else:
+ data_len = len(value)
+ if len(self.batch_extra_fields):
+ assert (
+ len(self) == data_len
+            ), "Adding a field of length {} to an Instances of length {}".format(data_len, len(self))
+ self.batch_extra_fields[name] = value
+
+ def __getattr__(self, name):
+ if name not in self.batch_extra_fields:
+ raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
+ return self.batch_extra_fields[name]
+
+ def __len__(self):
+ return len(self.indices)
+
+ def flatten(self):
+ ret = []
+ for _, v in self.batch_extra_fields.items():
+ if isinstance(v, (Boxes, Keypoints)):
+ ret.append(v.tensor)
+ else:
+ ret.append(v)
+ return ret
+
+ @staticmethod
+ def to_d2_instances_list(instances_list):
+ """
+        Convert InstancesList to List[Instances]. The input `instances_list` can
+        also be a List[Instances], in which case this method is a no-op.
+ """
+ if not isinstance(instances_list, InstancesList):
+ assert all(isinstance(x, Instances) for x in instances_list)
+ return instances_list
+
+ ret = []
+ for i, info in enumerate(instances_list.im_info):
+ instances = Instances(torch.Size([int(info[0].item()), int(info[1].item())]))
+
+ ids = instances_list.indices == i
+ for k, v in instances_list.batch_extra_fields.items():
+ if isinstance(v, torch.Tensor):
+ instances.set(k, v[ids])
+ continue
+ elif isinstance(v, Boxes):
+ instances.set(k, v[ids, -4:])
+ continue
+
+ target_type, tensor_source = v
+ assert isinstance(tensor_source, torch.Tensor)
+ assert tensor_source.shape[0] == instances_list.indices.shape[0]
+ tensor_source = tensor_source[ids]
+
+ if issubclass(target_type, Boxes):
+ instances.set(k, Boxes(tensor_source[:, -4:]))
+ elif issubclass(target_type, Keypoints):
+ instances.set(k, Keypoints(tensor_source))
+ elif issubclass(target_type, torch.Tensor):
+ instances.set(k, tensor_source)
+ else:
+                    raise ValueError("Can't handle target type: {}".format(target_type))
+
+ ret.append(instances)
+ return ret
+
+
+class Caffe2Compatible(object):
+ """
+ A model can inherit this class to indicate that it can be traced and deployed with caffe2.
+ """
+
+ def _get_tensor_mode(self):
+ return self._tensor_mode
+
+ def _set_tensor_mode(self, v):
+ self._tensor_mode = v
+
+ tensor_mode = property(_get_tensor_mode, _set_tensor_mode)
+ """
+    If true, the model expects C2-style tensor-only input/output format.
+ """
+
+
+class Caffe2RPN(Caffe2Compatible, rpn.RPN):
+ @classmethod
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+ ret = super(Caffe2Compatible, cls).from_config(cfg, input_shape)
+ assert tuple(cfg.MODEL.RPN.BBOX_REG_WEIGHTS) == (1.0, 1.0, 1.0, 1.0) or tuple(
+ cfg.MODEL.RPN.BBOX_REG_WEIGHTS
+ ) == (1.0, 1.0, 1.0, 1.0, 1.0)
+ return ret
+
+ def _generate_proposals(
+ self, images, objectness_logits_pred, anchor_deltas_pred, gt_instances=None
+ ):
+ assert isinstance(images, ImageList)
+ if self.tensor_mode:
+ im_info = images.image_sizes
+ else:
+ im_info = torch.tensor([[im_sz[0], im_sz[1], 1.0] for im_sz in images.image_sizes]).to(
+ images.tensor.device
+ )
+ assert isinstance(im_info, torch.Tensor)
+
+ rpn_rois_list = []
+ rpn_roi_probs_list = []
+ for scores, bbox_deltas, cell_anchors_tensor, feat_stride in zip(
+ objectness_logits_pred,
+ anchor_deltas_pred,
+ [b for (n, b) in self.anchor_generator.cell_anchors.named_buffers()],
+ self.anchor_generator.strides,
+ ):
+ scores = scores.detach()
+ bbox_deltas = bbox_deltas.detach()
+
+ rpn_rois, rpn_roi_probs = torch.ops._caffe2.GenerateProposals(
+ scores,
+ bbox_deltas,
+ im_info,
+ cell_anchors_tensor,
+ spatial_scale=1.0 / feat_stride,
+ pre_nms_topN=self.pre_nms_topk[self.training],
+ post_nms_topN=self.post_nms_topk[self.training],
+ nms_thresh=self.nms_thresh,
+ min_size=self.min_box_size,
+ # correct_transform_coords=True, # deprecated argument
+ angle_bound_on=True, # Default
+ angle_bound_lo=-180,
+ angle_bound_hi=180,
+ clip_angle_thresh=1.0, # Default
+ legacy_plus_one=False,
+ )
+ rpn_rois_list.append(rpn_rois)
+ rpn_roi_probs_list.append(rpn_roi_probs)
+
+        # For FPN in D2, in the RPN all proposals from different levels are concatenated
+        # together, ranked, and picked by top post_nms_topk. Then in ROIPooler
+ # it calculates level_assignments and calls the RoIAlign from
+ # the corresponding level.
+
+ if len(objectness_logits_pred) == 1:
+ rpn_rois = rpn_rois_list[0]
+ rpn_roi_probs = rpn_roi_probs_list[0]
+ else:
+ assert len(rpn_rois_list) == len(rpn_roi_probs_list)
+ rpn_post_nms_topN = self.post_nms_topk[self.training]
+
+ device = rpn_rois_list[0].device
+ input_list = [to_device(x, "cpu") for x in (rpn_rois_list + rpn_roi_probs_list)]
+
+ # TODO remove this after confirming rpn_max_level/rpn_min_level
+ # is not needed in CollectRpnProposals.
+ feature_strides = list(self.anchor_generator.strides)
+ rpn_min_level = int(math.log2(feature_strides[0]))
+ rpn_max_level = int(math.log2(feature_strides[-1]))
+ assert (rpn_max_level - rpn_min_level + 1) == len(
+ rpn_rois_list
+ ), "CollectRpnProposals requires continuous levels"
+
+ rpn_rois = torch.ops._caffe2.CollectRpnProposals(
+ input_list,
+                # NOTE: in the current implementation, rpn_max_level and rpn_min_level
+                # are not needed; only the difference between the two matters, and it
+                # can be inferred from the number of inputs. Keep them for now for
+                # consistency.
+ rpn_max_level=2 + len(rpn_rois_list) - 1,
+ rpn_min_level=2,
+ rpn_post_nms_topN=rpn_post_nms_topN,
+ )
+ rpn_rois = to_device(rpn_rois, device)
+ rpn_roi_probs = []
+
+ proposals = self.c2_postprocess(im_info, rpn_rois, rpn_roi_probs, self.tensor_mode)
+ return proposals, {}
+
+ def forward(self, images, features, gt_instances=None):
+ assert not self.training
+ features = [features[f] for f in self.in_features]
+ objectness_logits_pred, anchor_deltas_pred = self.rpn_head(features)
+ return self._generate_proposals(
+ images,
+ objectness_logits_pred,
+ anchor_deltas_pred,
+ gt_instances,
+ )
+
+ @staticmethod
+ def c2_postprocess(im_info, rpn_rois, rpn_roi_probs, tensor_mode):
+ proposals = InstancesList(
+ im_info=im_info,
+ indices=rpn_rois[:, 0],
+ extra_fields={
+ "proposal_boxes": Caffe2Boxes(rpn_rois),
+ "objectness_logits": (torch.Tensor, rpn_roi_probs),
+ },
+ )
+ if not tensor_mode:
+ proposals = InstancesList.to_d2_instances_list(proposals)
+ else:
+ proposals = [proposals]
+ return proposals
+
+
+class Caffe2ROIPooler(Caffe2Compatible, poolers.ROIPooler):
+ @staticmethod
+ def c2_preprocess(box_lists):
+ assert all(isinstance(x, Boxes) for x in box_lists)
+ if all(isinstance(x, Caffe2Boxes) for x in box_lists):
+ # input is pure-tensor based
+ assert len(box_lists) == 1
+ pooler_fmt_boxes = box_lists[0].tensor
+ else:
+ pooler_fmt_boxes = poolers.convert_boxes_to_pooler_format(box_lists)
+ return pooler_fmt_boxes
+
+ def forward(self, x, box_lists):
+ assert not self.training
+
+ pooler_fmt_boxes = self.c2_preprocess(box_lists)
+ num_level_assignments = len(self.level_poolers)
+
+ if num_level_assignments == 1:
+ if isinstance(self.level_poolers[0], ROIAlignRotated):
+ c2_roi_align = torch.ops._caffe2.RoIAlignRotated
+ aligned = True
+ else:
+ c2_roi_align = torch.ops._caffe2.RoIAlign
+ aligned = self.level_poolers[0].aligned
+
+ x0 = x[0]
+ if x0.is_quantized:
+ x0 = x0.dequantize()
+
+ out = c2_roi_align(
+ x0,
+ pooler_fmt_boxes,
+ order="NCHW",
+ spatial_scale=float(self.level_poolers[0].spatial_scale),
+ pooled_h=int(self.output_size[0]),
+ pooled_w=int(self.output_size[1]),
+ sampling_ratio=int(self.level_poolers[0].sampling_ratio),
+ aligned=aligned,
+ )
+ return out
+
+ device = pooler_fmt_boxes.device
+ assert (
+ self.max_level - self.min_level + 1 == 4
+ ), "Currently DistributeFpnProposals only support 4 levels"
+ fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
+ to_device(pooler_fmt_boxes, "cpu"),
+ roi_canonical_scale=self.canonical_box_size,
+ roi_canonical_level=self.canonical_level,
+ roi_max_level=self.max_level,
+ roi_min_level=self.min_level,
+ legacy_plus_one=False,
+ )
+ fpn_outputs = [to_device(x, device) for x in fpn_outputs]
+
+ rois_fpn_list = fpn_outputs[:-1]
+ rois_idx_restore_int32 = fpn_outputs[-1]
+
+ roi_feat_fpn_list = []
+ for roi_fpn, x_level, pooler in zip(rois_fpn_list, x, self.level_poolers):
+ if isinstance(pooler, ROIAlignRotated):
+ c2_roi_align = torch.ops._caffe2.RoIAlignRotated
+ aligned = True
+ else:
+ c2_roi_align = torch.ops._caffe2.RoIAlign
+ aligned = bool(pooler.aligned)
+
+ if x_level.is_quantized:
+ x_level = x_level.dequantize()
+
+ roi_feat_fpn = c2_roi_align(
+ x_level,
+ roi_fpn,
+ order="NCHW",
+ spatial_scale=float(pooler.spatial_scale),
+ pooled_h=int(self.output_size[0]),
+ pooled_w=int(self.output_size[1]),
+ sampling_ratio=int(pooler.sampling_ratio),
+ aligned=aligned,
+ )
+ roi_feat_fpn_list.append(roi_feat_fpn)
+
+ roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0)
+ assert roi_feat_shuffled.numel() > 0 and rois_idx_restore_int32.numel() > 0, (
+ "Caffe2 export requires tracing with a model checkpoint + input that can produce valid"
+ " detections. But no detections were obtained with the given checkpoint and input!"
+ )
+ roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled, rois_idx_restore_int32)
+ return roi_feat
+
+
+class Caffe2FastRCNNOutputsInference:
+ def __init__(self, tensor_mode):
+ self.tensor_mode = tensor_mode # whether the output is caffe2 tensor mode
+
+ def __call__(self, box_predictor, predictions, proposals):
+ """equivalent to FastRCNNOutputLayers.inference"""
+ num_classes = box_predictor.num_classes
+ score_thresh = box_predictor.test_score_thresh
+ nms_thresh = box_predictor.test_nms_thresh
+ topk_per_image = box_predictor.test_topk_per_image
+ is_rotated = len(box_predictor.box2box_transform.weights) == 5
+
+ if is_rotated:
+ box_dim = 5
+ assert box_predictor.box2box_transform.weights[4] == 1, (
+ "The weights for Rotated BBoxTransform in C2 have only 4 dimensions,"
+ + " thus enforcing the angle weight to be 1 for now"
+ )
+ box2box_transform_weights = box_predictor.box2box_transform.weights[:4]
+ else:
+ box_dim = 4
+ box2box_transform_weights = box_predictor.box2box_transform.weights
+
+ class_logits, box_regression = predictions
+ if num_classes + 1 == class_logits.shape[1]:
+ class_prob = F.softmax(class_logits, -1)
+ else:
+ assert num_classes == class_logits.shape[1]
+ class_prob = F.sigmoid(class_logits)
+ # BoxWithNMSLimit will infer num_classes from the shape of the class_prob
+ # So append a zero column as placeholder for the background class
+ class_prob = torch.cat((class_prob, torch.zeros(class_prob.shape[0], 1)), dim=1)
+
+ assert box_regression.shape[1] % box_dim == 0
+ cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1
+
+ input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1
+
+ proposal_boxes = proposals[0].proposal_boxes
+ if isinstance(proposal_boxes, Caffe2Boxes):
+ rois = Caffe2Boxes.cat([p.proposal_boxes for p in proposals])
+ elif isinstance(proposal_boxes, RotatedBoxes):
+ rois = RotatedBoxes.cat([p.proposal_boxes for p in proposals])
+ elif isinstance(proposal_boxes, Boxes):
+ rois = Boxes.cat([p.proposal_boxes for p in proposals])
+ else:
+ raise NotImplementedError(
+ 'Expected proposals[0].proposal_boxes to be type "Boxes", '
+ f"instead got {type(proposal_boxes)}"
+ )
+
+ device, dtype = rois.tensor.device, rois.tensor.dtype
+ if input_tensor_mode:
+ im_info = proposals[0].image_size
+ rois = rois.tensor
+ else:
+ im_info = torch.tensor(
+ [[sz[0], sz[1], 1.0] for sz in [x.image_size for x in proposals]]
+ )
+ batch_ids = cat(
+ [
+ torch.full((b, 1), i, dtype=dtype, device=device)
+ for i, b in enumerate(len(p) for p in proposals)
+ ],
+ dim=0,
+ )
+ rois = torch.cat([batch_ids, rois.tensor], dim=1)
+
+ roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform(
+ to_device(rois, "cpu"),
+ to_device(box_regression, "cpu"),
+ to_device(im_info, "cpu"),
+ weights=box2box_transform_weights,
+ apply_scale=True,
+ rotated=is_rotated,
+ angle_bound_on=True,
+ angle_bound_lo=-180,
+ angle_bound_hi=180,
+ clip_angle_thresh=1.0,
+ legacy_plus_one=False,
+ )
+ roi_pred_bbox = to_device(roi_pred_bbox, device)
+ roi_batch_splits = to_device(roi_batch_splits, device)
+
+ nms_outputs = torch.ops._caffe2.BoxWithNMSLimit(
+ to_device(class_prob, "cpu"),
+ to_device(roi_pred_bbox, "cpu"),
+ to_device(roi_batch_splits, "cpu"),
+ score_thresh=float(score_thresh),
+ nms=float(nms_thresh),
+ detections_per_im=int(topk_per_image),
+ soft_nms_enabled=False,
+ soft_nms_method="linear",
+ soft_nms_sigma=0.5,
+ soft_nms_min_score_thres=0.001,
+ rotated=is_rotated,
+ cls_agnostic_bbox_reg=cls_agnostic_bbox_reg,
+ input_boxes_include_bg_cls=False,
+ output_classes_include_bg_cls=False,
+ legacy_plus_one=False,
+ )
+ roi_score_nms = to_device(nms_outputs[0], device)
+ roi_bbox_nms = to_device(nms_outputs[1], device)
+ roi_class_nms = to_device(nms_outputs[2], device)
+ roi_batch_splits_nms = to_device(nms_outputs[3], device)
+ roi_keeps_nms = to_device(nms_outputs[4], device)
+ roi_keeps_size_nms = to_device(nms_outputs[5], device)
+ if not self.tensor_mode:
+ roi_class_nms = roi_class_nms.to(torch.int64)
+
+ roi_batch_ids = cat(
+ [
+ torch.full((b, 1), i, dtype=dtype, device=device)
+ for i, b in enumerate(int(x.item()) for x in roi_batch_splits_nms)
+ ],
+ dim=0,
+ )
+
+ roi_class_nms = alias(roi_class_nms, "class_nms")
+ roi_score_nms = alias(roi_score_nms, "score_nms")
+ roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms")
+ roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms")
+ roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms")
+ roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms")
+
+ results = InstancesList(
+ im_info=im_info,
+ indices=roi_batch_ids[:, 0],
+ extra_fields={
+ "pred_boxes": Caffe2Boxes(roi_bbox_nms),
+ "scores": roi_score_nms,
+ "pred_classes": roi_class_nms,
+ },
+ )
+
+ if not self.tensor_mode:
+ results = InstancesList.to_d2_instances_list(results)
+ batch_splits = roi_batch_splits_nms.int().tolist()
+ kept_indices = list(roi_keeps_nms.to(torch.int64).split(batch_splits))
+ else:
+ results = [results]
+ kept_indices = [roi_keeps_nms]
+
+ return results, kept_indices
+
+
+class Caffe2MaskRCNNInference:
+ def __call__(self, pred_mask_logits, pred_instances):
+ """equivalent to mask_head.mask_rcnn_inference"""
+ if all(isinstance(x, InstancesList) for x in pred_instances):
+ assert len(pred_instances) == 1
+ mask_probs_pred = pred_mask_logits.sigmoid()
+ mask_probs_pred = alias(mask_probs_pred, "mask_fcn_probs")
+ pred_instances[0].set("pred_masks", mask_probs_pred)
+ else:
+ mask_rcnn_inference(pred_mask_logits, pred_instances)
+
+
+class Caffe2KeypointRCNNInference:
+ def __init__(self, use_heatmap_max_keypoint):
+ self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
+
+ def __call__(self, pred_keypoint_logits, pred_instances):
+        # just return the keypoint heatmap for now,
+        # there will be an option to call HeatmapMaxKeypointOp
+ output = alias(pred_keypoint_logits, "kps_score")
+ if all(isinstance(x, InstancesList) for x in pred_instances):
+ assert len(pred_instances) == 1
+ if self.use_heatmap_max_keypoint:
+ device = output.device
+ output = torch.ops._caffe2.HeatmapMaxKeypoint(
+ to_device(output, "cpu"),
+ pred_instances[0].pred_boxes.tensor,
+                    should_output_softmax=True,  # worth making it configurable?
+ )
+ output = to_device(output, device)
+ output = alias(output, "keypoints_out")
+ pred_instances[0].set("pred_keypoints", output)
+ return pred_keypoint_logits
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_export.py b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..df54131a76ef94fa9a09a9c94565fc2cb6f73492
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_export.py
@@ -0,0 +1,203 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import copy
+import io
+import logging
+import numpy as np
+from typing import List
+import onnx
+import onnx.optimizer
+import torch
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core
+from caffe2.python.onnx.backend import Caffe2Backend
+from tabulate import tabulate
+from termcolor import colored
+from torch.onnx import OperatorExportTypes
+
+from .shared import (
+ ScopedWS,
+ construct_init_net_from_params,
+ fuse_alias_placeholder,
+ fuse_copy_between_cpu_and_gpu,
+ get_params_from_init_net,
+ group_norm_replace_aten_with_caffe2,
+ infer_device_type,
+ remove_dead_end_ops,
+ remove_reshape_for_fc,
+ save_graph,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def export_onnx_model(model, inputs):
+ """
+ Trace and export a model to onnx format.
+
+ Args:
+ model (nn.Module):
+ inputs (tuple[args]): the model will be called by `model(*inputs)`
+
+ Returns:
+ an onnx model
+ """
+ assert isinstance(model, torch.nn.Module)
+
+ # make sure all modules are in eval mode, onnx may change the training state
+ # of the module if the states are not consistent
+ def _check_eval(module):
+ assert not module.training
+
+ model.apply(_check_eval)
+
+ # Export the model to ONNX
+ with torch.no_grad():
+ with io.BytesIO() as f:
+ torch.onnx.export(
+ model,
+ inputs,
+ f,
+ operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
+ # verbose=True, # NOTE: uncomment this for debugging
+ # export_params=True,
+ )
+ onnx_model = onnx.load_from_string(f.getvalue())
+
+ return onnx_model
+
+
+def _op_stats(net_def):
+ type_count = {}
+ for t in [op.type for op in net_def.op]:
+ type_count[t] = type_count.get(t, 0) + 1
+ type_count_list = sorted(type_count.items(), key=lambda kv: kv[0]) # alphabet
+ type_count_list = sorted(type_count_list, key=lambda kv: -kv[1]) # count
+ return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list)
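+
+# Example output of _op_stats (counts are hypothetical):
+#   12x Conv
+#    3x Relu
+#    1x RoIAlign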
+
+
+def _assign_device_option(
+ predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor]
+):
+ """
+    The ONNX-exported network doesn't have a concept of device; assign the necessary
+    device option to each op in order to make it runnable on a GPU runtime.
+ """
+
+ def _get_device_type(torch_tensor):
+ assert torch_tensor.device.type in ["cpu", "cuda"]
+ assert torch_tensor.device.index == 0
+ return torch_tensor.device.type
+
+ def _assign_op_device_option(net_proto, net_ssa, blob_device_types):
+ for op, ssa_i in zip(net_proto.op, net_ssa):
+ if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]:
+ op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
+ else:
+ devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]]
+ assert all(d == devices[0] for d in devices)
+ if devices[0] == "cuda":
+ op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
+
+ # update ops in predict_net
+ predict_net_input_device_types = {
+ (name, 0): _get_device_type(tensor)
+ for name, tensor in zip(predict_net.external_input, tensor_inputs)
+ }
+ predict_net_device_types = infer_device_type(
+ predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch"
+ )
+ predict_net_ssa, _ = core.get_ssa(predict_net)
+ _assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types)
+
+ # update ops in init_net
+ init_net_ssa, versions = core.get_ssa(init_net)
+ init_net_output_device_types = {
+ (name, versions[name]): predict_net_device_types[(name, 0)]
+ for name in init_net.external_output
+ }
+ init_net_device_types = infer_device_type(
+ init_net, known_status=init_net_output_device_types, device_name_style="pytorch"
+ )
+ _assign_op_device_option(init_net, init_net_ssa, init_net_device_types)
+
+
+def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]):
+ """
+ Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX.
+
+    Args:
+        model: a caffe2-compatible version of a detectron2 model, defined in caffe2_modeling.py
+        tensor_inputs: a list of tensors that the caffe2 model takes as input.
+ """
+ model = copy.deepcopy(model)
+ assert isinstance(model, torch.nn.Module)
+ assert hasattr(model, "encode_additional_info")
+
+ # Export via ONNX
+ logger.info(
+ "Exporting a {} model via ONNX ...".format(type(model).__name__)
+        + " Some warnings from ONNX are expected and are usually nothing to worry about."
+ )
+ onnx_model = export_onnx_model(model, (tensor_inputs,))
+ # Convert ONNX model to Caffe2 protobuf
+ init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
+ ops_table = [[op.type, op.input, op.output] for op in predict_net.op]
+ table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe")
+ logger.info(
+ "ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan")
+ )
+
+ # Apply protobuf optimization
+ fuse_alias_placeholder(predict_net, init_net)
+ if any(t.device.type != "cpu" for t in tensor_inputs):
+ fuse_copy_between_cpu_and_gpu(predict_net)
+ remove_dead_end_ops(init_net)
+ _assign_device_option(predict_net, init_net, tensor_inputs)
+ params, device_options = get_params_from_init_net(init_net)
+ predict_net, params = remove_reshape_for_fc(predict_net, params)
+ init_net = construct_init_net_from_params(params, device_options)
+ group_norm_replace_aten_with_caffe2(predict_net)
+
+ # Record necessary information for running the pb model in Detectron2 system.
+ model.encode_additional_info(predict_net, init_net)
+
+ logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net)))
+ logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net)))
+
+ return predict_net, init_net
+
+
+def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path):
+ """
+    Run the caffe2 model on the given inputs, recording tensor shapes and drawing the graph.
+
+ predict_net/init_net: caffe2 model.
+ tensor_inputs: a list of tensors that caffe2 model takes as input.
+ graph_save_path: path for saving graph of exported model.
+ """
+
+ logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path))
+ save_graph(predict_net, graph_save_path, op_only=False)
+
+ # Run the exported Caffe2 net
+ logger.info("Running ONNX exported model ...")
+ with ScopedWS("__ws_tmp__", True) as ws:
+ ws.RunNetOnce(init_net)
+ initialized_blobs = set(ws.Blobs())
+ uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs]
+ for name, blob in zip(uninitialized, tensor_inputs):
+ ws.FeedBlob(name, blob)
+
+ try:
+ ws.RunNetOnce(predict_net)
+ except RuntimeError as e:
+ logger.warning("Encountered RuntimeError: \n{}".format(str(e)))
+
+ ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()}
+ blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)}
+
+ logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path))
+ save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes)
+
+ return ws_blobs
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_inference.py b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc528dab6091e6f7706d09c97b488becfb69ac69
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_inference.py
@@ -0,0 +1,161 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+import numpy as np
+from itertools import count
+import torch
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core
+
+from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
+from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type
+
+logger = logging.getLogger(__name__)
+
+
+# ===== ref: mobile-vision predictor's 'Caffe2Wrapper' class ======
+class ProtobufModel(torch.nn.Module):
+ """
+    Wrapper of a caffe2 protobuf model.
+    It works just like nn.Module, but runs caffe2 under the hood.
+ Input/Output are tuple[tensor] that match the caffe2 net's external_input/output.
+ """
+
+ _ids = count(0)
+
+ def __init__(self, predict_net, init_net):
+ logger.info(f"Initializing ProtobufModel for: {predict_net.name} ...")
+ super().__init__()
+ assert isinstance(predict_net, caffe2_pb2.NetDef)
+ assert isinstance(init_net, caffe2_pb2.NetDef)
+ # create unique temporary workspace for each instance
+ self.ws_name = "__tmp_ProtobufModel_{}__".format(next(self._ids))
+ self.net = core.Net(predict_net)
+
+ logger.info("Running init_net once to fill the parameters ...")
+ with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws:
+ ws.RunNetOnce(init_net)
+ uninitialized_external_input = []
+ for blob in self.net.Proto().external_input:
+ if blob not in ws.Blobs():
+ uninitialized_external_input.append(blob)
+ ws.CreateBlob(blob)
+ ws.CreateNet(self.net)
+
+ self._error_msgs = set()
+ self._input_blobs = uninitialized_external_input
+
+ def _infer_output_devices(self, inputs):
+ """
+ Returns:
+ list[str]: list of device for each external output
+ """
+
+ def _get_device_type(torch_tensor):
+ assert torch_tensor.device.type in ["cpu", "cuda"]
+ assert torch_tensor.device.index == 0
+ return torch_tensor.device.type
+
+ predict_net = self.net.Proto()
+ input_device_types = {
+ (name, 0): _get_device_type(tensor) for name, tensor in zip(self._input_blobs, inputs)
+ }
+ device_type_map = infer_device_type(
+ predict_net, known_status=input_device_types, device_name_style="pytorch"
+ )
+ ssa, versions = core.get_ssa(predict_net)
+ versioned_outputs = [(name, versions[name]) for name in predict_net.external_output]
+ output_devices = [device_type_map[outp] for outp in versioned_outputs]
+ return output_devices
+
+ def forward(self, inputs):
+ """
+ Args:
+ inputs (tuple[torch.Tensor])
+
+ Returns:
+ tuple[torch.Tensor]
+ """
+ assert len(inputs) == len(self._input_blobs), (
+ f"Length of inputs ({len(inputs)}) "
+ f"doesn't match the required input blobs: {self._input_blobs}"
+ )
+
+ with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws:
+ for b, tensor in zip(self._input_blobs, inputs):
+ ws.FeedBlob(b, tensor)
+
+ try:
+ ws.RunNet(self.net.Proto().name)
+ except RuntimeError as e:
+ if not str(e) in self._error_msgs:
+ self._error_msgs.add(str(e))
+ logger.warning("Encountered new RuntimeError: \n{}".format(str(e)))
+ logger.warning("Catch the error and use partial results.")
+
+ c2_outputs = [ws.FetchBlob(b) for b in self.net.Proto().external_output]
+            # Remove outputs of the current run; this is necessary in order to
+            # prevent fetching results from a previous run if the model fails
+            # in the middle.
+ for b in self.net.Proto().external_output:
+                # Need to create an uninitialized blob to keep the net runnable.
+                # This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b),
+                # but there's no such API.
+ ws.FeedBlob(b, f"{b}, a C++ native class of type nullptr (uninitialized).")
+
+ # Cast output to torch.Tensor on the desired device
+ output_devices = (
+ self._infer_output_devices(inputs)
+ if any(t.device.type != "cpu" for t in inputs)
+ else ["cpu" for _ in self.net.Proto().external_output]
+ )
+
+ outputs = []
+ for name, c2_output, device in zip(
+ self.net.Proto().external_output, c2_outputs, output_devices
+ ):
+ if not isinstance(c2_output, np.ndarray):
+ raise RuntimeError(
+ "Invalid output for blob {}, received: {}".format(name, c2_output)
+ )
+ outputs.append(torch.tensor(c2_output).to(device=device))
+ return tuple(outputs)
+
+
+class ProtobufDetectionModel(torch.nn.Module):
+ """
+    A class that works just like a pytorch meta arch in terms of inference, but runs a
+    caffe2 model under the hood.
+ """
+
+ def __init__(self, predict_net, init_net, *, convert_outputs=None):
+ """
+ Args:
+ predict_net, init_net (core.Net): caffe2 nets
+            convert_outputs (callable): a function that converts caffe2
+                outputs to the same format as the original pytorch model.
+                By default, use the one defined in the caffe2 meta_arch.
+ """
+ super().__init__()
+ self.protobuf_model = ProtobufModel(predict_net, init_net)
+ self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0)
+ self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii")
+
+ if convert_outputs is None:
+ meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN")
+ meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")]
+ self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net)
+ else:
+ self._convert_outputs = convert_outputs
+
+ def _convert_inputs(self, batched_inputs):
+ # currently all models convert inputs in the same way
+ return convert_batched_inputs_to_c2_format(
+ batched_inputs, self.size_divisibility, self.device
+ )
+
+ def forward(self, batched_inputs):
+ c2_inputs = self._convert_inputs(batched_inputs)
+ c2_results = self.protobuf_model(c2_inputs)
+ c2_results = dict(zip(self.protobuf_model.net.Proto().external_output, c2_results))
+ return self._convert_outputs(batched_inputs, c2_inputs, c2_results)
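+
+
+# Usage sketch (illustrative): this is the wrapper that Caffe2Model.__call__ in
+# api.py builds lazily, e.g.
+#
+#   model = ProtobufDetectionModel(predict_net, init_net)
+#   outputs = model([{"image": img_tensor_CHW}])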
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_modeling.py b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_modeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..932876c3f47811aa2a4caf9c1c67fa367907787b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_modeling.py
@@ -0,0 +1,419 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import functools
+import io
+import struct
+import types
+import torch
+
+from custom_detectron2.modeling import meta_arch
+from custom_detectron2.modeling.box_regression import Box2BoxTransform
+from custom_detectron2.modeling.roi_heads import keypoint_head
+from custom_detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+
+from .c10 import Caffe2Compatible
+from .caffe2_patch import ROIHeadsPatcher, patch_generalized_rcnn
+from .shared import (
+ alias,
+ check_set_pb_arg,
+ get_pb_arg_floats,
+ get_pb_arg_valf,
+ get_pb_arg_vali,
+ get_pb_arg_vals,
+ mock_torch_nn_functional_interpolate,
+)
+
+
+def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
+ """
+    A function to assemble a caffe2 model's outputs (i.e. Dict[str, Tensor])
+    into detectron2's format (i.e. a list of Instances instances).
+    This only works when the model follows the Caffe2 Detectron naming convention.
+
+ Args:
+ image_sizes (List[List[int, int]]): [H, W] of every image.
+ tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
+
+        force_mask_on (Bool): if true, it makes sure there will be pred_masks even
+            if the mask is not found in tensor_outputs (usually due to a model crash)
+ """
+
+ results = [Instances(image_size) for image_size in image_sizes]
+
+ batch_splits = tensor_outputs.get("batch_splits", None)
+ if batch_splits:
+ raise NotImplementedError()
+ assert len(image_sizes) == 1
+ result = results[0]
+
+ bbox_nms = tensor_outputs["bbox_nms"]
+ score_nms = tensor_outputs["score_nms"]
+ class_nms = tensor_outputs["class_nms"]
+    # Detection will always succeed because Conv supports 0-batch
+ assert bbox_nms is not None
+ assert score_nms is not None
+ assert class_nms is not None
+ if bbox_nms.shape[1] == 5:
+ result.pred_boxes = RotatedBoxes(bbox_nms)
+ else:
+ result.pred_boxes = Boxes(bbox_nms)
+ result.scores = score_nms
+ result.pred_classes = class_nms.to(torch.int64)
+
+ mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
+ if mask_fcn_probs is not None:
+ # finish the mask pred
+ mask_probs_pred = mask_fcn_probs
+ num_masks = mask_probs_pred.shape[0]
+ class_pred = result.pred_classes
+ indices = torch.arange(num_masks, device=class_pred.device)
+ mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
+ result.pred_masks = mask_probs_pred
+ elif force_mask_on:
+ # NOTE: there's no way to know the height/width of mask here, it won't be
+ # used anyway when batch size is 0, so just set them to 0.
+ result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)
+
+ keypoints_out = tensor_outputs.get("keypoints_out", None)
+ kps_score = tensor_outputs.get("kps_score", None)
+ if keypoints_out is not None:
+        # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
+ keypoints_tensor = keypoints_out
+        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
+        # is set to False in HeatmapMaxKeypoint, so just use the raw score; it doesn't
+        # seem to affect mAP. TODO: check more carefully.
+ keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
+ result.pred_keypoints = keypoint_xyp
+ elif kps_score is not None:
+ # keypoint heatmap to sparse data structure
+ pred_keypoint_logits = kps_score
+ keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])
+
+ return results
+
+
+def _cast_to_f32(f64):
+ return struct.unpack("f", struct.pack("f", f64))[0]
+
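+# e.g. _cast_to_f32(0.1) returns roughly 0.10000000149, the nearest float32 value,
+# so the "f"/"floats" protobuf args written below hold float32-precision numbers.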
+
+def set_caffe2_compatible_tensor_mode(model, enable=True):
+ def _fn(m):
+ if isinstance(m, Caffe2Compatible):
+ m.tensor_mode = enable
+
+ model.apply(_fn)
+
+
+def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
+ """
+ See get_caffe2_inputs() below.
+ """
+ assert all(isinstance(x, dict) for x in batched_inputs)
+ assert all(x["image"].dim() == 3 for x in batched_inputs)
+
+ images = [x["image"] for x in batched_inputs]
+ images = ImageList.from_tensors(images, size_divisibility)
+
+ im_info = []
+ for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
+ target_height = input_per_image.get("height", image_size[0])
+ target_width = input_per_image.get("width", image_size[1]) # noqa
+ # NOTE: The scale inside im_info is kept as convention and for providing
+ # post-processing information if further processing is needed. For
+ # current Caffe2 model definitions that don't include post-processing inside
+ # the model, this number is not used.
+        # NOTE: There can be a slight difference between width and height
+        # scales; using a single number can result in numerical differences
+        # compared with D2's post-processing.
+ scale = target_height / image_size[0]
+ im_info.append([image_size[0], image_size[1], scale])
+ im_info = torch.Tensor(im_info)
+
+ return images.tensor.to(device), im_info.to(device)
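+
+
+# Illustrative shapes (hypothetical inputs): for two 3x480x640 images and
+# size_divisibility=32, this returns a (2, 3, 480, 640) "data" tensor and a
+# (2, 3) "im_info" tensor whose rows are (height, width, scale).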
+
+
+class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module):
+ """
+ Base class for caffe2-compatible implementation of a meta architecture.
+    The forward is traceable and its traced graph can be converted to a caffe2
+    graph through ONNX.
+ """
+
+ def __init__(self, cfg, torch_model):
+ """
+ Args:
+ cfg (CfgNode):
+ torch_model (nn.Module): the detectron2 model (meta_arch) to be
+ converted.
+ """
+ super().__init__()
+ self._wrapped_model = torch_model
+ self.eval()
+ set_caffe2_compatible_tensor_mode(self, True)
+
+ def get_caffe2_inputs(self, batched_inputs):
+ """
+ Convert pytorch-style structured inputs to caffe2-style inputs that
+ are tuples of tensors.
+
+ Args:
+ batched_inputs (list[dict]): inputs to a detectron2 model
+ in its standard format. Each dict has "image" (CHW tensor), and optionally
+ "height" and "width".
+
+ Returns:
+ tuple[Tensor]:
+ tuple of tensors that will be the inputs to the
+ :meth:`forward` method. For existing models, the first
+ is an NCHW tensor (padded and batched); the second is
+                an im_info Nx3 tensor, where the rows are
+ (height, width, unused legacy parameter)
+ """
+ return convert_batched_inputs_to_c2_format(
+ batched_inputs,
+ self._wrapped_model.backbone.size_divisibility,
+ self._wrapped_model.device,
+ )
+
+ def encode_additional_info(self, predict_net, init_net):
+ """
+ Save extra metadata that will be used by inference in the output protobuf.
+ """
+ pass
+
+ def forward(self, inputs):
+ """
+ Run the forward in caffe2-style. It has to use caffe2-compatible ops
+ and the method will be used for tracing.
+
+ Args:
+ inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_input`.
+ They will be the inputs of the converted caffe2 graph.
+
+ Returns:
+ tuple[Tensor]: output tensors. They will be the outputs of the
+ converted caffe2 graph.
+ """
+ raise NotImplementedError
+
+ def _caffe2_preprocess_image(self, inputs):
+ """
+ Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward.
+ It normalizes the input images, and the final caffe2 graph assumes the
+ inputs have been batched already.
+ """
+ data, im_info = inputs
+ data = alias(data, "data")
+ im_info = alias(im_info, "im_info")
+ mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std
+ normalized_data = (data - mean) / std
+ normalized_data = alias(normalized_data, "normalized_data")
+
+ # Pack (data, im_info) into ImageList which is recognized by self.inference.
+ images = ImageList(tensor=normalized_data, image_sizes=im_info)
+ return images
+
+ @staticmethod
+ def get_outputs_converter(predict_net, init_net):
+ """
+ Creates a function that converts outputs of the caffe2 model to
+ detectron2's standard format.
+ The function uses information in `predict_net` and `init_net` that are
+        available at inference time. Therefore the function logic can be used in inference.
+
+ The returned function has the following signature:
+
+ def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs
+
+ Where
+
+ * batched_inputs (list[dict]): the original input format of the meta arch
+ * c2_inputs (tuple[Tensor]): the caffe2 inputs.
+ * c2_results (dict[str, Tensor]): the caffe2 output format,
+ corresponding to the outputs of the :meth:`forward` function.
+ * detectron2_outputs: the original output format of the meta arch.
+
+ This function can be used to compare the outputs of the original meta arch and
+ the converted caffe2 graph.
+
+ Returns:
+ callable: a callable of the above signature.
+ """
+ raise NotImplementedError
+
+
+class Caffe2GeneralizedRCNN(Caffe2MetaArch):
+ def __init__(self, cfg, torch_model):
+ assert isinstance(torch_model, meta_arch.GeneralizedRCNN)
+ torch_model = patch_generalized_rcnn(torch_model)
+ super().__init__(cfg, torch_model)
+
+ try:
+ use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT
+ except AttributeError:
+ use_heatmap_max_keypoint = False
+ self.roi_heads_patcher = ROIHeadsPatcher(
+ self._wrapped_model.roi_heads, use_heatmap_max_keypoint
+ )
+
+ def encode_additional_info(self, predict_net, init_net):
+ size_divisibility = self._wrapped_model.backbone.size_divisibility
+ check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
+ check_set_pb_arg(
+ predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
+ )
+ check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN")
+
+ @mock_torch_nn_functional_interpolate()
+ def forward(self, inputs):
+ if not self.tensor_mode:
+ return self._wrapped_model.inference(inputs)
+ images = self._caffe2_preprocess_image(inputs)
+ features = self._wrapped_model.backbone(images.tensor)
+ proposals, _ = self._wrapped_model.proposal_generator(images, features)
+ with self.roi_heads_patcher.mock_roi_heads():
+ detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
+ return tuple(detector_results[0].flatten())
+
+ @staticmethod
+ def get_outputs_converter(predict_net, init_net):
+ def f(batched_inputs, c2_inputs, c2_results):
+ _, im_info = c2_inputs
+ image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
+ results = assemble_rcnn_outputs_by_name(image_sizes, c2_results)
+ return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
+
+ return f
+
+
+class Caffe2RetinaNet(Caffe2MetaArch):
+ def __init__(self, cfg, torch_model):
+ assert isinstance(torch_model, meta_arch.RetinaNet)
+ super().__init__(cfg, torch_model)
+
+ @mock_torch_nn_functional_interpolate()
+ def forward(self, inputs):
+ assert self.tensor_mode
+ images = self._caffe2_preprocess_image(inputs)
+        # explicitly return the image sizes to avoid ONNX removing "im_info",
+        # since it's not used in the forward path
+ # since it's not used in the forward path
+ return_tensors = [images.image_sizes]
+
+ features = self._wrapped_model.backbone(images.tensor)
+ features = [features[f] for f in self._wrapped_model.head_in_features]
+ for i, feature_i in enumerate(features):
+ features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True)
+ return_tensors.append(features[i])
+
+ pred_logits, pred_anchor_deltas = self._wrapped_model.head(features)
+ for i, (box_cls_i, box_delta_i) in enumerate(zip(pred_logits, pred_anchor_deltas)):
+ return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i)))
+ return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i)))
+
+ return tuple(return_tensors)
+
+ def encode_additional_info(self, predict_net, init_net):
+ size_divisibility = self._wrapped_model.backbone.size_divisibility
+ check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
+ check_set_pb_arg(
+ predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
+ )
+ check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet")
+
+ # Inference parameters:
+ check_set_pb_arg(
+ predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.test_score_thresh)
+ )
+ check_set_pb_arg(
+ predict_net, "topk_candidates", "i", self._wrapped_model.test_topk_candidates
+ )
+ check_set_pb_arg(
+ predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.test_nms_thresh)
+ )
+ check_set_pb_arg(
+ predict_net,
+ "max_detections_per_image",
+ "i",
+ self._wrapped_model.max_detections_per_image,
+ )
+
+ check_set_pb_arg(
+ predict_net,
+ "bbox_reg_weights",
+ "floats",
+ [_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights],
+ )
+ self._encode_anchor_generator_cfg(predict_net)
+
+ def _encode_anchor_generator_cfg(self, predict_net):
+ # serialize anchor_generator for future use
+ serialized_anchor_generator = io.BytesIO()
+ torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator)
+        # Ideally we could put anchor generation inside the model, then we wouldn't
+        # need to store this information.
+ bytes = serialized_anchor_generator.getvalue()
+ check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", bytes)
+
+ @staticmethod
+ def get_outputs_converter(predict_net, init_net):
+ self = types.SimpleNamespace()
+ serialized_anchor_generator = io.BytesIO(
+ get_pb_arg_vals(predict_net, "serialized_anchor_generator", None)
+ )
+ self.anchor_generator = torch.load(serialized_anchor_generator)
+ bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
+ self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights))
+ self.test_score_thresh = get_pb_arg_valf(predict_net, "score_threshold", None)
+ self.test_topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
+ self.test_nms_thresh = get_pb_arg_valf(predict_net, "nms_threshold", None)
+ self.max_detections_per_image = get_pb_arg_vali(
+ predict_net, "max_detections_per_image", None
+ )
+
+ # hack to reuse inference code from RetinaNet
+ for meth in [
+ "forward_inference",
+ "inference_single_image",
+ "_transpose_dense_predictions",
+ "_decode_multi_level_predictions",
+ "_decode_per_level_predictions",
+ ]:
+ setattr(self, meth, functools.partial(getattr(meta_arch.RetinaNet, meth), self))
+
+ def f(batched_inputs, c2_inputs, c2_results):
+ _, im_info = c2_inputs
+ image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
+ dummy_images = ImageList(
+ torch.randn(
+ (
+ len(im_info),
+ 3,
+ )
+ + tuple(image_sizes[0])
+ ),
+ image_sizes,
+ )
+
+ num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
+ pred_logits = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
+ pred_anchor_deltas = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]
+
+ # For each feature level, feature should have the same batch size and
+ # spatial dimension as the box_cls and box_delta.
+ dummy_features = [x.clone()[:, 0:0, :, :] for x in pred_logits]
+            # self.num_classes can be inferred
+ self.num_classes = pred_logits[0].shape[1] // (pred_anchor_deltas[0].shape[1] // 4)
+
+ results = self.forward_inference(
+ dummy_images, dummy_features, [pred_logits, pred_anchor_deltas]
+ )
+ return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
+
+ return f
+
+
+META_ARCH_CAFFE2_EXPORT_TYPE_MAP = {
+ "GeneralizedRCNN": Caffe2GeneralizedRCNN,
+ "RetinaNet": Caffe2RetinaNet,
+}
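+
+# Illustrative lookup (this mirrors what Caffe2Tracer in api.py does):
+#
+#   C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
+#   traceable_model = C2MetaArch(cfg, copy.deepcopy(torch_model))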
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_patch.py b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..deab85d361cc07289ee08e4c01df23dc9ff3fbb1
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/caffe2_patch.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import contextlib
+from unittest import mock
+import torch
+
+from custom_detectron2.modeling import poolers
+from custom_detectron2.modeling.proposal_generator import rpn
+from custom_detectron2.modeling.roi_heads import keypoint_head, mask_head
+from custom_detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
+
+from .c10 import (
+ Caffe2Compatible,
+ Caffe2FastRCNNOutputsInference,
+ Caffe2KeypointRCNNInference,
+ Caffe2MaskRCNNInference,
+ Caffe2ROIPooler,
+ Caffe2RPN,
+)
+
+
+class GenericMixin(object):
+ pass
+
+
+class Caffe2CompatibleConverter(object):
+ """
+    A GenericUpdater which implements the `create_from` interface by modifying
+    the module object and reassigning its class to replaceCls.
+ """
+
+ def __init__(self, replaceCls):
+ self.replaceCls = replaceCls
+
+ def create_from(self, module):
+ # update module's class to the new class
+ assert isinstance(module, torch.nn.Module)
+ if issubclass(self.replaceCls, GenericMixin):
+            # replaceCls should act as a mixin; create a new class on the fly
+ new_class = type(
+ "{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__),
+ (self.replaceCls, module.__class__),
+ {}, # {"new_method": lambda self: ...},
+ )
+ module.__class__ = new_class
+ else:
+            # replaceCls is a complete class; this allows an arbitrary class swap
+ module.__class__ = self.replaceCls
+
+ # initialize Caffe2Compatible
+ if isinstance(module, Caffe2Compatible):
+ module.tensor_mode = False
+
+ return module
+
+
+def patch(model, target, updater, *args, **kwargs):
+ """
+    Recursively (post-order) update all modules of the target type and its
+    subclasses, applying an initialization/composition/inheritance/... via
+    updater.create_from.
+ """
+ for name, module in model.named_children():
+ model._modules[name] = patch(module, target, updater, *args, **kwargs)
+ if isinstance(model, target):
+ return updater.create_from(model, *args, **kwargs)
+ return model
+
+
+def patch_generalized_rcnn(model):
+ ccc = Caffe2CompatibleConverter
+ model = patch(model, rpn.RPN, ccc(Caffe2RPN))
+ model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler))
+
+ return model
+
+
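+# Illustrative sketch, not part of upstream detectron2: a minimal demo of how
+# `patch` + Caffe2CompatibleConverter retag a submodule's class in place. The
+# names `_DemoMixin` and `_demo_patch_linear` are hypothetical and exist only
+# for illustration.
+def _demo_patch_linear():
+    class _DemoMixin(GenericMixin):
+        def extra_repr(self):
+            return "patched-for-demo"
+
+    model = torch.nn.Sequential(torch.nn.Linear(4, 2), torch.nn.ReLU())
+    # every nn.Linear gets a new on-the-fly class mixing in _DemoMixin
+    model = patch(model, torch.nn.Linear, Caffe2CompatibleConverter(_DemoMixin))
+    assert isinstance(model[0], _DemoMixin)  # class swapped, weights untouched
+    return model
+
+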
+@contextlib.contextmanager
+def mock_fastrcnn_outputs_inference(
+ tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers
+):
+ with mock.patch.object(
+ box_predictor_type,
+ "inference",
+ autospec=True,
+ side_effect=Caffe2FastRCNNOutputsInference(tensor_mode),
+ ) as mocked_func:
+ yield
+ if check:
+ assert mocked_func.call_count > 0
+
+
+@contextlib.contextmanager
+def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True):
+ with mock.patch(
+ "{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference()
+ ) as mocked_func:
+ yield
+ if check:
+ assert mocked_func.call_count > 0
+
+
+@contextlib.contextmanager
+def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True):
+ with mock.patch(
+ "{}.keypoint_rcnn_inference".format(patched_module),
+ side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint),
+ ) as mocked_func:
+ yield
+ if check:
+ assert mocked_func.call_count > 0
+
+
+class ROIHeadsPatcher:
+ def __init__(self, heads, use_heatmap_max_keypoint):
+ self.heads = heads
+ self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
+
+ @contextlib.contextmanager
+ def mock_roi_heads(self, tensor_mode=True):
+ """
+ Patching several inference functions inside ROIHeads and its subclasses
+
+ Args:
+ tensor_mode (bool): whether the inputs/outputs are caffe2's tensor
+                format or not. Defaults to True.
+ """
+        # NOTE: this requires that `keypoint_rcnn_inference` and `mask_rcnn_inference`
+        # are called inside the same file as BaseXxxHead, due to the use of mock.patch.
+ kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__
+ mask_head_mod = mask_head.BaseMaskRCNNHead.__module__
+
+ mock_ctx_managers = [
+ mock_fastrcnn_outputs_inference(
+ tensor_mode=tensor_mode,
+ check=True,
+ box_predictor_type=type(self.heads.box_predictor),
+ )
+ ]
+ if getattr(self.heads, "keypoint_on", False):
+ mock_ctx_managers += [
+ mock_keypoint_rcnn_inference(
+ tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint
+ )
+ ]
+ if getattr(self.heads, "mask_on", False):
+ mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)]
+
+ with contextlib.ExitStack() as stack: # python 3.3+
+ for mgr in mock_ctx_managers:
+ stack.enter_context(mgr)
+ yield
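+
+
+# Illustrative sketch, not part of upstream detectron2: how an export path is
+# expected to wrap inference with ROIHeadsPatcher. `model` is assumed to be a
+# GeneralizedRCNN-style module exposing `roi_heads`; `_demo_mock_roi_heads` is
+# a hypothetical helper name.
+def _demo_mock_roi_heads(model, batched_inputs):
+    patcher = ROIHeadsPatcher(model.roi_heads, use_heatmap_max_keypoint=False)
+    with patcher.mock_roi_heads(tensor_mode=True):
+        # inside the context, box (and mask/keypoint, if enabled) inference
+        # is routed through the Caffe2-compatible implementations from .c10
+        return model(batched_inputs)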
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/flatten.py b/comfyui_controlnet_aux/src/custom_detectron2/export/flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..56111fe5bd38d3b42afe5855e2b6ec3d20fc1a69
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/flatten.py
@@ -0,0 +1,330 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import collections
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Tuple
+import torch
+from torch import nn
+
+from custom_detectron2.structures import Boxes, Instances, ROIMasks
+from custom_detectron2.utils.registry import _convert_target_to_string, locate
+
+from .torchscript_patch import patch_builtin_len
+
+
+@dataclass
+class Schema:
+ """
+ A Schema defines how to flatten a possibly hierarchical object into tuple of
+ primitive objects, so it can be used as inputs/outputs of PyTorch's tracing.
+
+ PyTorch does not support tracing a function that produces rich output
+ structures (e.g. dict, Instances, Boxes). To trace such a function, we
+ flatten the rich object into tuple of tensors, and return this tuple of tensors
+ instead. Meanwhile, we also need to know how to "rebuild" the original object
+ from the flattened results, so we can evaluate the flattened results.
+ A Schema defines how to flatten an object, and while flattening it, it records
+ necessary schemas so that the object can be rebuilt using the flattened outputs.
+
+    The flattened object and the schema object are returned by the ``.flatten`` classmethod.
+ Then the original object can be rebuilt with the ``__call__`` method of schema.
+
+ A Schema is a dataclass that can be serialized easily.
+ """
+
+ # inspired by FetchMapper in tensorflow/python/client/session.py
+
+ @classmethod
+ def flatten(cls, obj):
+ raise NotImplementedError
+
+ def __call__(self, values):
+ raise NotImplementedError
+
+ @staticmethod
+ def _concat(values):
+ ret = ()
+ sizes = []
+ for v in values:
+ assert isinstance(v, tuple), "Flattened results must be a tuple"
+ ret = ret + v
+ sizes.append(len(v))
+ return ret, sizes
+
+ @staticmethod
+ def _split(values, sizes):
+ if len(sizes):
+ expected_len = sum(sizes)
+ assert (
+ len(values) == expected_len
+ ), f"Values has length {len(values)} but expect length {expected_len}."
+ ret = []
+ for k in range(len(sizes)):
+ begin, end = sum(sizes[:k]), sum(sizes[: k + 1])
+ ret.append(values[begin:end])
+ return ret
+
+
+@dataclass
+class ListSchema(Schema):
+ schemas: List[Schema] # the schemas that define how to flatten each element in the list
+ sizes: List[int] # the flattened length of each element
+
+ def __call__(self, values):
+ values = self._split(values, self.sizes)
+ if len(values) != len(self.schemas):
+ raise ValueError(
+ f"Values has length {len(values)} but schemas " f"has length {len(self.schemas)}!"
+ )
+ values = [m(v) for m, v in zip(self.schemas, values)]
+ return list(values)
+
+ @classmethod
+ def flatten(cls, obj):
+ res = [flatten_to_tuple(k) for k in obj]
+ values, sizes = cls._concat([k[0] for k in res])
+ return values, cls([k[1] for k in res], sizes)
+
+
+@dataclass
+class TupleSchema(ListSchema):
+ def __call__(self, values):
+ return tuple(super().__call__(values))
+
+
+@dataclass
+class IdentitySchema(Schema):
+ def __call__(self, values):
+ return values[0]
+
+ @classmethod
+ def flatten(cls, obj):
+ return (obj,), cls()
+
+
+@dataclass
+class DictSchema(ListSchema):
+ keys: List[str]
+
+ def __call__(self, values):
+ values = super().__call__(values)
+ return dict(zip(self.keys, values))
+
+ @classmethod
+ def flatten(cls, obj):
+ for k in obj.keys():
+ if not isinstance(k, str):
+ raise KeyError("Only support flattening dictionaries if keys are str.")
+ keys = sorted(obj.keys())
+ values = [obj[k] for k in keys]
+ ret, schema = ListSchema.flatten(values)
+ return ret, cls(schema.schemas, schema.sizes, keys)
+
+
+@dataclass
+class InstancesSchema(DictSchema):
+ def __call__(self, values):
+ image_size, fields = values[-1], values[:-1]
+ fields = super().__call__(fields)
+ return Instances(image_size, **fields)
+
+ @classmethod
+ def flatten(cls, obj):
+ ret, schema = super().flatten(obj.get_fields())
+ size = obj.image_size
+ if not isinstance(size, torch.Tensor):
+ size = torch.tensor(size)
+ return ret + (size,), schema
+
+
+@dataclass
+class TensorWrapSchema(Schema):
+ """
+ For classes that are simple wrapper of tensors, e.g.
+ Boxes, RotatedBoxes, BitMasks
+ """
+
+ class_name: str
+
+ def __call__(self, values):
+ return locate(self.class_name)(values[0])
+
+ @classmethod
+ def flatten(cls, obj):
+ return (obj.tensor,), cls(_convert_target_to_string(type(obj)))
+
+
+# if more custom structures needed in the future, can allow
+# passing in extra schemas for custom types
+def flatten_to_tuple(obj):
+ """
+ Flatten an object so it can be used for PyTorch tracing.
+ Also returns how to rebuild the original object from the flattened outputs.
+
+ Returns:
+ res (tuple): the flattened results that can be used as tracing outputs
+ schema: an object with a ``__call__`` method such that ``schema(res) == obj``.
+ It is a pure dataclass that can be serialized.
+ """
+ schemas = [
+ ((str, bytes), IdentitySchema),
+ (list, ListSchema),
+ (tuple, TupleSchema),
+ (collections.abc.Mapping, DictSchema),
+ (Instances, InstancesSchema),
+ ((Boxes, ROIMasks), TensorWrapSchema),
+ ]
+ for klass, schema in schemas:
+ if isinstance(obj, klass):
+ F = schema
+ break
+ else:
+ F = IdentitySchema
+
+ return F.flatten(obj)
+
+
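+# Illustrative sketch, not part of upstream detectron2: flatten_to_tuple turns a
+# nested structure into a tuple of tensors plus a schema that can rebuild it.
+# The helper name `_demo_flatten_roundtrip` is hypothetical.
+def _demo_flatten_roundtrip():
+    obj = {"scores": torch.rand(3), "labels": torch.arange(3)}
+    flat, schema = flatten_to_tuple(obj)  # (tensor, tensor), DictSchema
+    rebuilt = schema(flat)                # rebuilds an equal dict
+    assert all(torch.equal(rebuilt[k], obj[k]) for k in obj)
+    return rebuilt
+
+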
+class TracingAdapter(nn.Module):
+ """
+ A model may take rich input/output format (e.g. dict or custom classes),
+ but `torch.jit.trace` requires tuple of tensors as input/output.
+ This adapter flattens input/output format of a model so it becomes traceable.
+
+ It also records the necessary schema to rebuild model's inputs/outputs from flattened
+ inputs/outputs.
+
+ Example:
+ ::
+ outputs = model(inputs) # inputs/outputs may be rich structure
+ adapter = TracingAdapter(model, inputs)
+
+ # can now trace the model, with adapter.flattened_inputs, or another
+ # tuple of tensors with the same length and meaning
+ traced = torch.jit.trace(adapter, adapter.flattened_inputs)
+
+ # traced model can only produce flattened outputs (tuple of tensors)
+ flattened_outputs = traced(*adapter.flattened_inputs)
+ # adapter knows the schema to convert it back (new_outputs == outputs)
+ new_outputs = adapter.outputs_schema(flattened_outputs)
+ """
+
+ flattened_inputs: Tuple[torch.Tensor] = None
+ """
+ Flattened version of inputs given to this class's constructor.
+ """
+
+ inputs_schema: Schema = None
+ """
+ Schema of the inputs given to this class's constructor.
+ """
+
+ outputs_schema: Schema = None
+ """
+ Schema of the output produced by calling the given model with inputs.
+ """
+
+ def __init__(
+ self,
+ model: nn.Module,
+ inputs,
+ inference_func: Optional[Callable] = None,
+ allow_non_tensor: bool = False,
+ ):
+ """
+ Args:
+ model: an nn.Module
+ inputs: An input argument or a tuple of input arguments used to call model.
+ After flattening, it has to only consist of tensors.
+ inference_func: a callable that takes (model, *inputs), calls the
+                model with inputs, and returns outputs. By default it
+                is ``lambda model, *inputs: model(*inputs)``. Can be overridden
+ if you need to call the model differently.
+ allow_non_tensor: allow inputs/outputs to contain non-tensor objects.
+ This option will filter out non-tensor objects to make the
+ model traceable, but ``inputs_schema``/``outputs_schema`` cannot be
+ used anymore because inputs/outputs cannot be rebuilt from pure tensors.
+ This is useful when you're only interested in the single trace of
+ execution (e.g. for flop count), but not interested in
+ generalizing the traced graph to new inputs.
+ """
+ super().__init__()
+ if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)):
+ model = model.module
+ self.model = model
+ if not isinstance(inputs, tuple):
+ inputs = (inputs,)
+ self.inputs = inputs
+ self.allow_non_tensor = allow_non_tensor
+
+ if inference_func is None:
+ inference_func = lambda model, *inputs: model(*inputs) # noqa
+ self.inference_func = inference_func
+
+ self.flattened_inputs, self.inputs_schema = flatten_to_tuple(inputs)
+
+ if all(isinstance(x, torch.Tensor) for x in self.flattened_inputs):
+ return
+ if self.allow_non_tensor:
+ self.flattened_inputs = tuple(
+ [x for x in self.flattened_inputs if isinstance(x, torch.Tensor)]
+ )
+ self.inputs_schema = None
+ else:
+ for input in self.flattened_inputs:
+ if not isinstance(input, torch.Tensor):
+ raise ValueError(
+ "Inputs for tracing must only contain tensors. "
+ f"Got a {type(input)} instead."
+ )
+
+ def forward(self, *args: torch.Tensor):
+ with torch.no_grad(), patch_builtin_len():
+ if self.inputs_schema is not None:
+ inputs_orig_format = self.inputs_schema(args)
+ else:
+ if len(args) != len(self.flattened_inputs) or any(
+ x is not y for x, y in zip(args, self.flattened_inputs)
+ ):
+ raise ValueError(
+ "TracingAdapter does not contain valid inputs_schema."
+ " So it cannot generalize to other inputs and must be"
+ " traced with `.flattened_inputs`."
+ )
+ inputs_orig_format = self.inputs
+
+ outputs = self.inference_func(self.model, *inputs_orig_format)
+ flattened_outputs, schema = flatten_to_tuple(outputs)
+
+ flattened_output_tensors = tuple(
+ [x for x in flattened_outputs if isinstance(x, torch.Tensor)]
+ )
+ if len(flattened_output_tensors) < len(flattened_outputs):
+ if self.allow_non_tensor:
+ flattened_outputs = flattened_output_tensors
+ self.outputs_schema = None
+ else:
+ raise ValueError(
+ "Model cannot be traced because some model outputs "
+ "cannot flatten to tensors."
+ )
+ else: # schema is valid
+ if self.outputs_schema is None:
+ self.outputs_schema = schema
+ else:
+ assert self.outputs_schema == schema, (
+ "Model should always return outputs with the same "
+ "structure so it can be traced!"
+ )
+ return flattened_outputs
+
+ def _create_wrapper(self, traced_model):
+ """
+ Return a function that has an input/output interface the same as the
+ original model, but it calls the given traced model under the hood.
+ """
+
+ def forward(*args):
+ flattened_inputs, _ = flatten_to_tuple(args)
+ flattened_outputs = traced_model(*flattened_inputs)
+ return self.outputs_schema(flattened_outputs)
+
+ return forward
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/shared.py b/comfyui_controlnet_aux/src/custom_detectron2/export/shared.py
new file mode 100644
index 0000000000000000000000000000000000000000..b03af8f7e85b3eb30aa5f4433db6e56e2983620b
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/shared.py
@@ -0,0 +1,1039 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import collections
+import copy
+import functools
+import logging
+import numpy as np
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from unittest import mock
+import caffe2.python.utils as putils
+import torch
+import torch.nn.functional as F
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core, net_drawer, workspace
+from torch.nn.functional import interpolate as interp
+
+logger = logging.getLogger(__name__)
+
+
+# ==== torch/utils_toffee/cast.py =======================================
+
+
+def to_device(t, device_str):
+ """
+    This function is a replacement for .to(another_device) that allows the
+    casting to be traced properly by explicitly calling the underlying copy ops.
+    It also avoids introducing unnecessary ops when casting to the same device.
+ """
+ src = t.device
+ dst = torch.device(device_str)
+
+ if src == dst:
+ return t
+ elif src.type == "cuda" and dst.type == "cpu":
+ return torch.ops._caffe2.CopyGPUToCPU(t)
+ elif src.type == "cpu" and dst.type == "cuda":
+ return torch.ops._caffe2.CopyCPUToGPU(t)
+ else:
+ raise RuntimeError("Can't cast tensor from device {} to device {}".format(src, dst))
+
+
+# ==== torch/utils_toffee/interpolate.py =======================================
+
+
+# Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py
+def BilinearInterpolation(tensor_in, up_scale):
+ assert up_scale % 2 == 0, "Scale should be even"
+
+ def upsample_filt(size):
+ factor = (size + 1) // 2
+ if size % 2 == 1:
+ center = factor - 1
+ else:
+ center = factor - 0.5
+
+ og = np.ogrid[:size, :size]
+ return (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor)
+
+ kernel_size = int(up_scale) * 2
+ bil_filt = upsample_filt(kernel_size)
+
+ dim = int(tensor_in.shape[1])
+ kernel = np.zeros((dim, dim, kernel_size, kernel_size), dtype=np.float32)
+ kernel[range(dim), range(dim), :, :] = bil_filt
+
+ tensor_out = F.conv_transpose2d(
+ tensor_in,
+ weight=to_device(torch.Tensor(kernel), tensor_in.device),
+ bias=None,
+ stride=int(up_scale),
+ padding=int(up_scale / 2),
+ )
+
+ return tensor_out
+
+
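+# Illustrative sketch, not part of upstream detectron2: on CPU tensors
+# BilinearInterpolation needs only torch and numpy, so it can be sanity-checked
+# outside of an ONNX export. The name `_demo_bilinear_upsample` is hypothetical.
+def _demo_bilinear_upsample():
+    x = torch.rand(1, 8, 16, 16)  # NCHW
+    y = BilinearInterpolation(x, up_scale=2)
+    assert y.shape == (1, 8, 32, 32)  # spatial dims are doubled
+    return y
+
+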
+# NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if
+# using dynamic `scale_factor` rather than static `size`. (T43166860)
+# NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly.
+def onnx_compatibale_interpolate(
+ input, size=None, scale_factor=None, mode="nearest", align_corners=None
+):
+ # NOTE: The input dimensions are interpreted in the form:
+ # `mini-batch x channels x [optional depth] x [optional height] x width`.
+ if size is None and scale_factor is not None:
+ if input.dim() == 4:
+ if isinstance(scale_factor, (int, float)):
+ height_scale, width_scale = (scale_factor, scale_factor)
+ else:
+ assert isinstance(scale_factor, (tuple, list))
+ assert len(scale_factor) == 2
+ height_scale, width_scale = scale_factor
+
+ assert not align_corners, "No matching C2 op for align_corners == True"
+ if mode == "nearest":
+ return torch.ops._caffe2.ResizeNearest(
+ input, order="NCHW", width_scale=width_scale, height_scale=height_scale
+ )
+ elif mode == "bilinear":
+ logger.warning(
+ "Use F.conv_transpose2d for bilinear interpolate"
+ " because there's no such C2 op, this may cause significant"
+ " slowdown and the boundary pixels won't be as same as"
+ " using F.interpolate due to padding."
+ )
+ assert height_scale == width_scale
+ return BilinearInterpolation(input, up_scale=height_scale)
+ logger.warning("Output size is not static, it might cause ONNX conversion issue")
+
+ return interp(input, size, scale_factor, mode, align_corners)
+
+
+def mock_torch_nn_functional_interpolate():
+ def decorator(func):
+ @functools.wraps(func)
+ def _mock_torch_nn_functional_interpolate(*args, **kwargs):
+ if torch.onnx.is_in_onnx_export():
+ with mock.patch(
+ "torch.nn.functional.interpolate", side_effect=onnx_compatibale_interpolate
+ ):
+ return func(*args, **kwargs)
+ else:
+ return func(*args, **kwargs)
+
+ return _mock_torch_nn_functional_interpolate
+
+ return decorator
+
+
+# ==== torch/utils_caffe2/ws_utils.py ==========================================
+
+
+class ScopedWS(object):
+ def __init__(self, ws_name, is_reset, is_cleanup=False):
+ self.ws_name = ws_name
+ self.is_reset = is_reset
+ self.is_cleanup = is_cleanup
+ self.org_ws = ""
+
+ def __enter__(self):
+ self.org_ws = workspace.CurrentWorkspace()
+ if self.ws_name is not None:
+ workspace.SwitchWorkspace(self.ws_name, True)
+ if self.is_reset:
+ workspace.ResetWorkspace()
+
+ return workspace
+
+ def __exit__(self, *args):
+ if self.is_cleanup:
+ workspace.ResetWorkspace()
+ if self.ws_name is not None:
+ workspace.SwitchWorkspace(self.org_ws)
+
+
+def fetch_any_blob(name):
+ bb = None
+ try:
+ bb = workspace.FetchBlob(name)
+ except TypeError:
+ bb = workspace.FetchInt8Blob(name)
+ except Exception as e:
+ logger.error("Get blob {} error: {}".format(name, e))
+
+ return bb
+
+
+# ==== torch/utils_caffe2/protobuf.py ==========================================
+
+
+def get_pb_arg(pb, arg_name):
+ for x in pb.arg:
+ if x.name == arg_name:
+ return x
+ return None
+
+
+def get_pb_arg_valf(pb, arg_name, default_val):
+ arg = get_pb_arg(pb, arg_name)
+ return arg.f if arg is not None else default_val
+
+
+def get_pb_arg_floats(pb, arg_name, default_val):
+ arg = get_pb_arg(pb, arg_name)
+ return list(map(float, arg.floats)) if arg is not None else default_val
+
+
+def get_pb_arg_ints(pb, arg_name, default_val):
+ arg = get_pb_arg(pb, arg_name)
+ return list(map(int, arg.ints)) if arg is not None else default_val
+
+
+def get_pb_arg_vali(pb, arg_name, default_val):
+ arg = get_pb_arg(pb, arg_name)
+ return arg.i if arg is not None else default_val
+
+
+def get_pb_arg_vals(pb, arg_name, default_val):
+ arg = get_pb_arg(pb, arg_name)
+ return arg.s if arg is not None else default_val
+
+
+def get_pb_arg_valstrings(pb, arg_name, default_val):
+ arg = get_pb_arg(pb, arg_name)
+ return list(arg.strings) if arg is not None else default_val
+
+
+def check_set_pb_arg(pb, arg_name, arg_attr, arg_value, allow_override=False):
+ arg = get_pb_arg(pb, arg_name)
+ if arg is None:
+ arg = putils.MakeArgument(arg_name, arg_value)
+ assert hasattr(arg, arg_attr)
+ pb.arg.extend([arg])
+ if allow_override and getattr(arg, arg_attr) != arg_value:
+ logger.warning(
+ "Override argument {}: {} -> {}".format(arg_name, getattr(arg, arg_attr), arg_value)
+ )
+ setattr(arg, arg_attr, arg_value)
+ else:
+ assert arg is not None
+ assert getattr(arg, arg_attr) == arg_value, "Existing value {}, new value {}".format(
+ getattr(arg, arg_attr), arg_value
+ )
+
+
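+# Illustrative sketch, not part of upstream detectron2: the get_*/check_set_pb_arg
+# helpers read and write typed arguments on caffe2 protobuf messages. Assumes the
+# caffe2 protobufs bundled with torch are importable; `_demo_pb_args` is a
+# hypothetical helper name.
+def _demo_pb_args():
+    op = caffe2_pb2.OperatorDef(type="GroupNorm")
+    check_set_pb_arg(op, "group", "i", 32)       # integer argument
+    check_set_pb_arg(op, "order", "s", b"NCHW")  # bytes/string argument
+    assert get_pb_arg_vali(op, "group", None) == 32
+    assert get_pb_arg_vals(op, "order", None) == b"NCHW"
+    return op
+
+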
+def _create_const_fill_op_from_numpy(name, tensor, device_option=None):
+ assert type(tensor) == np.ndarray
+ kTypeNameMapper = {
+ np.dtype("float32"): "GivenTensorFill",
+ np.dtype("int32"): "GivenTensorIntFill",
+ np.dtype("int64"): "GivenTensorInt64Fill",
+ np.dtype("uint8"): "GivenTensorStringFill",
+ }
+
+ args_dict = {}
+ if tensor.dtype == np.dtype("uint8"):
+ args_dict.update({"values": [str(tensor.data)], "shape": [1]})
+ else:
+ args_dict.update({"values": tensor, "shape": tensor.shape})
+
+ if device_option is not None:
+ args_dict["device_option"] = device_option
+
+ return core.CreateOperator(kTypeNameMapper[tensor.dtype], [], [name], **args_dict)
+
+
+def _create_const_fill_op_from_c2_int8_tensor(name, int8_tensor):
+ assert type(int8_tensor) == workspace.Int8Tensor
+ kTypeNameMapper = {
+ np.dtype("int32"): "Int8GivenIntTensorFill",
+ np.dtype("uint8"): "Int8GivenTensorFill",
+ }
+
+ tensor = int8_tensor.data
+ assert tensor.dtype in [np.dtype("uint8"), np.dtype("int32")]
+ values = tensor.tobytes() if tensor.dtype == np.dtype("uint8") else tensor
+
+ return core.CreateOperator(
+ kTypeNameMapper[tensor.dtype],
+ [],
+ [name],
+ values=values,
+ shape=tensor.shape,
+ Y_scale=int8_tensor.scale,
+ Y_zero_point=int8_tensor.zero_point,
+ )
+
+
+def create_const_fill_op(
+ name: str,
+ blob: Union[np.ndarray, workspace.Int8Tensor],
+ device_option: Optional[caffe2_pb2.DeviceOption] = None,
+) -> caffe2_pb2.OperatorDef:
+ """
+ Given a blob object, return the Caffe2 operator that creates this blob
+    as a constant. Currently supports NumPy tensors and Caffe2 Int8Tensor.
+ """
+
+ tensor_type = type(blob)
+ assert tensor_type in [
+ np.ndarray,
+ workspace.Int8Tensor,
+ ], 'Error when creating const fill op for "{}", unsupported blob type: {}'.format(
+ name, type(blob)
+ )
+
+ if tensor_type == np.ndarray:
+ return _create_const_fill_op_from_numpy(name, blob, device_option)
+ elif tensor_type == workspace.Int8Tensor:
+ assert device_option is None
+ return _create_const_fill_op_from_c2_int8_tensor(name, blob)
+
+
+def construct_init_net_from_params(
+ params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None
+) -> caffe2_pb2.NetDef:
+ """
+ Construct the init_net from params dictionary
+ """
+ init_net = caffe2_pb2.NetDef()
+ device_options = device_options or {}
+ for name, blob in params.items():
+ if isinstance(blob, str):
+ logger.warning(
+ (
+ "Blob {} with type {} is not supported in generating init net,"
+ " skipped.".format(name, type(blob))
+ )
+ )
+ continue
+ init_net.op.extend(
+ [create_const_fill_op(name, blob, device_option=device_options.get(name, None))]
+ )
+ init_net.external_output.append(name)
+ return init_net
+
+
+def get_producer_map(ssa):
+ """
+ Return dict from versioned blob to (i, j),
+ where i is index of producer op, j is the index of output of that op.
+ """
+ producer_map = {}
+ for i in range(len(ssa)):
+ outputs = ssa[i][1]
+ for j, outp in enumerate(outputs):
+ producer_map[outp] = (i, j)
+ return producer_map
+
+
+def get_consumer_map(ssa):
+ """
+ Return dict from versioned blob to list of (i, j),
+ where i is index of consumer op, j is the index of input of that op.
+ """
+ consumer_map = collections.defaultdict(list)
+ for i in range(len(ssa)):
+ inputs = ssa[i][0]
+ for j, inp in enumerate(inputs):
+ consumer_map[inp].append((i, j))
+ return consumer_map
+
+
+def get_params_from_init_net(
+ init_net: caffe2_pb2.NetDef,
+) -> [Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]:
+ """
+ Take the output blobs from init_net by running it.
+ Outputs:
+ params: dict from blob name to numpy array
+ device_options: dict from blob name to the device option of its creating op
+ """
+    # NOTE: this assumes that each param is determined by its producer op, with the
+    # only exception being CopyGPUToCPU, which is a CUDA op but returns a CPU tensor.
+ def _get_device_option(producer_op):
+ if producer_op.type == "CopyGPUToCPU":
+ return caffe2_pb2.DeviceOption()
+ else:
+ return producer_op.device_option
+
+ with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws:
+ ws.RunNetOnce(init_net)
+ params = {b: fetch_any_blob(b) for b in init_net.external_output}
+ ssa, versions = core.get_ssa(init_net)
+ producer_map = get_producer_map(ssa)
+ device_options = {
+ b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]])
+ for b in init_net.external_output
+ }
+ return params, device_options
+
+
+def _updater_raise(op, input_types, output_types):
+ raise RuntimeError(
+ "Failed to apply updater for op {} given input_types {} and"
+ " output_types {}".format(op, input_types, output_types)
+ )
+
+
+def _generic_status_identifier(
+ predict_net: caffe2_pb2.NetDef,
+ status_updater: Callable,
+ known_status: Dict[Tuple[str, int], Any],
+) -> Dict[Tuple[str, int], Any]:
+ """
+ Statically infer the status of each blob, the status can be such as device type
+ (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here
+ is versioned blob (Tuple[str, int]) in the format compatible with ssa.
+ Inputs:
+ predict_net: the caffe2 network
+ status_updater: a callable, given an op and the status of its input/output,
+ it returns the updated status of input/output. `None` is used for
+ representing unknown status.
+ known_status: a dict containing known status, used as initialization.
+ Outputs:
+ A dict mapping from versioned blob to its status
+ """
+ ssa, versions = core.get_ssa(predict_net)
+ versioned_ext_input = [(b, 0) for b in predict_net.external_input]
+ versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output]
+ all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa])
+
+ allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output)
+ assert all(k in allowed_vbs for k in known_status)
+ assert all(v is not None for v in known_status.values())
+ _known_status = copy.deepcopy(known_status)
+
+ def _check_and_update(key, value):
+ assert value is not None
+ if key in _known_status:
+ if not _known_status[key] == value:
+ raise RuntimeError(
+ "Confilict status for {}, existing status {}, new status {}".format(
+ key, _known_status[key], value
+ )
+ )
+ _known_status[key] = value
+
+ def _update_i(op, ssa_i):
+ versioned_inputs = ssa_i[0]
+ versioned_outputs = ssa_i[1]
+
+ inputs_status = [_known_status.get(b, None) for b in versioned_inputs]
+ outputs_status = [_known_status.get(b, None) for b in versioned_outputs]
+
+ new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status)
+
+ for versioned_blob, status in zip(
+ versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status
+ ):
+ if status is not None:
+ _check_and_update(versioned_blob, status)
+
+ for op, ssa_i in zip(predict_net.op, ssa):
+ _update_i(op, ssa_i)
+ for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)):
+ _update_i(op, ssa_i)
+
+    # NOTE: This strictly checks that every blob from predict_net is assigned
+    # a known status. However, sometimes that's impossible (e.g. with dead-end ops),
+    # so we may relax this constraint if needed.
+ for k in all_versioned_blobs:
+ if k not in _known_status:
+ raise NotImplementedError(
+ "Can not infer the status for {}. Currently only support the case where"
+ " a single forward and backward pass can identify status for all blobs.".format(k)
+ )
+
+ return _known_status
+
+
+def infer_device_type(
+ predict_net: caffe2_pb2.NetDef,
+ known_status: Dict[Tuple[str, int], Any],
+ device_name_style: str = "caffe2",
+) -> Dict[Tuple[str, int], str]:
+ """Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob"""
+
+ assert device_name_style in ["caffe2", "pytorch"]
+ _CPU_STR = "cpu"
+ _GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda"
+
+ def _copy_cpu_to_gpu_updater(op, input_types, output_types):
+ if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR:
+ _updater_raise(op, input_types, output_types)
+ return ([_CPU_STR], [_GPU_STR])
+
+ def _copy_gpu_to_cpu_updater(op, input_types, output_types):
+ if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR:
+ _updater_raise(op, input_types, output_types)
+ return ([_GPU_STR], [_CPU_STR])
+
+ def _other_ops_updater(op, input_types, output_types):
+ non_none_types = [x for x in input_types + output_types if x is not None]
+ if len(non_none_types) > 0:
+ the_type = non_none_types[0]
+ if not all(x == the_type for x in non_none_types):
+ _updater_raise(op, input_types, output_types)
+ else:
+ the_type = None
+ return ([the_type for _ in op.input], [the_type for _ in op.output])
+
+ def _device_updater(op, *args, **kwargs):
+ return {
+ "CopyCPUToGPU": _copy_cpu_to_gpu_updater,
+ "CopyGPUToCPU": _copy_gpu_to_cpu_updater,
+ }.get(op.type, _other_ops_updater)(op, *args, **kwargs)
+
+ return _generic_status_identifier(predict_net, _device_updater, known_status)
+
+
+# ==== torch/utils_caffe2/vis.py ===============================================
+
+
+def _modify_blob_names(ops, blob_rename_f):
+ ret = []
+
+ def _replace_list(blob_list, replaced_list):
+ del blob_list[:]
+ blob_list.extend(replaced_list)
+
+ for x in ops:
+ cur = copy.deepcopy(x)
+ _replace_list(cur.input, list(map(blob_rename_f, cur.input)))
+ _replace_list(cur.output, list(map(blob_rename_f, cur.output)))
+ ret.append(cur)
+
+ return ret
+
+
+def _rename_blob(name, blob_sizes, blob_ranges):
+ def _list_to_str(bsize):
+ ret = ", ".join([str(x) for x in bsize])
+ ret = "[" + ret + "]"
+ return ret
+
+ ret = name
+ if blob_sizes is not None and name in blob_sizes:
+ ret += "\n" + _list_to_str(blob_sizes[name])
+ if blob_ranges is not None and name in blob_ranges:
+ ret += "\n" + _list_to_str(blob_ranges[name])
+
+ return ret
+
+
+# graph_name must not contain the word 'graph'
+def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None):
+ blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges)
+ return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f)
+
+
+def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None):
+ graph = None
+ ops = net.op
+ if blob_rename_func is not None:
+ ops = _modify_blob_names(ops, blob_rename_func)
+ if not op_only:
+ graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB")
+ else:
+ graph = net_drawer.GetPydotGraphMinimal(
+ ops, graph_name, rankdir="TB", minimal_dependency=True
+ )
+
+ try:
+ par_dir = os.path.dirname(file_name)
+ if not os.path.exists(par_dir):
+ os.makedirs(par_dir)
+
+ format = os.path.splitext(os.path.basename(file_name))[-1]
+ if format == ".png":
+ graph.write_png(file_name)
+ elif format == ".pdf":
+ graph.write_pdf(file_name)
+ elif format == ".svg":
+ graph.write_svg(file_name)
+ else:
+ print("Incorrect format {}".format(format))
+ except Exception as e:
+ print("Error when writing graph to image {}".format(e))
+
+ return graph
+
+
+# ==== torch/utils_toffee/aten_to_caffe2.py ====================================
+
+
+def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef):
+ """
+    For an ONNX-exported model, GroupNorm is represented as an ATen op;
+    this function replaces those ATen ops with Caffe2's GroupNorm op in place.
+ """
+ count = 0
+ for op in predict_net.op:
+ if op.type == "ATen":
+ op_name = get_pb_arg_vals(op, "operator", None) # return byte in py3
+ if op_name and op_name.decode() == "group_norm":
+ op.arg.remove(get_pb_arg(op, "operator"))
+
+ if get_pb_arg_vali(op, "cudnn_enabled", None):
+ op.arg.remove(get_pb_arg(op, "cudnn_enabled"))
+
+ num_groups = get_pb_arg_vali(op, "num_groups", None)
+ if num_groups is not None:
+ op.arg.remove(get_pb_arg(op, "num_groups"))
+ check_set_pb_arg(op, "group", "i", num_groups)
+
+ op.type = "GroupNorm"
+ count += 1
+ if count > 1:
+ logger.info("Replaced {} ATen operator to GroupNormOp".format(count))
+
+
+# ==== torch/utils_toffee/alias.py =============================================
+
+
+def alias(x, name, is_backward=False):
+ if not torch.onnx.is_in_onnx_export():
+ return x
+ assert isinstance(x, torch.Tensor)
+ return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward)
+
+
+def fuse_alias_placeholder(predict_net, init_net):
+ """Remove AliasWithName placeholder and rename the input/output of it"""
+ # First we finish all the re-naming
+ for i, op in enumerate(predict_net.op):
+ if op.type == "AliasWithName":
+ assert len(op.input) == 1
+ assert len(op.output) == 1
+ name = get_pb_arg_vals(op, "name", None).decode()
+ is_backward = bool(get_pb_arg_vali(op, "is_backward", 0))
+ rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward)
+ rename_op_output(predict_net, i, 0, name)
+
+ # Remove AliasWithName, should be very safe since it's a non-op
+ new_ops = []
+ for op in predict_net.op:
+ if op.type != "AliasWithName":
+ new_ops.append(op)
+ else:
+ # safety check
+ assert op.input == op.output
+ assert op.input[0] == op.arg[0].s.decode()
+ del predict_net.op[:]
+ predict_net.op.extend(new_ops)
+
+
+# ==== torch/utils_caffe2/graph_transform.py ===================================
+
+
+class IllegalGraphTransformError(ValueError):
+ """When a graph transform function call can't be executed."""
+
+
+def _rename_versioned_blob_in_proto(
+ proto: caffe2_pb2.NetDef,
+ old_name: str,
+ new_name: str,
+ version: int,
+ ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]],
+ start_versions: Dict[str, int],
+ end_versions: Dict[str, int],
+):
+ """In given proto, rename all blobs with matched version"""
+    # Operator list
+ for op, i_th_ssa in zip(proto.op, ssa):
+ versioned_inputs, versioned_outputs = i_th_ssa
+ for i in range(len(op.input)):
+ if versioned_inputs[i] == (old_name, version):
+ op.input[i] = new_name
+ for i in range(len(op.output)):
+ if versioned_outputs[i] == (old_name, version):
+ op.output[i] = new_name
+ # external_input
+ if start_versions.get(old_name, 0) == version:
+ for i in range(len(proto.external_input)):
+ if proto.external_input[i] == old_name:
+ proto.external_input[i] = new_name
+ # external_output
+ if end_versions.get(old_name, 0) == version:
+ for i in range(len(proto.external_output)):
+ if proto.external_output[i] == old_name:
+ proto.external_output[i] = new_name
+
+
+def rename_op_input(
+ predict_net: caffe2_pb2.NetDef,
+ init_net: caffe2_pb2.NetDef,
+ op_id: int,
+ input_id: int,
+ new_name: str,
+ from_producer: bool = False,
+):
+ """
+    Rename the op_id-th operator in predict_net, changing its input_id-th input's
+    name to new_name. It also automatically re-routes and updates
+    external_input and init_net if necessary.
+    - It requires that the input is only consumed by this op.
+    - This function modifies predict_net and init_net in-place.
+    - When from_producer is enabled, this also updates other operators that consume
+      the same input. Be cautious, because this may trigger unintended behavior.
+ """
+ assert isinstance(predict_net, caffe2_pb2.NetDef)
+ assert isinstance(init_net, caffe2_pb2.NetDef)
+
+ init_net_ssa, init_net_versions = core.get_ssa(init_net)
+ predict_net_ssa, predict_net_versions = core.get_ssa(
+ predict_net, copy.deepcopy(init_net_versions)
+ )
+
+ versioned_inputs, versioned_outputs = predict_net_ssa[op_id]
+ old_name, version = versioned_inputs[input_id]
+
+ if from_producer:
+ producer_map = get_producer_map(predict_net_ssa)
+ if not (old_name, version) in producer_map:
+ raise NotImplementedError(
+ "Can't find producer, the input {} is probably from"
+ " init_net, this is not supported yet.".format(old_name)
+ )
+ producer = producer_map[(old_name, version)]
+ rename_op_output(predict_net, producer[0], producer[1], new_name)
+ return
+
+ def contain_targets(op_ssa):
+ return (old_name, version) in op_ssa[0]
+
+ is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa]
+ if sum(is_consumer) > 1:
+ raise IllegalGraphTransformError(
+ (
+ "Input '{}' of operator(#{}) are consumed by other ops, please use"
+ + " rename_op_output on the producer instead. Offending op: \n{}"
+ ).format(old_name, op_id, predict_net.op[op_id])
+ )
+
+ # update init_net
+ _rename_versioned_blob_in_proto(
+ init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions
+ )
+ # update predict_net
+ _rename_versioned_blob_in_proto(
+ predict_net,
+ old_name,
+ new_name,
+ version,
+ predict_net_ssa,
+ init_net_versions,
+ predict_net_versions,
+ )
+
+
+def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str):
+ """
+    Rename the op_id-th operator in predict_net, changing its output_id-th output's
+    name to new_name. It also automatically re-routes and updates
+    external_output if necessary.
+ - It allows multiple consumers of its output.
+ - This function modifies predict_net in-place, doesn't need init_net.
+ """
+ assert isinstance(predict_net, caffe2_pb2.NetDef)
+
+ ssa, blob_versions = core.get_ssa(predict_net)
+
+ versioned_inputs, versioned_outputs = ssa[op_id]
+ old_name, version = versioned_outputs[output_id]
+
+ # update predict_net
+ _rename_versioned_blob_in_proto(
+ predict_net, old_name, new_name, version, ssa, {}, blob_versions
+ )
+
+
+def get_sub_graph_external_input_output(
+ predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int]
+) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
+ """
+ Return the list of external input/output of sub-graph,
+ each element is tuple of the name and corresponding version in predict_net.
+
+    External input/output are defined the same way as for a caffe2 NetDef.
+ """
+ ssa, versions = core.get_ssa(predict_net)
+
+ all_inputs = []
+ all_outputs = []
+ for op_id in sub_graph_op_indices:
+ all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs]
+ all_outputs += list(ssa[op_id][1]) # ssa output won't repeat
+
+ # for versioned blobs, external inputs are just those blob in all_inputs
+ # but not in all_outputs
+ ext_inputs = [inp for inp in all_inputs if inp not in all_outputs]
+
+ # external outputs are essentially outputs of this subgraph that are used
+ # outside of this sub-graph (including predict_net.external_output)
+ all_other_inputs = sum(
+ (ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices),
+ [(outp, versions[outp]) for outp in predict_net.external_output],
+ )
+ ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)]
+
+ return ext_inputs, ext_outputs
+
+
+class DiGraph:
+ """A DAG representation of caffe2 graph, each vertice is a versioned blob."""
+
+ def __init__(self):
+ self.vertices = set()
+ self.graph = collections.defaultdict(list)
+
+ def add_edge(self, u, v):
+ self.graph[u].append(v)
+ self.vertices.add(u)
+ self.vertices.add(v)
+
+ # grab from https://www.geeksforgeeks.org/find-paths-given-source-destination/
+ def get_all_paths(self, s, d):
+ visited = {k: False for k in self.vertices}
+ path = []
+ all_paths = []
+
+ def _get_all_paths_util(graph, u, d, visited, path):
+ visited[u] = True
+ path.append(u)
+ if u == d:
+ all_paths.append(copy.deepcopy(path))
+ else:
+ for i in graph[u]:
+ if not visited[i]:
+ _get_all_paths_util(graph, i, d, visited, path)
+ path.pop()
+ visited[u] = False
+
+ _get_all_paths_util(self.graph, s, d, visited, path)
+ return all_paths
+
+ @staticmethod
+ def from_ssa(ssa):
+ graph = DiGraph()
+ for op_id in range(len(ssa)):
+ for inp in ssa[op_id][0]:
+ for outp in ssa[op_id][1]:
+ graph.add_edge(inp, outp)
+ return graph
+
+
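+# Illustrative sketch, not part of upstream detectron2: DiGraph enumerates all
+# paths between two vertices; plain strings stand in for versioned blobs here.
+# The name `_demo_digraph_paths` is hypothetical.
+def _demo_digraph_paths():
+    g = DiGraph()
+    for u, v in [("a", "b"), ("b", "d"), ("a", "c"), ("c", "d")]:
+        g.add_edge(u, v)
+    paths = g.get_all_paths("a", "d")
+    assert sorted(paths) == [["a", "b", "d"], ["a", "c", "d"]]
+    return paths
+
+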
+def _get_dependency_chain(ssa, versioned_target, versioned_source):
+ """
+ Return the index list of relevant operator to produce target blob from source blob,
+ if there's no dependency, return empty list.
+ """
+
+    # Finding all paths between nodes can be O(N!), thus we only search within
+    # the subgraph spanning from the first consumer of the source blob
+    # to the producer of the target blob.
+ consumer_map = get_consumer_map(ssa)
+ producer_map = get_producer_map(ssa)
+ start_op = min(x[0] for x in consumer_map[versioned_source]) - 15
+ end_op = (
+ producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op
+ )
+ sub_graph_ssa = ssa[start_op : end_op + 1]
+ if len(sub_graph_ssa) > 30:
+ logger.warning(
+ "Subgraph bebetween {} and {} is large (from op#{} to op#{}), it"
+ " might take non-trival time to find all paths between them.".format(
+ versioned_source, versioned_target, start_op, end_op
+ )
+ )
+
+ dag = DiGraph.from_ssa(sub_graph_ssa)
+ paths = dag.get_all_paths(versioned_source, versioned_target) # include two ends
+ ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths]
+ return sorted(set().union(*[set(ops) for ops in ops_in_paths]))
+
+
+def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
+ """
+    Identify the reshape sub-graph in a protobuf.
+ The reshape sub-graph is defined as matching the following pattern:
+
+ (input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
+ └-------------------------------------------> Reshape -> (output_blob)
+
+ Return:
+        List of sub-graphs; each sub-graph is represented as a list of indices
+        of the relevant ops, [Op_1, Op_2, ..., Op_N, Reshape]
+ """
+
+ ssa, _ = core.get_ssa(predict_net)
+
+ ret = []
+ for i, op in enumerate(predict_net.op):
+ if op.type == "Reshape":
+ assert len(op.input) == 2
+ input_ssa = ssa[i][0]
+ data_source = input_ssa[0]
+ shape_source = input_ssa[1]
+ op_indices = _get_dependency_chain(ssa, shape_source, data_source)
+ ret.append(op_indices + [i])
+ return ret
+
+
+def remove_reshape_for_fc(predict_net, params):
+ """
+    In PyTorch, nn.Linear has to take a 2D tensor, which often leads to reshaping
+    a 4D tensor to 2D by calling .view(). However, this (dynamic) reshaping
+    doesn't work well with ONNX and Int8 tools, and it causes extra
+    ops (e.g. ExpandDims) that might not be available on mobile.
+    Luckily, Caffe2 supports 4D tensors for FC, so we can remove those reshapes
+    after exporting the ONNX model.
+ """
+ from caffe2.python import core
+
+ # find all reshape sub-graph that can be removed, which is now all Reshape
+ # sub-graph whose output is only consumed by FC.
+    # TODO: to make it safer, we may need the actual values to better determine
+ # if a Reshape before FC is removable.
+ reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
+ sub_graphs_to_remove = []
+ for reshape_sub_graph in reshape_sub_graphs:
+ reshape_op_id = reshape_sub_graph[-1]
+ assert predict_net.op[reshape_op_id].type == "Reshape"
+ ssa, _ = core.get_ssa(predict_net)
+ reshape_output = ssa[reshape_op_id][1][0]
+ consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
+ if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
+            # safety check that the sub-graph is isolated; for a reshape sub-graph,
+            # this means it has one non-param external input and one external output.
+ ext_inputs, ext_outputs = get_sub_graph_external_input_output(
+ predict_net, reshape_sub_graph
+ )
+ non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
+ if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
+ sub_graphs_to_remove.append(reshape_sub_graph)
+
+    # Remove the sub-graphs by:
+    # 1: renaming the Reshape's output to its input, so the sub-graph becomes an
+    #    in-place identity, i.e. its external input and output are the same;
+    # 2: simply removing those ops.
+ remove_op_ids = []
+ params_to_remove = []
+ for sub_graph in sub_graphs_to_remove:
+ logger.info(
+ "Remove Reshape sub-graph:\n{}".format(
+ "".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph])
+ )
+ )
+ reshape_op_id = sub_graph[-1]
+ new_reshap_output = predict_net.op[reshape_op_id].input[0]
+ rename_op_output(predict_net, reshape_op_id, 0, new_reshap_output)
+ ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph)
+ non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
+ params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0]
+ assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1
+ assert ext_outputs[0][0] == non_params_ext_inputs[0][0]
+ assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1
+ remove_op_ids.extend(sub_graph)
+ params_to_remove.extend(params_ext_inputs)
+
+ predict_net = copy.deepcopy(predict_net)
+ new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids]
+ del predict_net.op[:]
+ predict_net.op.extend(new_ops)
+ for versioned_params in params_to_remove:
+ name = versioned_params[0]
+ logger.info("Remove params: {} from init_net and predict_net.external_input".format(name))
+ del params[name]
+ predict_net.external_input.remove(name)
+
+ return predict_net, params
+
+
+def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef):
+ """
+ In-place fuse extra copy ops between cpu/gpu for the following case:
+ a -CopyAToB-> b -CopyBToA> c1 -NextOp1-> d1
+ -CopyBToA> c2 -NextOp2-> d2
+ The fused network will look like:
+ a -NextOp1-> d1
+ -NextOp2-> d2
+ """
+
+ _COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"]
+
+ def _fuse_once(predict_net):
+ ssa, blob_versions = core.get_ssa(predict_net)
+ consumer_map = get_consumer_map(ssa)
+ versioned_external_output = [
+ (name, blob_versions[name]) for name in predict_net.external_output
+ ]
+
+ for op_id, op in enumerate(predict_net.op):
+ if op.type in _COPY_OPS:
+ fw_copy_versioned_output = ssa[op_id][1][0]
+ consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]]
+ reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)]
+
+ is_fusable = (
+ len(consumer_ids) > 0
+ and fw_copy_versioned_output not in versioned_external_output
+ and all(
+ predict_net.op[_op_id].type == reverse_op_type
+ and ssa[_op_id][1][0] not in versioned_external_output
+ for _op_id in consumer_ids
+ )
+ )
+
+ if is_fusable:
+ for rv_copy_op_id in consumer_ids:
+                        # make each NextOp use "a" directly, then remove the Copy ops
+ rs_copy_versioned_output = ssa[rv_copy_op_id][1][0]
+ next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0]
+ predict_net.op[next_op_id].input[inp_id] = op.input[0]
+ # remove CopyOps
+ new_ops = [
+ op
+ for i, op in enumerate(predict_net.op)
+ if i != op_id and i not in consumer_ids
+ ]
+ del predict_net.op[:]
+ predict_net.op.extend(new_ops)
+ return True
+
+ return False
+
+    # _fuse_once returns False if nothing can be fused
+ while _fuse_once(predict_net):
+ pass
+
+
+def remove_dead_end_ops(net_def: caffe2_pb2.NetDef):
+ """remove ops if its output is not used or not in external_output"""
+ ssa, versions = core.get_ssa(net_def)
+ versioned_external_output = [(name, versions[name]) for name in net_def.external_output]
+ consumer_map = get_consumer_map(ssa)
+ removed_op_ids = set()
+
+ def _is_dead_end(versioned_blob):
+ return not (
+ versioned_blob in versioned_external_output
+ or (
+ len(consumer_map[versioned_blob]) > 0
+ and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob])
+ )
+ )
+
+ for i, ssa_i in reversed(list(enumerate(ssa))):
+ versioned_outputs = ssa_i[1]
+ if all(_is_dead_end(outp) for outp in versioned_outputs):
+ removed_op_ids.add(i)
+
+    # simply removing those dead-end ops should have no effect on external_output
+ new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids]
+ del net_def.op[:]
+ net_def.op.extend(new_ops)
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/torchscript.py b/comfyui_controlnet_aux/src/custom_detectron2/export/torchscript.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c8eda3e6384fdcc3b092810808f192fa1f0194c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/torchscript.py
@@ -0,0 +1,132 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import os
+import torch
+
+from custom_detectron2.utils.file_io import PathManager
+
+from .torchscript_patch import freeze_training_mode, patch_instances
+
+__all__ = ["scripting_with_instances", "dump_torchscript_IR"]
+
+
+def scripting_with_instances(model, fields):
+ """
+ Run :func:`torch.jit.script` on a model that uses the :class:`Instances` class. Since
+    attributes of :class:`Instances` are "dynamically" added in eager mode, it is difficult
+ for scripting to support it out of the box. This function is made to support scripting
+ a model that uses :class:`Instances`. It does the following:
+
+ 1. Create a scriptable ``new_Instances`` class which behaves similarly to ``Instances``,
+       but with all attributes being "static".
+ The attributes need to be statically declared in the ``fields`` argument.
+ 2. Register ``new_Instances``, and force scripting compiler to
+ use it when trying to compile ``Instances``.
+
+    After this function returns, the patching is reverted, and the user should be able
+    to script another model using different fields.
+
+ Example:
+ Assume that ``Instances`` in the model consist of two attributes named
+ ``proposal_boxes`` and ``objectness_logits`` with type :class:`Boxes` and
+ :class:`Tensor` respectively during inference. You can call this function like:
+ ::
+ fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
+ torchscipt_model = scripting_with_instances(model, fields)
+
+ Note:
+        It only supports models in evaluation mode.
+
+ Args:
+ model (nn.Module): The input model to be exported by scripting.
+ fields (Dict[str, type]): Attribute names and corresponding type that
+ ``Instances`` will use in the model. Note that all attributes used in ``Instances``
+ need to be added, regardless of whether they are inputs/outputs of the model.
+            Data types not defined in detectron2 are not supported for now.
+
+ Returns:
+ torch.jit.ScriptModule: the model in torchscript format
+ """
+ assert (
+ not model.training
+ ), "Currently we only support exporting models in evaluation mode to torchscript"
+
+ with freeze_training_mode(model), patch_instances(fields):
+ scripted_model = torch.jit.script(model)
+ return scripted_model
+
+
+# alias for old name
+export_torchscript_with_instances = scripting_with_instances
+
+
+def dump_torchscript_IR(model, dir):
+ """
+    Dump IR of a TracedModule/ScriptModule/Function in various formats (code, graph,
+ inlined graph). Useful for debugging.
+
+ Args:
+        model (TracedModule/ScriptModule/ScriptFunction): traced or scripted module
+ dir (str): output directory to dump files.
+ """
+ dir = os.path.expanduser(dir)
+ PathManager.mkdirs(dir)
+
+ def _get_script_mod(mod):
+ if isinstance(mod, torch.jit.TracedModule):
+ return mod._actual_script_module
+ return mod
+
+ # Dump pretty-printed code: https://pytorch.org/docs/stable/jit.html#inspecting-code
+ with PathManager.open(os.path.join(dir, "model_ts_code.txt"), "w") as f:
+
+ def get_code(mod):
+ # Try a few ways to get code using private attributes.
+ try:
+ # This contains more information than just `mod.code`
+ return _get_script_mod(mod)._c.code
+ except AttributeError:
+ pass
+ try:
+ return mod.code
+ except AttributeError:
+ return None
+
+ def dump_code(prefix, mod):
+ code = get_code(mod)
+ name = prefix or "root model"
+ if code is None:
+ f.write(f"Could not found code for {name} (type={mod.original_name})\n")
+ f.write("\n")
+ else:
+ f.write(f"\nCode for {name}, type={mod.original_name}:\n")
+ f.write(code)
+ f.write("\n")
+ f.write("-" * 80)
+
+ for name, m in mod.named_children():
+ dump_code(prefix + "." + name, m)
+
+ if isinstance(model, torch.jit.ScriptFunction):
+ f.write(get_code(model))
+ else:
+ dump_code("", model)
+
+ def _get_graph(model):
+ try:
+ # Recursively dump IR of all modules
+ return _get_script_mod(model)._c.dump_to_str(True, False, False)
+ except AttributeError:
+ return model.graph.str()
+
+ with PathManager.open(os.path.join(dir, "model_ts_IR.txt"), "w") as f:
+ f.write(_get_graph(model))
+
+ # Dump IR of the entire graph (all submodules inlined)
+ with PathManager.open(os.path.join(dir, "model_ts_IR_inlined.txt"), "w") as f:
+ f.write(str(model.inlined_graph))
+
+ if not isinstance(model, torch.jit.ScriptFunction):
+ # Dump the model structure in pytorch style
+ with PathManager.open(os.path.join(dir, "model.txt"), "w") as f:
+ f.write(str(model))
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/export/torchscript_patch.py b/comfyui_controlnet_aux/src/custom_detectron2/export/torchscript_patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..669bc9d337d4e0f6c2cfee8d33ff3b4240392aa0
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/export/torchscript_patch.py
@@ -0,0 +1,406 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import os
+import sys
+import tempfile
+from contextlib import ExitStack, contextmanager
+from copy import deepcopy
+from unittest import mock
+import torch
+from torch import nn
+
+# need some explicit imports due to https://github.com/pytorch/pytorch/issues/38964
+import custom_detectron2 # noqa F401
+from custom_detectron2.structures import Boxes, Instances
+from custom_detectron2.utils.env import _import_file
+
+_counter = 0
+
+
+def _clear_jit_cache():
+ from torch.jit._recursive import concrete_type_store
+ from torch.jit._state import _jit_caching_layer
+
+ concrete_type_store.type_store.clear() # for modules
+ _jit_caching_layer.clear() # for free functions
+
+
+def _add_instances_conversion_methods(newInstances):
+ """
+    Add a from_instances method to the scripted Instances class.
+ """
+ cls_name = newInstances.__name__
+
+ @torch.jit.unused
+ def from_instances(instances: Instances):
+ """
+ Create scripted Instances from original Instances
+ """
+ fields = instances.get_fields()
+ image_size = instances.image_size
+ ret = newInstances(image_size)
+ for name, val in fields.items():
+ assert hasattr(ret, f"_{name}"), f"No attribute named {name} in {cls_name}"
+ setattr(ret, name, deepcopy(val))
+ return ret
+
+ newInstances.from_instances = from_instances
+
+
+@contextmanager
+def patch_instances(fields):
+ """
+ A contextmanager, under which the Instances class in detectron2 is replaced
+ by a statically-typed scriptable class, defined by `fields`.
+ See more in `scripting_with_instances`.
+ """
+
+ with tempfile.TemporaryDirectory(prefix="detectron2") as dir, tempfile.NamedTemporaryFile(
+ mode="w", encoding="utf-8", suffix=".py", dir=dir, delete=False
+ ) as f:
+ try:
+ # Objects that use Instances should not reuse previously-compiled
+ # results in cache, because `Instances` could be a new class each time.
+ _clear_jit_cache()
+
+ cls_name, s = _gen_instance_module(fields)
+ f.write(s)
+ f.flush()
+ f.close()
+
+ module = _import(f.name)
+ new_instances = getattr(module, cls_name)
+ _ = torch.jit.script(new_instances)
+ # let torchscript think Instances was scripted already
+ Instances.__torch_script_class__ = True
+ # let torchscript find new_instances when looking for the jit type of Instances
+ Instances._jit_override_qualname = torch._jit_internal._qualified_name(new_instances)
+
+ _add_instances_conversion_methods(new_instances)
+ yield new_instances
+ finally:
+ try:
+ del Instances.__torch_script_class__
+ del Instances._jit_override_qualname
+ except AttributeError:
+ pass
+ sys.modules.pop(module.__name__)
+
+
+def _gen_instance_class(fields):
+ """
+ Args:
+ fields (dict[name: type])
+ """
+
+ class _FieldType:
+ def __init__(self, name, type_):
+ assert isinstance(name, str), f"Field name must be str, got {name}"
+ self.name = name
+ self.type_ = type_
+ self.annotation = f"{type_.__module__}.{type_.__name__}"
+
+ fields = [_FieldType(k, v) for k, v in fields.items()]
+
+ def indent(level, s):
+ return " " * 4 * level + s
+
+ lines = []
+
+ global _counter
+ _counter += 1
+
+ cls_name = "ScriptedInstances{}".format(_counter)
+
+ field_names = tuple(x.name for x in fields)
+ extra_args = ", ".join([f"{f.name}: Optional[{f.annotation}] = None" for f in fields])
+ lines.append(
+ f"""
+class {cls_name}:
+ def __init__(self, image_size: Tuple[int, int], {extra_args}):
+ self.image_size = image_size
+ self._field_names = {field_names}
+"""
+ )
+
+ for f in fields:
+ lines.append(
+ indent(2, f"self._{f.name} = torch.jit.annotate(Optional[{f.annotation}], {f.name})")
+ )
+
+ for f in fields:
+ lines.append(
+ f"""
+ @property
+ def {f.name}(self) -> {f.annotation}:
+ # has to use a local for type refinement
+ # https://pytorch.org/docs/stable/jit_language_reference.html#optional-type-refinement
+ t = self._{f.name}
+ assert t is not None, "{f.name} is None and cannot be accessed!"
+ return t
+
+ @{f.name}.setter
+ def {f.name}(self, value: {f.annotation}) -> None:
+ self._{f.name} = value
+"""
+ )
+
+ # support method `__len__`
+ lines.append(
+ """
+ def __len__(self) -> int:
+"""
+ )
+ for f in fields:
+ lines.append(
+ f"""
+ t = self._{f.name}
+ if t is not None:
+ return len(t)
+"""
+ )
+ lines.append(
+ """
+ raise NotImplementedError("Empty Instances does not support __len__!")
+"""
+ )
+
+ # support method `has`
+ lines.append(
+ """
+ def has(self, name: str) -> bool:
+"""
+ )
+ for f in fields:
+ lines.append(
+ f"""
+ if name == "{f.name}":
+ return self._{f.name} is not None
+"""
+ )
+ lines.append(
+ """
+ return False
+"""
+ )
+
+ # support method `to`
+ none_args = ", None" * len(fields)
+ lines.append(
+ f"""
+ def to(self, device: torch.device) -> "{cls_name}":
+ ret = {cls_name}(self.image_size{none_args})
+"""
+ )
+ for f in fields:
+ if hasattr(f.type_, "to"):
+ lines.append(
+ f"""
+ t = self._{f.name}
+ if t is not None:
+ ret._{f.name} = t.to(device)
+"""
+ )
+ else:
+ # For now, ignore fields that cannot be moved to devices.
+ # Maybe can support other tensor-like classes (e.g. __torch_function__)
+ pass
+ lines.append(
+ """
+ return ret
+"""
+ )
+
+ # support method `getitem`
+ none_args = ", None" * len(fields)
+ lines.append(
+ f"""
+ def __getitem__(self, item) -> "{cls_name}":
+ ret = {cls_name}(self.image_size{none_args})
+"""
+ )
+ for f in fields:
+ lines.append(
+ f"""
+ t = self._{f.name}
+ if t is not None:
+ ret._{f.name} = t[item]
+"""
+ )
+ lines.append(
+ """
+ return ret
+"""
+ )
+
+ # support method `cat`
+ # this version does not contain checks that all instances have same size and fields
+ none_args = ", None" * len(fields)
+ lines.append(
+ f"""
+ def cat(self, instances: List["{cls_name}"]) -> "{cls_name}":
+ ret = {cls_name}(self.image_size{none_args})
+"""
+ )
+ for f in fields:
+ lines.append(
+ f"""
+ t = self._{f.name}
+ if t is not None:
+ values: List[{f.annotation}] = [x.{f.name} for x in instances]
+ if torch.jit.isinstance(t, torch.Tensor):
+ ret._{f.name} = torch.cat(values, dim=0)
+ else:
+ ret._{f.name} = t.cat(values)
+"""
+ )
+ lines.append(
+ """
+ return ret"""
+ )
+
+ # support method `get_fields()`
+ lines.append(
+ """
+ def get_fields(self) -> Dict[str, Tensor]:
+ ret = {}
+ """
+ )
+ for f in fields:
+ if f.type_ == Boxes:
+ stmt = "t.tensor"
+ elif f.type_ == torch.Tensor:
+ stmt = "t"
+ else:
+ stmt = f'assert False, "unsupported type {str(f.type_)}"'
+ lines.append(
+ f"""
+ t = self._{f.name}
+ if t is not None:
+ ret["{f.name}"] = {stmt}
+ """
+ )
+ lines.append(
+ """
+ return ret"""
+ )
+ return cls_name, os.linesep.join(lines)
+
+
+def _gen_instance_module(fields):
+ # TODO: find a more automatic way to enable import of other classes
+ s = """
+from copy import deepcopy
+import torch
+from torch import Tensor
+import typing
+from typing import *
+
+import custom_detectron2
+from custom_detectron2.structures import Boxes, Instances
+
+"""
+
+ cls_name, cls_def = _gen_instance_class(fields)
+ s += cls_def
+ return cls_name, s
+
+
+def _import(path):
+ return _import_file(
+ "{}{}".format(sys.modules[__name__].__name__, _counter), path, make_importable=True
+ )
+
+
+@contextmanager
+def patch_builtin_len(modules=()):
+ """
+ Patch the builtin len() function of a few detectron2 modules
+ to use __len__ instead, because __len__ does not convert values to
+ integers and therefore is friendly to tracing.
+
+ Args:
+ modules (list[str]): names of extra modules to patch len(), in
+ addition to those in detectron2.
+ """
+
+ def _new_len(obj):
+ return obj.__len__()
+
+ with ExitStack() as stack:
+ MODULES = [
+ "detectron2.modeling.roi_heads.fast_rcnn",
+ "detectron2.modeling.roi_heads.mask_head",
+ "detectron2.modeling.roi_heads.keypoint_head",
+ ] + list(modules)
+ ctxs = [stack.enter_context(mock.patch(mod + ".len")) for mod in MODULES]
+ for m in ctxs:
+ m.side_effect = _new_len
+ yield
+
+
+def patch_nonscriptable_classes():
+ """
+ Apply patches on a few nonscriptable detectron2 classes.
+ Should not have side-effects on eager usage.
+ """
+ # __prepare_scriptable__ can also be added to models for easier maintenance.
+ # But it complicates the clean model code.
+
+ from custom_detectron2.modeling.backbone import ResNet, FPN
+
+ # Due to https://github.com/pytorch/pytorch/issues/36061,
+ # we change backbone to use ModuleList for scripting.
+ # (note: this changes param names in state_dict)
+
+ def prepare_resnet(self):
+ ret = deepcopy(self)
+ ret.stages = nn.ModuleList(ret.stages)
+ for k in self.stage_names:
+ delattr(ret, k)
+ return ret
+
+ ResNet.__prepare_scriptable__ = prepare_resnet
+
+ def prepare_fpn(self):
+ ret = deepcopy(self)
+ ret.lateral_convs = nn.ModuleList(ret.lateral_convs)
+ ret.output_convs = nn.ModuleList(ret.output_convs)
+ for name, _ in self.named_children():
+ if name.startswith("fpn_"):
+ delattr(ret, name)
+ return ret
+
+ FPN.__prepare_scriptable__ = prepare_fpn
+
+ # Annotate some attributes to be constants for the purpose of scripting,
+ # even though they are not constants in eager mode.
+ from custom_detectron2.modeling.roi_heads import StandardROIHeads
+
+ if hasattr(StandardROIHeads, "__annotations__"):
+ # copy first to avoid editing annotations of base class
+ StandardROIHeads.__annotations__ = deepcopy(StandardROIHeads.__annotations__)
+ StandardROIHeads.__annotations__["mask_on"] = torch.jit.Final[bool]
+ StandardROIHeads.__annotations__["keypoint_on"] = torch.jit.Final[bool]
+
+
+# These patches are not supposed to have side-effects.
+patch_nonscriptable_classes()
+
+
+@contextmanager
+def freeze_training_mode(model):
+ """
+ A context manager that annotates the "training" attribute of every submodule
+ as a constant, so that the training codepath in these modules can be
+ meta-compiled away. Upon exiting, the annotations are reverted.
+ """
+ classes = {type(x) for x in model.modules()}
+ # __constants__ is the old way to annotate constants and not compatible
+ # with __annotations__ .
+ classes = {x for x in classes if not hasattr(x, "__constants__")}
+ for cls in classes:
+ cls.__annotations__["training"] = torch.jit.Final[bool]
+ yield
+ for cls in classes:
+ cls.__annotations__["training"] = bool
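
A minimal usage sketch (not part of this diff) of how `patch_instances` above is intended to be used when scripting a model that returns `Instances`; the model object `my_model` and the chosen field set are hypothetical, for illustration only.

```python
# Hypothetical sketch: scripting a model whose outputs are Instances.
import torch
from custom_detectron2.structures import Boxes, Instances
from custom_detectron2.export.torchscript_patch import patch_instances

fields = {"pred_boxes": Boxes, "scores": torch.Tensor}  # field name -> type

with patch_instances(fields) as NewInstances:
    # Inside this context, `Instances` is redirected to the statically-typed,
    # scriptable class generated from `fields`, so scripting can succeed.
    scripted = torch.jit.script(my_model)  # `my_model` is assumed to exist

    # from_instances() copies an eager Instances into the generated class.
    eager = Instances((480, 640))
    eager.scores = torch.rand(3)
    eager.pred_boxes = Boxes(torch.rand(3, 4))
    scripted_inst = NewInstances.from_instances(eager)
```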
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/__init__.py b/comfyui_controlnet_aux/src/custom_detectron2/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..341c9dbdd2477c6e19710e87a764b4d1e47676c8
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm, CycleBatchNormList
+from .deform_conv import DeformConv, ModulatedDeformConv
+from .mask_ops import paste_masks_in_image
+from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
+from .roi_align import ROIAlign, roi_align
+from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
+from .shape_spec import ShapeSpec
+from .wrappers import (
+ BatchNorm2d,
+ Conv2d,
+ ConvTranspose2d,
+ cat,
+ interpolate,
+ Linear,
+ nonzero_tuple,
+ cross_entropy,
+ empty_input_loss_func_wrapper,
+ shapes_to_tensor,
+ move_device_like,
+)
+from .blocks import CNNBlockBase, DepthwiseSeparableConv2d
+from .aspp import ASPP
+from .losses import ciou_loss, diou_loss
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
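
Because `__all__` is built from the module globals, the public layers can be imported directly from the package namespace; a small sketch (not part of this diff):

```python
# Sketch: importing re-exported layers from the vendored package.
from custom_detectron2.layers import Conv2d, ShapeSpec, batched_nms, get_norm
```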
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/aspp.py b/comfyui_controlnet_aux/src/custom_detectron2/layers/aspp.py
new file mode 100644
index 0000000000000000000000000000000000000000..f572338c880e033052ada48c1ce2cf6b59012bea
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/aspp.py
@@ -0,0 +1,144 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from copy import deepcopy
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .batch_norm import get_norm
+from .blocks import DepthwiseSeparableConv2d
+from .wrappers import Conv2d
+
+
+class ASPP(nn.Module):
+ """
+ Atrous Spatial Pyramid Pooling (ASPP).
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ dilations,
+ *,
+ norm,
+ activation,
+ pool_kernel_size=None,
+ dropout: float = 0.0,
+ use_depthwise_separable_conv=False,
+ ):
+ """
+ Args:
+ in_channels (int): number of input channels for ASPP.
+ out_channels (int): number of output channels.
+ dilations (list): a list of 3 dilations in ASPP.
+ norm (str or callable): normalization for all conv layers.
+ See :func:`layers.get_norm` for supported format. norm is
+ applied to all conv layers except the conv following
+ global average pooling.
+ activation (callable): activation function.
+ pool_kernel_size (tuple, list): the average pooling size (kh, kw)
+ for the image pooling layer in ASPP. If set to None, it always
+ performs global average pooling. If not None, the spatial shape of
+ the inputs to forward() must be divisible by it. It is recommended
+ to use a fixed input feature size in training, and set this
+ option to match this size, so that it performs global average
+ pooling in training, and the size of the pooling window stays
+ consistent in inference.
+ dropout (float): apply dropout on the output of ASPP. It is used in
+ the official DeepLab implementation with a rate of 0.1:
+ https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532 # noqa
+ use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d
+ for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`.
+ """
+ super(ASPP, self).__init__()
+ assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations))
+ self.pool_kernel_size = pool_kernel_size
+ self.dropout = dropout
+ use_bias = norm == ""
+ self.convs = nn.ModuleList()
+ # conv 1x1
+ self.convs.append(
+ Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ bias=use_bias,
+ norm=get_norm(norm, out_channels),
+ activation=deepcopy(activation),
+ )
+ )
+ weight_init.c2_xavier_fill(self.convs[-1])
+ # atrous convs
+ for dilation in dilations:
+ if use_depthwise_separable_conv:
+ self.convs.append(
+ DepthwiseSeparableConv2d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ padding=dilation,
+ dilation=dilation,
+ norm1=norm,
+ activation1=deepcopy(activation),
+ norm2=norm,
+ activation2=deepcopy(activation),
+ )
+ )
+ else:
+ self.convs.append(
+ Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ padding=dilation,
+ dilation=dilation,
+ bias=use_bias,
+ norm=get_norm(norm, out_channels),
+ activation=deepcopy(activation),
+ )
+ )
+ weight_init.c2_xavier_fill(self.convs[-1])
+ # image pooling
+ # We do not add BatchNorm because the spatial resolution is 1x1
+ # (the original TF implementation does use BatchNorm here).
+ if pool_kernel_size is None:
+ image_pooling = nn.Sequential(
+ nn.AdaptiveAvgPool2d(1),
+ Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
+ )
+ else:
+ image_pooling = nn.Sequential(
+ nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1),
+ Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
+ )
+ weight_init.c2_xavier_fill(image_pooling[1])
+ self.convs.append(image_pooling)
+
+ self.project = Conv2d(
+ 5 * out_channels,
+ out_channels,
+ kernel_size=1,
+ bias=use_bias,
+ norm=get_norm(norm, out_channels),
+ activation=deepcopy(activation),
+ )
+ weight_init.c2_xavier_fill(self.project)
+
+ def forward(self, x):
+ size = x.shape[-2:]
+ if self.pool_kernel_size is not None:
+ if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]:
+ raise ValueError(
+ "`pool_kernel_size` must be divisible by the shape of inputs. "
+ "Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size)
+ )
+ res = []
+ for conv in self.convs:
+ res.append(conv(x))
+ res[-1] = F.interpolate(res[-1], size=size, mode="bilinear", align_corners=False)
+ res = torch.cat(res, dim=1)
+ res = self.project(res)
+ res = F.dropout(res, self.dropout, training=self.training) if self.dropout > 0 else res
+ return res
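
A minimal usage sketch (not part of this diff) of the ASPP module defined above, assuming an NCHW feature map; the channel counts and dilations are illustrative.

```python
# Hypothetical sketch: building ASPP and running it on an NCHW feature map.
import torch
from torch import nn
from custom_detectron2.layers.aspp import ASPP

aspp = ASPP(
    in_channels=256,
    out_channels=128,
    dilations=[6, 12, 18],   # exactly 3 dilations are required
    norm="BN",
    activation=nn.ReLU(),
    pool_kernel_size=None,   # None -> global average pooling
    dropout=0.1,
)
x = torch.randn(2, 256, 32, 32)
out = aspp(x)                # -> shape (2, 128, 32, 32)
```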
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/batch_norm.py b/comfyui_controlnet_aux/src/custom_detectron2/layers/batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebfa104a01b79ae9a141d4634c6b3443610dfbc9
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/batch_norm.py
@@ -0,0 +1,300 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+import torch.distributed as dist
+from fvcore.nn.distributed import differentiable_all_reduce
+from torch import nn
+from torch.nn import functional as F
+
+from custom_detectron2.utils import comm, env
+
+from .wrappers import BatchNorm2d
+
+
+class FrozenBatchNorm2d(nn.Module):
+ """
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+ It contains non-trainable buffers called
+ "weight", "bias", "running_mean", and "running_var",
+ initialized to perform the identity transformation.
+
+ The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
+ which are computed from the original four parameters of BN.
+ The affine transform `x * weight + bias` will perform the equivalent
+ computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
+ When loading a backbone model from Caffe2, "running_mean" and "running_var"
+ will be left unchanged as identity transformation.
+
+ Other pre-trained backbone models may contain all 4 parameters.
+
+ The forward is implemented by `F.batch_norm(..., training=False)`.
+ """
+
+ _version = 3
+
+ def __init__(self, num_features, eps=1e-5):
+ super().__init__()
+ self.num_features = num_features
+ self.eps = eps
+ self.register_buffer("weight", torch.ones(num_features))
+ self.register_buffer("bias", torch.zeros(num_features))
+ self.register_buffer("running_mean", torch.zeros(num_features))
+ self.register_buffer("running_var", torch.ones(num_features) - eps)
+
+ def forward(self, x):
+ if x.requires_grad:
+ # When gradients are needed, F.batch_norm will use extra memory
+ # because its backward op computes gradients for weight/bias as well.
+ scale = self.weight * (self.running_var + self.eps).rsqrt()
+ bias = self.bias - self.running_mean * scale
+ scale = scale.reshape(1, -1, 1, 1)
+ bias = bias.reshape(1, -1, 1, 1)
+ out_dtype = x.dtype # may be half
+ return x * scale.to(out_dtype) + bias.to(out_dtype)
+ else:
+ # When gradients are not needed, F.batch_norm is a single fused op
+ # and provides more optimization opportunities.
+ return F.batch_norm(
+ x,
+ self.running_mean,
+ self.running_var,
+ self.weight,
+ self.bias,
+ training=False,
+ eps=self.eps,
+ )
+
+ def _load_from_state_dict(
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ ):
+ version = local_metadata.get("version", None)
+
+ if version is None or version < 2:
+ # No running_mean/var in early versions
+ # This will silence the warnings.
+ if prefix + "running_mean" not in state_dict:
+ state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
+ if prefix + "running_var" not in state_dict:
+ state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
+
+ super()._load_from_state_dict(
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ )
+
+ def __repr__(self):
+ return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
+
+ @classmethod
+ def convert_frozen_batchnorm(cls, module):
+ """
+ Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
+
+ Args:
+ module (torch.nn.Module):
+
+ Returns:
+ If module is BatchNorm/SyncBatchNorm, returns a new module.
+ Otherwise, in-place convert module and return it.
+
+ Similar to convert_sync_batchnorm in
+ https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
+ """
+ bn_module = nn.modules.batchnorm
+ bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
+ res = module
+ if isinstance(module, bn_module):
+ res = cls(module.num_features)
+ if module.affine:
+ res.weight.data = module.weight.data.clone().detach()
+ res.bias.data = module.bias.data.clone().detach()
+ res.running_mean.data = module.running_mean.data
+ res.running_var.data = module.running_var.data
+ res.eps = module.eps
+ else:
+ for name, child in module.named_children():
+ new_child = cls.convert_frozen_batchnorm(child)
+ if new_child is not child:
+ res.add_module(name, new_child)
+ return res
+
+
+def get_norm(norm, out_channels):
+ """
+ Args:
+ norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
+ or a callable that takes a channel number and returns
+ the normalization layer as a nn.Module.
+
+ Returns:
+ nn.Module or None: the normalization layer
+ """
+ if norm is None:
+ return None
+ if isinstance(norm, str):
+ if len(norm) == 0:
+ return None
+ norm = {
+ "BN": BatchNorm2d,
+ # Fixed in https://github.com/pytorch/pytorch/pull/36382
+ "SyncBN": NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
+ "FrozenBN": FrozenBatchNorm2d,
+ "GN": lambda channels: nn.GroupNorm(32, channels),
+ # for debugging:
+ "nnSyncBN": nn.SyncBatchNorm,
+ "naiveSyncBN": NaiveSyncBatchNorm,
+ # expose stats_mode N as an option to caller, required for zero-len inputs
+ "naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"),
+ "LN": lambda channels: LayerNorm(channels),
+ }[norm]
+ return norm(out_channels)
+
+
+class NaiveSyncBatchNorm(BatchNorm2d):
+ """
+ In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
+ when the batch size on each worker is different.
+ (e.g., when scale augmentation is used, or when it is applied to mask head).
+
+ This is a slower but correct alternative to `nn.SyncBatchNorm`.
+
+ Note:
+ There isn't a single definition of Sync BatchNorm.
+
+ When ``stats_mode==""``, this module computes overall statistics by using
+ statistics of each worker with equal weight. The result is true statistics
+ of all samples (as if they are all on one worker) only when all workers
+ have the same (N, H, W). This mode does not support inputs with zero batch size.
+
+ When ``stats_mode=="N"``, this module computes overall statistics by weighting
+ the statistics of each worker by their ``N``. The result is true statistics
+ of all samples (as if they are all on one worker) only when all workers
+ have the same (H, W). It is slower than ``stats_mode==""``.
+
+ Even though the result of this module may not be the true statistics of all samples,
+ it may still be reasonable because it might be preferable to assign equal weights
+ to all workers, regardless of their (H, W) dimension, instead of putting larger weight
+ on larger images. From preliminary experiments, little difference is found between such
+ a simplified implementation and an accurate computation of overall mean & variance.
+ """
+
+ def __init__(self, *args, stats_mode="", **kwargs):
+ super().__init__(*args, **kwargs)
+ assert stats_mode in ["", "N"]
+ self._stats_mode = stats_mode
+
+ def forward(self, input):
+ if comm.get_world_size() == 1 or not self.training:
+ return super().forward(input)
+
+ B, C = input.shape[0], input.shape[1]
+
+ half_input = input.dtype == torch.float16
+ if half_input:
+ # fp16 does not have good enough numerics for the reduction here
+ input = input.float()
+ mean = torch.mean(input, dim=[0, 2, 3])
+ meansqr = torch.mean(input * input, dim=[0, 2, 3])
+
+ if self._stats_mode == "":
+ assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
+ vec = torch.cat([mean, meansqr], dim=0)
+ vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
+ mean, meansqr = torch.split(vec, C)
+ momentum = self.momentum
+ else:
+ if B == 0:
+ vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
+ vec = vec + input.sum() # make sure there is gradient w.r.t input
+ else:
+ vec = torch.cat(
+ [mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0
+ )
+ vec = differentiable_all_reduce(vec * B)
+
+ total_batch = vec[-1].detach()
+ momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
+ mean, meansqr, _ = torch.split(vec / total_batch.clamp(min=1), C) # avoid div-by-zero
+
+ var = meansqr - mean * mean
+ invstd = torch.rsqrt(var + self.eps)
+ scale = self.weight * invstd
+ bias = self.bias - mean * scale
+ scale = scale.reshape(1, -1, 1, 1)
+ bias = bias.reshape(1, -1, 1, 1)
+
+ self.running_mean += momentum * (mean.detach() - self.running_mean)
+ self.running_var += momentum * (var.detach() - self.running_var)
+ ret = input * scale + bias
+ if half_input:
+ ret = ret.half()
+ return ret
+
+
+class CycleBatchNormList(nn.ModuleList):
+ """
+ Implement domain-specific BatchNorm by cycling.
+
+ When a BatchNorm layer is used for multiple input domains or input
+ features, it might need to maintain separate test-time statistics
+ for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`.
+
+ This module implements it by using N separate BN layers
+ and it cycles through them every time a forward() is called.
+
+ NOTE: The caller of this module MUST guarantee to always call
+ this module a multiple of N times. Otherwise its test-time statistics
+ will be incorrect.
+ """
+
+ def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs):
+ """
+ Args:
+ length: number of BatchNorm layers to cycle.
+ bn_class: the BatchNorm class to use
+ kwargs: arguments of the BatchNorm class, such as num_features.
+ """
+ self._affine = kwargs.pop("affine", True)
+ super().__init__([bn_class(**kwargs, affine=False) for k in range(length)])
+ if self._affine:
+ # shared affine, domain-specific BN
+ channels = self[0].num_features
+ self.weight = nn.Parameter(torch.ones(channels))
+ self.bias = nn.Parameter(torch.zeros(channels))
+ self._pos = 0
+
+ def forward(self, x):
+ ret = self[self._pos](x)
+ self._pos = (self._pos + 1) % len(self)
+
+ if self._affine:
+ w = self.weight.reshape(1, -1, 1, 1)
+ b = self.bias.reshape(1, -1, 1, 1)
+ return ret * w + b
+ else:
+ return ret
+
+ def extra_repr(self):
+ return f"affine={self._affine}"
+
+
+class LayerNorm(nn.Module):
+ """
+ A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
+ variance normalization over the channel dimension for inputs that have shape
+ (batch_size, channels, height, width).
+ https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950
+ """
+
+ def __init__(self, normalized_shape, eps=1e-6):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
+ self.eps = eps
+ self.normalized_shape = (normalized_shape,)
+
+ def forward(self, x):
+ u = x.mean(1, keepdim=True)
+ s = (x - u).pow(2).mean(1, keepdim=True)
+ x = (x - u) / torch.sqrt(s + self.eps)
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
+ return x
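
A short usage sketch (not part of this diff) of the normalization helpers above: `get_norm` maps a short name to a constructed layer, and `FrozenBatchNorm2d.convert_frozen_batchnorm` recursively replaces BatchNorm layers in an existing model; the toy model here is an assumption.

```python
# Hypothetical sketch: choosing a norm layer and freezing BatchNorm statistics.
import torch
from torch import nn
from custom_detectron2.layers.batch_norm import FrozenBatchNorm2d, get_norm

gn = get_norm("GN", 64)          # -> nn.GroupNorm(32, 64)
frozen = get_norm("FrozenBN", 64)  # -> FrozenBatchNorm2d(64)

# Recursively replace BatchNorm2d/SyncBatchNorm with FrozenBatchNorm2d.
model = nn.Sequential(nn.Conv2d(3, 64, 3), nn.BatchNorm2d(64), nn.ReLU())
model = FrozenBatchNorm2d.convert_frozen_batchnorm(model)
```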
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/blocks.py b/comfyui_controlnet_aux/src/custom_detectron2/layers/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..e480f5cea587bea00d960925a669befa7ec67cbb
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/blocks.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+
+from .batch_norm import FrozenBatchNorm2d, get_norm
+from .wrappers import Conv2d
+
+
+"""
+CNN building blocks.
+"""
+
+
+class CNNBlockBase(nn.Module):
+ """
+ A CNN block is assumed to have input channels, output channels and a stride.
+ The input and output of the `forward()` method must be NCHW tensors.
+ The method can perform arbitrary computation but must match the given
+ channels and stride specification.
+
+ Attributes:
+ in_channels (int):
+ out_channels (int):
+ stride (int):
+ """
+
+ def __init__(self, in_channels, out_channels, stride):
+ """
+ The `__init__` method of any subclass should also contain these arguments.
+
+ Args:
+ in_channels (int):
+ out_channels (int):
+ stride (int):
+ """
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.stride = stride
+
+ def freeze(self):
+ """
+ Make this block not trainable.
+ This method sets all parameters to `requires_grad=False`,
+ and converts all BatchNorm layers to FrozenBatchNorm.
+
+ Returns:
+ the block itself
+ """
+ for p in self.parameters():
+ p.requires_grad = False
+ FrozenBatchNorm2d.convert_frozen_batchnorm(self)
+ return self
+
+
+class DepthwiseSeparableConv2d(nn.Module):
+ """
+ A kxk depthwise convolution + a 1x1 convolution.
+
+ In :paper:`xception`, norm & activation are applied on the second conv.
+ :paper:`mobilenet` uses norm & activation on both convs.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ padding=1,
+ dilation=1,
+ *,
+ norm1=None,
+ activation1=None,
+ norm2=None,
+ activation2=None,
+ ):
+ """
+ Args:
+ norm1, norm2 (str or callable): normalization for the two conv layers.
+ activation1, activation2 (callable(Tensor) -> Tensor): activation
+ function for the two conv layers.
+ """
+ super().__init__()
+ self.depthwise = Conv2d(
+ in_channels,
+ in_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ dilation=dilation,
+ groups=in_channels,
+ bias=not norm1,
+ norm=get_norm(norm1, in_channels),
+ activation=activation1,
+ )
+ self.pointwise = Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ bias=not norm2,
+ norm=get_norm(norm2, out_channels),
+ activation=activation2,
+ )
+
+ # default initialization
+ weight_init.c2_msra_fill(self.depthwise)
+ weight_init.c2_msra_fill(self.pointwise)
+
+ def forward(self, x):
+ return self.pointwise(self.depthwise(x))
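
A minimal usage sketch (not part of this diff) of the DepthwiseSeparableConv2d block above, with norm and activation applied on both convs; channel counts are illustrative.

```python
# Hypothetical sketch: a depthwise-separable 3x3 conv block.
import torch
from torch import nn
from custom_detectron2.layers.blocks import DepthwiseSeparableConv2d

block = DepthwiseSeparableConv2d(
    in_channels=64,
    out_channels=128,
    kernel_size=3,
    padding=1,
    norm1="BN", activation1=nn.ReLU(),
    norm2="BN", activation2=nn.ReLU(),
)
y = block(torch.randn(1, 64, 56, 56))   # -> shape (1, 128, 56, 56)
```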
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/README.md b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..56a931abadeda8cc044dd243425fc88457208615
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/README.md
@@ -0,0 +1,7 @@
+
+
+To add a new Op:
+
+1. Create a new directory
+2. Implement new ops there
+3. Declare its Python interface in `vision.cpp`.
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
new file mode 100644
index 0000000000000000000000000000000000000000..220146b533bf6faeae041dced48508202bd1e4ac
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
@@ -0,0 +1,115 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor ROIAlignRotated_forward_cpu(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cpu(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio);
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+at::Tensor ROIAlignRotated_forward_cuda(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cuda(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio);
+#endif
+
+// Interface for Python
+inline at::Tensor ROIAlignRotated_forward(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const double spatial_scale,
+ const int64_t pooled_height,
+ const int64_t pooled_width,
+ const int64_t sampling_ratio) {
+ if (input.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+ return ROIAlignRotated_forward_cuda(
+ input,
+ rois,
+ spatial_scale,
+ pooled_height,
+ pooled_width,
+ sampling_ratio);
+#else
+ AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+ }
+ return ROIAlignRotated_forward_cpu(
+ input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+inline at::Tensor ROIAlignRotated_backward(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const double spatial_scale,
+ const int64_t pooled_height,
+ const int64_t pooled_width,
+ const int64_t batch_size,
+ const int64_t channels,
+ const int64_t height,
+ const int64_t width,
+ const int64_t sampling_ratio) {
+ if (grad.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+ return ROIAlignRotated_backward_cuda(
+ grad,
+ rois,
+ spatial_scale,
+ pooled_height,
+ pooled_width,
+ batch_size,
+ channels,
+ height,
+ width,
+ sampling_ratio);
+#else
+ AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+ }
+ return ROIAlignRotated_backward_cpu(
+ grad,
+ rois,
+ spatial_scale,
+ pooled_height,
+ pooled_width,
+ batch_size,
+ channels,
+ height,
+ width,
+ sampling_ratio);
+}
+
+} // namespace detectron2
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..341f494c1f29003efe34ba3b6eae88ffd1b1f328
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
@@ -0,0 +1,522 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/TensorUtils.h>
+#include "ROIAlignRotated.h"
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those is that
+// we don't apply the "legacy offset" here, as there aren't many previous
+// works, if any, that use the "legacy" ROIAlignRotated Op.
+// This keeps the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+template <typename T>
+struct PreCalc {
+ int pos1;
+ int pos2;
+ int pos3;
+ int pos4;
+ T w1;
+ T w2;
+ T w3;
+ T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int iy_upper,
+ const int ix_upper,
+ T roi_start_h,
+ T roi_start_w,
+ T bin_size_h,
+ T bin_size_w,
+ int roi_bin_grid_h,
+ int roi_bin_grid_w,
+ T roi_center_h,
+ T roi_center_w,
+ T cos_theta,
+ T sin_theta,
+ std::vector<PreCalc<T>>& pre_calc) {
+ int pre_calc_index = 0;
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ for (int iy = 0; iy < iy_upper; iy++) {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < ix_upper; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ // In image space, (y, x) is the order for Right Handed System,
+ // and this is essentially multiplying the point by a rotation matrix
+ // to rotate it counterclockwise through angle theta.
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+ // handle cases where the inverse-mapped point falls outside the feature map
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ PreCalc<T> pc;
+ pc.pos1 = 0;
+ pc.pos2 = 0;
+ pc.pos3 = 0;
+ pc.pos4 = 0;
+ pc.w1 = 0;
+ pc.w2 = 0;
+ pc.w3 = 0;
+ pc.w4 = 0;
+ pre_calc[pre_calc_index] = pc;
+ pre_calc_index += 1;
+ continue;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+ if (x < 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ // save weights and indices
+ PreCalc<T> pc;
+ pc.pos1 = y_low * width + x_low;
+ pc.pos2 = y_low * width + x_high;
+ pc.pos3 = y_high * width + x_low;
+ pc.pos4 = y_high * width + x_high;
+ pc.w1 = w1;
+ pc.w2 = w2;
+ pc.w3 = w3;
+ pc.w4 = w4;
+ pre_calc[pre_calc_index] = pc;
+
+ pre_calc_index += 1;
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void bilinear_interpolate_gradient(
+ const int height,
+ const int width,
+ T y,
+ T x,
+ T& w1,
+ T& w2,
+ T& w3,
+ T& w4,
+ int& x_low,
+ int& x_high,
+ int& y_low,
+ int& y_high) {
+ // handle cases where the inverse-mapped point falls outside the feature map
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+
+ if (x < 0) {
+ x = 0;
+ }
+
+ y_low = (int)y;
+ x_low = (int)x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = input[y_low * width + x_low];
+ // T v2 = input[y_low * width + x_high];
+ // T v3 = input[y_high * width + x_low];
+ // T v4 = input[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+
+template <typename T>
+inline void add(T* address, const T& val) {
+ *address += val;
+}
+
+} // namespace
+
+template <typename T>
+void ROIAlignRotatedForward(
+ const int nthreads,
+ const T* input,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* rois,
+ T* output) {
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
+ // (n, c, ph, pw) is an element in the pooled output
+ // can be parallelized using omp
+ // #pragma omp parallel for num_threads(32)
+ for (int n = 0; n < n_rois; n++) {
+ int index_n = n * channels * pooled_width * pooled_height;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ AT_ASSERTM(
+ roi_width >= 0 && roi_height >= 0,
+ "ROIs in ROIAlignRotated do not have non-negative size!");
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+ // we want to precalculate indices and weights shared by all channels,
+ // this is the key point of optimization
+ std::vector<PreCalc<T>> pre_calc(
+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ pre_calc_for_bilinear_interpolate(
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_start_h,
+ roi_start_w,
+ bin_size_h,
+ bin_size_w,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_center_h,
+ roi_center_w,
+ cos_theta,
+ sin_theta,
+ pre_calc);
+
+ for (int c = 0; c < channels; c++) {
+ int index_n_c = index_n + c * pooled_width * pooled_height;
+ const T* offset_input =
+ input + (roi_batch_ind * channels + c) * height * width;
+ int pre_calc_index = 0;
+
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ int index = index_n_c + ph * pooled_width + pw;
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ PreCalc<T> pc = pre_calc[pre_calc_index];
+ output_val += pc.w1 * offset_input[pc.pos1] +
+ pc.w2 * offset_input[pc.pos2] +
+ pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+ pre_calc_index += 1;
+ }
+ }
+ output_val /= count;
+
+ output[index] = output_val;
+ } // for pw
+ } // for ph
+ } // for c
+ } // for n
+}
+
+template <typename T>
+void ROIAlignRotatedBackward(
+ const int nthreads,
+ // may not be contiguous. should index using n_stride, etc
+ const T* grad_output,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ T* grad_input,
+ const T* rois,
+ const int n_stride,
+ const int c_stride,
+ const int h_stride,
+ const int w_stride) {
+ for (int index = 0; index < nthreads; index++) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ AT_ASSERTM(
+ roi_width >= 0 && roi_height >= 0,
+ "ROIs in ROIAlignRotated do not have non-negative size!");
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T* offset_grad_input =
+ grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+ int output_offset = n * n_stride + c * c_stride;
+ const T* offset_grad_output = grad_output + output_offset;
+ const T grad_output_this_bin =
+ offset_grad_output[ph * h_stride + pw * w_stride];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(
+ height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+ T g1 = grad_output_this_bin * w1 / count;
+ T g2 = grad_output_this_bin * w2 / count;
+ T g3 = grad_output_this_bin * w3 / count;
+ T g4 = grad_output_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ // atomic add is not needed for now since it is single threaded
+ add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+ add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+ add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+ add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // for
+} // ROIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cpu(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
+ AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+ at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+ at::CheckedFrom c = "ROIAlign_forward_cpu";
+ at::checkAllSameType(c, {input_t, rois_t});
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ at::Tensor output = at::zeros(
+ {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+ if (output.numel() == 0) {
+ return output;
+ }
+
+ auto input_ = input.contiguous(), rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ input.scalar_type(), "ROIAlignRotated_forward", [&] {
+ ROIAlignRotatedForward<scalar_t>(
+ output_size,
+ input_.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois_.data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ return output;
+}
+
+at::Tensor ROIAlignRotated_backward_cpu(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
+ AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+ at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+ at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
+ at::checkAllSameType(c, {grad_t, rois_t});
+
+ at::Tensor grad_input =
+ at::zeros({batch_size, channels, height, width}, grad.options());
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ return grad_input;
+ }
+
+ // get stride values to ensure indexing into gradients is correct.
+ int n_stride = grad.stride(0);
+ int c_stride = grad.stride(1);
+ int h_stride = grad.stride(2);
+ int w_stride = grad.stride(3);
+
+ auto rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ grad.scalar_type(), "ROIAlignRotated_backward", [&] {
+ ROIAlignRotatedBackward<scalar_t>(
+ grad.numel(),
+ grad.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ grad_input.data_ptr<scalar_t>(),
+ rois_.data_ptr<scalar_t>(),
+ n_stride,
+ c_stride,
+ h_stride,
+ w_stride);
+ });
+ return grad_input;
+}
+
+} // namespace detectron2
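
As a reading aid for the kernel above, here is a small Python/NumPy reference sketch (not part of this diff, and not the compiled op) of the per-point rotate-then-bilinear-sample step that the forward pass performs; the function name, single-point scope, and example values are illustrative assumptions.

```python
# Illustrative sketch only: one sampling point of ROIAlignRotated, mirroring
# the rotate-by-theta-around-the-ROI-center logic in the C++ kernel above.
import math
import numpy as np

def sample_rotated_point(feat, roi, yy, xx, spatial_scale=1.0):
    """feat: (H, W) array. roi: (batch_idx, cx, cy, w, h, angle_degrees).
    (yy, xx) is an offset from the ROI center in the un-rotated ROI frame."""
    _, cx, cy, _, _, angle = roi
    cx = cx * spatial_scale - 0.5          # continuous coordinates (0.5 offset)
    cy = cy * spatial_scale - 0.5
    theta = angle * math.pi / 180.0
    cos_t, sin_t = math.cos(theta), math.sin(theta)
    # Rotate (yy, xx) counterclockwise by theta, then translate to the center.
    y = yy * cos_t - xx * sin_t + cy
    x = yy * sin_t + xx * cos_t + cx

    H, W = feat.shape
    if y < -1.0 or y > H or x < -1.0 or x > W:
        return 0.0                         # sample falls outside the feature map
    y, x = max(y, 0.0), max(x, 0.0)

    def low_high(v, size):
        low = int(v)
        if low >= size - 1:                # clamp to the last valid cell
            return size - 1, size - 1, 0.0
        return low, low + 1, v - low       # low, high, fractional part

    y0, y1, ly = low_high(y, H)
    x0, x1, lx = low_high(x, W)
    hy, hx = 1.0 - ly, 1.0 - lx
    # Bilinear interpolation with the same four weights as the kernel.
    return (hy * hx * feat[y0, x0] + hy * lx * feat[y0, x1]
            + ly * hx * feat[y1, x0] + ly * lx * feat[y1, x1])

# Example: a 4x4 feature map, ROI centered at (2, 2), rotated by 30 degrees.
feat = np.arange(16, dtype=np.float64).reshape(4, 4)
val = sample_rotated_point(feat, (0, 2.0, 2.0, 2.0, 2.0, 30.0), yy=-0.5, xx=0.5)
```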
diff --git a/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e55c572547e455c87b827f6781f1dbc306d368c
--- /dev/null
+++ b/comfyui_controlnet_aux/src/custom_detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
@@ -0,0 +1,443 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those is that
+// we don't apply the "legacy offset" here, as there aren't many previous
+// works, if any, that use the "legacy" ROIAlignRotated Op.
+// This keeps the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+__device__ T bilinear_interpolate(
+ const T* input,
+ const int height,
+ const int width,
+ T y,
+ T x) {
+ // handle cases where the inverse-mapped point falls outside the feature map
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ return 0;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+
+ if (x < 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = input[y_low * width + x_low];
+ T v2 = input[y_low * width + x_high];
+ T v3 = input[y_high * width + x_low];
+ T v4 = input[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+ const int height,
+ const int width,
+ T y,
+ T x,
+ T& w1,
+ T& w2,
+ T& w3,
+ T& w4,
+ int& x_low,
+ int& x_high,
+ int& y_low,
+ int& y_high) {
+ // handle cases where the inverse-mapped point falls outside the feature map
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+
+ if (x < 0) {
+ x = 0;
+ }
+
+ y_low = (int)y;
+ x_low = (int)x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = input[y_low * width + x_low];
+ // T v2 = input[y_low * width + x_high];
+ // T v3 = input[y_high * width + x_low];
+ // T v4 = input[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+
+} // namespace
+
+template <typename T>
+__global__ void RoIAlignRotatedForward(
+ const int nthreads,
+ const T* input,
+ const T spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* rois,
+ T* top_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_input =
+ input + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ // We do average (integral) pooling inside a bin
+ const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+ {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+ T val = bilinear_interpolate(offset_input, height, width, y, x);
+ output_val += val;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ }
+}
+
+template <typename T>
+__global__ void RoIAlignRotatedBackwardFeature(
+ const int nthreads,
+ const T* top_diff,
+ const int num_rois,
+ const T spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ T* bottom_diff,
+ const T* rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T* offset_bottom_diff =
+ bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+ {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(
+ height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+ T g1 = top_diff_this_bin * w1 / count;
+ T g2 = top_diff_this_bin * w2 / count;
+ T g3 = top_diff_this_bin * w3 / count;
+ T g4 = top_diff_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(
+ offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+ atomicAdd(
+ offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+ atomicAdd(
+ offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+ atomicAdd(
+ offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cuda(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+ at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+ at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
+ at::checkAllSameGPU(c, {input_t, rois_t});
+ at::checkAllSameType(c, {input_t, rois_t});
+ at::cuda::CUDAGuard device_guard(input.device());
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty(
+ {num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(
+ at::cuda::ATenCeilDiv(
+ static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+ static_cast<int64_t>(4096)));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ AT_CUDA_CHECK(cudaGetLastError());
+ return output;
+ }
+
+ auto input_ = input.contiguous(), rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES(
+ input.scalar_type(), "ROIAlignRotated_forward", [&] {
+ RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+ input_.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois_.data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ cudaDeviceSynchronize();
+ AT_CUDA_CHECK(cudaGetLastError());
+ return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIAlignRotated_backward_cuda(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+ at::CheckedFrom c = "ROIAlign_backward_cuda";
+ at::checkAllSameGPU(c, {grad_t, rois_t});
+ at::checkAllSameType(c, {grad_t, rois_t});
+ at::cuda::CUDAGuard device_guard(grad.device());
+
+ auto num_rois = rois.size(0);
+ auto grad_input =
+ at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(
+ at::cuda::ATenCeilDiv(
+ static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+ static_cast<int64_t>(4096)));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ AT_CUDA_CHECK(cudaGetLastError());
+ return grad_input;
+ }
+
+ auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES(
+ grad.scalar_type(), "ROIAlignRotated_backward", [&] {
+ RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+ grad_.data_ptr