object-to-object-replace

Running

App Files Files Community

nikunjkdtechnoland committed on Mar 6, 2024

Commit

4b98c85

1 Parent(s): e041d7d

some more add more files

Browse files

Files changed (18) hide show

iopaint/file_manager/utils.py +65 -0
iopaint/model/anytext/ldm/modules/diffusionmodules/upscaling.py +81 -0
iopaint/model/anytext/ldm/modules/diffusionmodules/util.py +271 -0
iopaint/model/anytext/ldm/util.py +197 -0
iopaint/model/anytext/utils.py +151 -0
iopaint/model/original_sd_configs/v1-inference.yaml +70 -0
iopaint/model/original_sd_configs/v2-inference-v.yaml +68 -0
iopaint/model/utils.py +1033 -0
iopaint/model/zits.py +476 -0
iopaint/plugins/segment_anything/modeling/tiny_vit_sam.py +822 -0
iopaint/plugins/segment_anything/modeling/transformer.py +240 -0
iopaint/plugins/segment_anything/utils/transforms.py +112 -0
iopaint/tests/test_sdxl.py +172 -0
iopaint/tests/utils.py +77 -0
iopaint/web_config.py +307 -0
pretrained-model/version.txt +1 -0
pretrained-model/version_diffusers_cache.txt +1 -0
utils/tools.py +505 -0

iopaint/file_manager/utils.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copy from: https://github.com/silentsokolov/flask-thumbnails/blob/master/flask_thumbnails/utils.py
+import hashlib
+from pathlib import Path
+from typing import Union
+def generate_filename(directory: Path, original_filename, *options) -> str:
+    text = str(directory.absolute()) + original_filename
+    for v in options:
+        text += "%s" % v
+    md5_hash = hashlib.md5()
+    md5_hash.update(text.encode("utf-8"))
+    return md5_hash.hexdigest() + ".jpg"
+def parse_size(size):
+    if isinstance(size, int):
+        # If the size parameter is a single number, assume square aspect.
+        return [size, size]
+    if isinstance(size, (tuple, list)):
+        if len(size) == 1:
+            # If single value tuple/list is provided, exand it to two elements
+            return size + type(size)(size)
+        return size
+    try:
+        thumbnail_size = [int(x) for x in size.lower().split("x", 1)]
+    except ValueError:
+        raise ValueError(  # pylint: disable=raise-missing-from
+            "Bad thumbnail size format. Valid format is INTxINT."
+        )
+    if len(thumbnail_size) == 1:
+        # If the size parameter only contains a single integer, assume square aspect.
+        thumbnail_size.append(thumbnail_size[0])
+    return thumbnail_size
+def aspect_to_string(size):
+    if isinstance(size, str):
+        return size
+    return "x".join(map(str, size))
+IMG_SUFFIX = {".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"}
+def glob_img(p: Union[Path, str], recursive: bool = False):
+    p = Path(p)
+    if p.is_file() and p.suffix in IMG_SUFFIX:
+        yield p
+    else:
+        if recursive:
+            files = Path(p).glob("**/*.*")
+        else:
+            files = Path(p).glob("*.*")
+        for it in files:
+            if it.suffix not in IMG_SUFFIX:
+                continue
+            yield it

iopaint/model/anytext/ldm/modules/diffusionmodules/upscaling.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import torch
+import torch.nn as nn
+import numpy as np
+from functools import partial
+from iopaint.model.anytext.ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule
+from iopaint.model.anytext.ldm.util import default
+class AbstractLowScaleModel(nn.Module):
+    # for concatenating a downsampled image to the latent representation
+    def __init__(self, noise_schedule_config=None):
+        super(AbstractLowScaleModel, self).__init__()
+        if noise_schedule_config is not None:
+            self.register_schedule(**noise_schedule_config)
+    def register_schedule(self, beta_schedule="linear", timesteps=1000,
+                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+        betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
+                                   cosine_s=cosine_s)
+        alphas = 1. - betas
+        alphas_cumprod = np.cumprod(alphas, axis=0)
+        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
+        timesteps, = betas.shape
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
+        to_torch = partial(torch.tensor, dtype=torch.float32)
+        self.register_buffer('betas', to_torch(betas))
+        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+        self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
+        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
+        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
+        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
+    def q_sample(self, x_start, t, noise=None):
+        noise = default(noise, lambda: torch.randn_like(x_start))
+        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+    def forward(self, x):
+        return x, None
+    def decode(self, x):
+        return x
+class SimpleImageConcat(AbstractLowScaleModel):
+    # no noise level conditioning
+    def __init__(self):
+        super(SimpleImageConcat, self).__init__(noise_schedule_config=None)
+        self.max_noise_level = 0
+    def forward(self, x):
+        # fix to constant noise level
+        return x, torch.zeros(x.shape[0], device=x.device).long()
+class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel):
+    def __init__(self, noise_schedule_config, max_noise_level=1000, to_cuda=False):
+        super().__init__(noise_schedule_config=noise_schedule_config)
+        self.max_noise_level = max_noise_level
+    def forward(self, x, noise_level=None):
+        if noise_level is None:
+            noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long()
+        else:
+            assert isinstance(noise_level, torch.Tensor)
+        z = self.q_sample(x, noise_level)
+        return z, noise_level

iopaint/model/anytext/ldm/modules/diffusionmodules/util.py ADDED Viewed

	@@ -0,0 +1,271 @@

+# adopted from
+# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+# and
+# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+# and
+# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
+#
+# thanks!
+import os
+import math
+import torch
+import torch.nn as nn
+import numpy as np
+from einops import repeat
+from iopaint.model.anytext.ldm.util import instantiate_from_config
+def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+    if schedule == "linear":
+        betas = (
+                torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
+        )
+    elif schedule == "cosine":
+        timesteps = (
+                torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
+        )
+        alphas = timesteps / (1 + cosine_s) * np.pi / 2
+        alphas = torch.cos(alphas).pow(2)
+        alphas = alphas / alphas[0]
+        betas = 1 - alphas[1:] / alphas[:-1]
+        betas = np.clip(betas, a_min=0, a_max=0.999)
+    elif schedule == "sqrt_linear":
+        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
+    elif schedule == "sqrt":
+        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
+    else:
+        raise ValueError(f"schedule '{schedule}' unknown.")
+    return betas.numpy()
+def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
+    if ddim_discr_method == 'uniform':
+        c = num_ddpm_timesteps // num_ddim_timesteps
+        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
+    elif ddim_discr_method == 'quad':
+        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
+    else:
+        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
+    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
+    # add one to get the final alpha values right (the ones from first scale to data during sampling)
+    steps_out = ddim_timesteps + 1
+    if verbose:
+        print(f'Selected timesteps for ddim sampler: {steps_out}')
+    return steps_out
+def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
+    """
+    Select the cumulative alphas used by a DDIM sampler and derive its sigma schedule.
+    :param alphacums: indexable sequence of cumulative alpha values; slices taken from it
+                      must support ``.tolist()`` and ``.to(...)`` (presumably a torch
+                      tensor - TODO confirm against the caller).
+    :param ddim_timesteps: indices into ``alphacums`` selecting the DDIM steps.
+    :param eta: multiplier applied to the computed sigmas.
+    :param verbose: if True, print the selected alphas and the resulting sigma schedule.
+    :return: ``(sigmas, alphas, alphas_prev)``, each converted to float32.
+    """
+    # select alphas for computing the variance schedule
+    alphas = alphacums[ddim_timesteps]
+    # previous-step alphas: the first entry of alphacums followed by all selected values except the last
+    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
+    # according to the formula provided in https://arxiv.org/abs/2010.02502
+    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
+    if verbose:
+        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
+        print(f'For the chosen value of eta, which is {eta}, '
+              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
+    # NOTE(review): sigmas/alphas are converted with the tensor API (.to) while alphas_prev uses
+    # the NumPy API (.astype) - this relies on the mixed-type arithmetic above; confirm with caller.
+    return sigmas.to(torch.float32), alphas.to(torch.float32), alphas_prev.astype(np.float32)
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas)
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+def checkpoint(func, inputs, params, flag):
+    """
+    Evaluate a function without caching intermediate activations, allowing for
+    reduced memory at the expense of extra compute in the backward pass.
+    :param func: the function to evaluate.
+    :param inputs: the argument sequence to pass to `func`.
+    :param params: a sequence of parameters `func` depends on but does not
+                   explicitly take as arguments.
+    :param flag: if False, disable gradient checkpointing.
+    """
+    if flag:
+        args = tuple(inputs) + tuple(params)
+        return CheckpointFunction.apply(func, len(inputs), *args)
+    else:
+        return func(*inputs)
+class CheckpointFunction(torch.autograd.Function):
+    """Autograd function that runs a callable without grad in forward and re-runs it in backward."""
+    @staticmethod
+    def forward(ctx, run_function, length, *args):
+        """Run ``run_function`` on the first ``length`` entries of ``args`` under ``no_grad``.
+        The remaining entries of ``args`` are stored as the parameters to differentiate in backward.
+        """
+        ctx.run_function = run_function
+        # args is the flat concatenation of inputs and params; split it back at `length`
+        ctx.input_tensors = list(args[:length])
+        ctx.input_params = list(args[length:])
+        # remember the autocast settings active now so backward can recompute under the same ones
+        ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
+                                   "dtype": torch.get_autocast_gpu_dtype(),
+                                   "cache_enabled": torch.is_autocast_cache_enabled()}
+        with torch.no_grad():
+            output_tensors = ctx.run_function(*ctx.input_tensors)
+        return output_tensors
+    @staticmethod
+    def backward(ctx, *output_grads):
+        """Recompute the forward pass with grad enabled and return gradients for inputs and params."""
+        # detach the saved inputs and mark them as requiring grad for the recomputation
+        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+        with torch.enable_grad(), \
+                torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
+            # Fixes a bug where the first op in run_function modifies the
+            # Tensor storage in place, which is not allowed for detach()'d
+            # Tensors.
+            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
+            output_tensors = ctx.run_function(*shallow_copies)
+        # allow_unused=True: some inputs/params may not contribute to the outputs
+        input_grads = torch.autograd.grad(
+            output_tensors,
+            ctx.input_tensors + ctx.input_params,
+            output_grads,
+            allow_unused=True,
+        )
+        del ctx.input_tensors
+        del ctx.input_params
+        del output_tensors
+        # the two leading None entries correspond to the run_function and length arguments of forward
+        return (None, None) + input_grads
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+    """
+    Create sinusoidal timestep embeddings.
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    if not repeat_only:
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=timesteps.device)
+        args = timesteps[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    else:
+        embedding = repeat(timesteps, 'b -> b d', d=dim)
+    return embedding
+def zero_module(module):
+    """
+    Zero out the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+def scale_module(module, scale):
+    """
+    Scale the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().mul_(scale)
+    return module
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+def normalization(channels):
+    """
+    Make a standard normalization layer.
+    :param channels: number of input channels.
+    :return: an nn.Module for normalization.
+    """
+    return GroupNorm32(32, channels)
+# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
+class SiLU(nn.Module):
+    def forward(self, x):
+        return x * torch.sigmoid(x)
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):
+        # return super().forward(x.float()).type(x.dtype)
+        return super().forward(x).type(x.dtype)
+def conv_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D convolution module.
+    """
+    if dims == 1:
+        return nn.Conv1d(*args, **kwargs)
+    elif dims == 2:
+        return nn.Conv2d(*args, **kwargs)
+    elif dims == 3:
+        return nn.Conv3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+def linear(*args, **kwargs):
+    """
+    Create a linear module.
+    """
+    return nn.Linear(*args, **kwargs)
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D average pooling module.
+    """
+    if dims == 1:
+        return nn.AvgPool1d(*args, **kwargs)
+    elif dims == 2:
+        return nn.AvgPool2d(*args, **kwargs)
+    elif dims == 3:
+        return nn.AvgPool3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+class HybridConditioner(nn.Module):
+    def __init__(self, c_concat_config, c_crossattn_config):
+        super().__init__()
+        self.concat_conditioner = instantiate_from_config(c_concat_config)
+        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
+    def forward(self, c_concat, c_crossattn):
+        c_concat = self.concat_conditioner(c_concat)
+        c_crossattn = self.crossattn_conditioner(c_crossattn)
+        return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}
+def noise_like(shape, device, repeat=False):
+    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+    noise = lambda: torch.randn(shape, device=device)
+    return repeat_noise() if repeat else noise()

iopaint/model/anytext/ldm/util.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import importlib
+import torch
+from torch import optim
+import numpy as np
+from inspect import isfunction
+from PIL import Image, ImageDraw, ImageFont
+def log_txt_as_img(wh, xc, size=10):
+    # wh a tuple of (width, height)
+    # xc a list of captions to plot
+    b = len(xc)
+    txts = list()
+    for bi in range(b):
+        txt = Image.new("RGB", wh, color="white")
+        draw = ImageDraw.Draw(txt)
+        font = ImageFont.truetype('font/Arial_Unicode.ttf', size=size)
+        nc = int(32 * (wh[0] / 256))
+        lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
+        try:
+            draw.text((0, 0), lines, fill="black", font=font)
+        except UnicodeEncodeError:
+            print("Cant encode string for logging. Skipping.")
+        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
+        txts.append(txt)
+    txts = np.stack(txts)
+    txts = torch.tensor(txts)
+    return txts
+def ismap(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] > 3)
+def isimage(x):
+    if not isinstance(x,torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
+def exists(x):
+    return x is not None
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+def mean_flat(tensor):
+    """
+    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+def count_params(model, verbose=False):
+    total_params = sum(p.numel() for p in model.parameters())
+    if verbose:
+        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
+    return total_params
+def instantiate_from_config(config, **kwargs):
+    if "target" not in config:
+        if config == '__is_first_stage__':
+            return None
+        elif config == "__is_unconditional__":
+            return None
+        raise KeyError("Expected key `target` to instantiate.")
+    return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs)
+def get_obj_from_str(string, reload=False):
+    module, cls = string.rsplit(".", 1)
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)
+class AdamWwithEMAandWings(optim.Optimizer):
+    # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
+    def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8,  # TODO: check hyperparameters before using
+                 weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999,   # ema decay to match previous code
+                 ema_power=1., param_names=()):
+        """AdamW that saves EMA versions of the parameters."""
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        if not 0.0 <= ema_decay <= 1.0:
+            raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
+                        ema_power=ema_power, param_names=param_names)
+        super().__init__(params, defaults)
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('amsgrad', False)
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            params_with_grad = []
+            grads = []
+            exp_avgs = []
+            exp_avg_sqs = []
+            ema_params_with_grad = []
+            state_sums = []
+            max_exp_avg_sqs = []
+            state_steps = []
+            amsgrad = group['amsgrad']
+            beta1, beta2 = group['betas']
+            ema_decay = group['ema_decay']
+            ema_power = group['ema_power']
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError('AdamW does not support sparse gradients')
+                grads.append(p.grad)
+                state = self.state[p]
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    if amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of parameter values
+                    state['param_exp_avg'] = p.detach().float().clone()
+                exp_avgs.append(state['exp_avg'])
+                exp_avg_sqs.append(state['exp_avg_sq'])
+                ema_params_with_grad.append(state['param_exp_avg'])
+                if amsgrad:
+                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+                # update the steps for each param group update
+                state['step'] += 1
+                # record the step after step update
+                state_steps.append(state['step'])
+            optim._functional.adamw(params_with_grad,
+                    grads,
+                    exp_avgs,
+                    exp_avg_sqs,
+                    max_exp_avg_sqs,
+                    state_steps,
+                    amsgrad=amsgrad,
+                    beta1=beta1,
+                    beta2=beta2,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    eps=group['eps'],
+                    maximize=False)
+            cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power)
+            for param, ema_param in zip(params_with_grad, ema_params_with_grad):
+                ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)
+        return loss

iopaint/model/anytext/utils.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import os
+import datetime
+import cv2
+import numpy as np
+from PIL import Image, ImageDraw
+def save_images(img_list, folder):
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    now = datetime.datetime.now()
+    date_str = now.strftime("%Y-%m-%d")
+    folder_path = os.path.join(folder, date_str)
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+    time_str = now.strftime("%H_%M_%S")
+    for idx, img in enumerate(img_list):
+        image_number = idx + 1
+        filename = f"{time_str}_{image_number}.jpg"
+        save_path = os.path.join(folder_path, filename)
+        cv2.imwrite(save_path, img[..., ::-1])
+def check_channels(image):
+    channels = image.shape[2] if len(image.shape) == 3 else 1
+    if channels == 1:
+        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+    elif channels > 3:
+        image = image[:, :, :3]
+    return image
+def resize_image(img, max_length=768):
+    height, width = img.shape[:2]
+    max_dimension = max(height, width)
+    if max_dimension > max_length:
+        scale_factor = max_length / max_dimension
+        new_width = int(round(width * scale_factor))
+        new_height = int(round(height * scale_factor))
+        new_size = (new_width, new_height)
+        img = cv2.resize(img, new_size)
+    height, width = img.shape[:2]
+    img = cv2.resize(img, (width - (width % 64), height - (height % 64)))
+    return img
+def insert_spaces(string, nSpace):
+    if nSpace == 0:
+        return string
+    new_string = ""
+    for char in string:
+        new_string += char + " " * nSpace
+    return new_string[:-nSpace]
+def draw_glyph(font, text):
+    g_size = 50
+    W, H = (512, 80)
+    new_font = font.font_variant(size=g_size)
+    img = Image.new(mode="1", size=(W, H), color=0)
+    draw = ImageDraw.Draw(img)
+    left, top, right, bottom = new_font.getbbox(text)
+    text_width = max(right - left, 5)
+    text_height = max(bottom - top, 5)
+    ratio = min(W * 0.9 / text_width, H * 0.9 / text_height)
+    new_font = font.font_variant(size=int(g_size * ratio))
+    text_width, text_height = new_font.getsize(text)
+    offset_x, offset_y = new_font.getoffset(text)
+    x = (img.width - text_width) // 2
+    y = (img.height - text_height) // 2 - offset_y // 2
+    draw.text((x, y), text, font=new_font, fill="white")
+    img = np.expand_dims(np.array(img), axis=2).astype(np.float64)
+    return img
+def draw_glyph2(
+    font, text, polygon, vertAng=10, scale=1, width=512, height=512, add_space=True
+):
+    enlarge_polygon = polygon * scale
+    rect = cv2.minAreaRect(enlarge_polygon)
+    box = cv2.boxPoints(rect)
+    box = np.int0(box)
+    w, h = rect[1]
+    angle = rect[2]
+    if angle < -45:
+        angle += 90
+    angle = -angle
+    if w < h:
+        angle += 90
+    vert = False
+    if abs(angle) % 90 < vertAng or abs(90 - abs(angle) % 90) % 90 < vertAng:
+        _w = max(box[:, 0]) - min(box[:, 0])
+        _h = max(box[:, 1]) - min(box[:, 1])
+        if _h >= _w:
+            vert = True
+            angle = 0
+    img = np.zeros((height * scale, width * scale, 3), np.uint8)
+    img = Image.fromarray(img)
+    # infer font size
+    image4ratio = Image.new("RGB", img.size, "white")
+    draw = ImageDraw.Draw(image4ratio)
+    _, _, _tw, _th = draw.textbbox(xy=(0, 0), text=text, font=font)
+    text_w = min(w, h) * (_tw / _th)
+    if text_w <= max(w, h):
+        # add space
+        if len(text) > 1 and not vert and add_space:
+            for i in range(1, 100):
+                text_space = insert_spaces(text, i)
+                _, _, _tw2, _th2 = draw.textbbox(xy=(0, 0), text=text_space, font=font)
+                if min(w, h) * (_tw2 / _th2) > max(w, h):
+                    break
+            text = insert_spaces(text, i - 1)
+        font_size = min(w, h) * 0.80
+    else:
+        shrink = 0.75 if vert else 0.85
+        font_size = min(w, h) / (text_w / max(w, h)) * shrink
+    new_font = font.font_variant(size=int(font_size))
+    left, top, right, bottom = new_font.getbbox(text)
+    text_width = right - left
+    text_height = bottom - top
+    layer = Image.new("RGBA", img.size, (0, 0, 0, 0))
+    draw = ImageDraw.Draw(layer)
+    if not vert:
+        draw.text(
+            (rect[0][0] - text_width // 2, rect[0][1] - text_height // 2 - top),
+            text,
+            font=new_font,
+            fill=(255, 255, 255, 255),
+        )
+    else:
+        x_s = min(box[:, 0]) + _w // 2 - text_height // 2
+        y_s = min(box[:, 1])
+        for c in text:
+            draw.text((x_s, y_s), c, font=new_font, fill=(255, 255, 255, 255))
+            _, _t, _, _b = new_font.getbbox(c)
+            y_s += _b
+    rotated_layer = layer.rotate(angle, expand=1, center=(rect[0][0], rect[0][1]))
+    x_offset = int((img.width - rotated_layer.width) / 2)
+    y_offset = int((img.height - rotated_layer.height) / 2)
+    img.paste(rotated_layer, (x_offset, y_offset), rotated_layer)
+    img = np.expand_dims(np.array(img.convert("1")), axis=2).astype(np.float64)
+    return img

iopaint/model/original_sd_configs/v1-inference.yaml ADDED Viewed

	@@ -0,0 +1,70 @@

+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

iopaint/model/original_sd_configs/v2-inference-v.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+# Inference-only configuration for a latent diffusion model that uses
+# v-parameterization (see `parameterization: "v"` below).
+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+    # Denoising network operating on the 4-channel latent (in_channels / out_channels: 4).
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        # NOTE(review): presumably has to match the embedding width produced by
+        # cond_stage_config's encoder -- confirm.
+        context_dim: 1024
+        legacy: False
+    # Autoencoder between image space (3 channels) and latent space (z_channels: 4).
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        # torch.nn.Identity stands in for the loss module in this config.
+        lossconfig:
+          target: torch.nn.Identity
+    # Conditioning encoder for the key named by `cond_stage_key` ("txt").
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"

iopaint/model/utils.py ADDED Viewed

	@@ -0,0 +1,1033 @@

+import gc
+import math
+import random
+import traceback
+from typing import Any
+import torch
+import numpy as np
+import collections
+from itertools import repeat
+from diffusers import (
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+    UniPCMultistepScheduler,
+    LCMScheduler,
+    DPMSolverSinglestepScheduler,
+    KDPM2DiscreteScheduler,
+    KDPM2AncestralDiscreteScheduler,
+    HeunDiscreteScheduler,
+)
+from loguru import logger
+from iopaint.schema import SDSampler
+from torch import conv2d, conv_transpose2d
+def make_beta_schedule(
+    device, schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
+):
+    if schedule == "linear":
+        betas = (
+            torch.linspace(
+                linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64
+            )
+            ** 2
+        )
+    elif schedule == "cosine":
+        timesteps = (
+            torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
+        ).to(device)
+        alphas = timesteps / (1 + cosine_s) * np.pi / 2
+        alphas = torch.cos(alphas).pow(2).to(device)
+        alphas = alphas / alphas[0]
+        betas = 1 - alphas[1:] / alphas[:-1]
+        betas = np.clip(betas, a_min=0, a_max=0.999)
+    elif schedule == "sqrt_linear":
+        betas = torch.linspace(
+            linear_start, linear_end, n_timestep, dtype=torch.float64
+        )
+    elif schedule == "sqrt":
+        betas = (
+            torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
+            ** 0.5
+        )
+    else:
+        raise ValueError(f"schedule '{schedule}' unknown.")
+    return betas.numpy()
+def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
+    # select alphas for computing the variance schedule
+    alphas = alphacums[ddim_timesteps]
+    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
+    # according the the formula provided in https://arxiv.org/abs/2010.02502
+    sigmas = eta * np.sqrt(
+        (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
+    )
+    if verbose:
+        print(
+            f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}"
+        )
+        print(
+            f"For the chosen value of eta, which is {eta}, "
+            f"this results in the following sigma_t schedule for ddim sampler {sigmas}"
+        )
+    return sigmas, alphas, alphas_prev
+def make_ddim_timesteps(
+    ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True
+):
+    if ddim_discr_method == "uniform":
+        c = num_ddpm_timesteps // num_ddim_timesteps
+        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
+    elif ddim_discr_method == "quad":
+        ddim_timesteps = (
+            (np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps)) ** 2
+        ).astype(int)
+    else:
+        raise NotImplementedError(
+            f'There is no ddim discretization method called "{ddim_discr_method}"'
+        )
+    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
+    # add one to get the final alpha values right (the ones from first scale to data during sampling)
+    steps_out = ddim_timesteps + 1
+    if verbose:
+        print(f"Selected timesteps for ddim sampler: {steps_out}")
+    return steps_out
+def noise_like(shape, device, repeat=False):
+    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
+        shape[0], *((1,) * (len(shape) - 1))
+    )
+    noise = lambda: torch.randn(shape, device=device)
+    return repeat_noise() if repeat else noise()
+def timestep_embedding(device, timesteps, dim, max_period=10000, repeat_only=False):
+    """
+    Create sinusoidal timestep embeddings.
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period)
+        * torch.arange(start=0, end=half, dtype=torch.float32)
+        / half
+    ).to(device=device)
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+###### MAT and FcF #######
+def normalize_2nd_moment(x, dim=1):
+    return (
+        x * (x.square().mean(dim=dim, keepdim=True) + torch.finfo(x.dtype).eps).rsqrt()
+    )
+class EasyDict(dict):
+    """Convenience class that behaves like a dict but allows access with the attribute syntax."""
+    def __getattr__(self, name: str) -> Any:
+        try:
+            return self[name]
+        except KeyError:
+            raise AttributeError(name)
+    def __setattr__(self, name: str, value: Any) -> None:
+        self[name] = value
+    def __delattr__(self, name: str) -> None:
+        del self[name]
+def _bias_act_ref(x, b=None, dim=1, act="linear", alpha=None, gain=None, clamp=None):
+    """Slow reference implementation of `bias_act()` using standard TensorFlow ops."""
+    assert isinstance(x, torch.Tensor)
+    assert clamp is None or clamp >= 0
+    spec = activation_funcs[act]
+    alpha = float(alpha if alpha is not None else spec.def_alpha)
+    gain = float(gain if gain is not None else spec.def_gain)
+    clamp = float(clamp if clamp is not None else -1)
+    # Add bias.
+    if b is not None:
+        assert isinstance(b, torch.Tensor) and b.ndim == 1
+        assert 0 <= dim < x.ndim
+        assert b.shape[0] == x.shape[dim]
+        x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)])
+    # Evaluate activation function.
+    alpha = float(alpha)
+    x = spec.func(x, alpha=alpha)
+    # Scale by gain.
+    gain = float(gain)
+    if gain != 1:
+        x = x * gain
+    # Clamp.
+    if clamp >= 0:
+        x = x.clamp(-clamp, clamp)  # pylint: disable=invalid-unary-operand-type
+    return x
+def bias_act(
+    x, b=None, dim=1, act="linear", alpha=None, gain=None, clamp=None, impl="ref"
+):
+    r"""Bias and activation function.
+    Adds bias `b` to activation tensor `x`, evaluates activation function `act`,
+    and scales the result by `gain`. Each of the steps is optional.
+    Note: this version always delegates to `_bias_act_ref()`, which is built from
+    standard PyTorch ops; no fused kernel is used regardless of `impl`.
+    Args:
+        x:      Input activation tensor. Can be of any shape.
+        b:      Bias vector, or `None` to disable. Must be a 1D tensor of the same type
+                as `x`. The shape must be known, and it must match the dimension of `x`
+                corresponding to `dim`.
+        dim:    The dimension in `x` corresponding to the elements of `b`.
+                The value of `dim` is ignored if `b` is not specified.
+        act:    Name of the activation function to evaluate, or `"linear"` to disable.
+                Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc.
+                See `activation_funcs` for a full list. `None` is not allowed.
+        alpha:  Shape parameter for the activation function, or `None` to use the default.
+        gain:   Scaling factor for the output tensor, or `None` to use default.
+                See `activation_funcs` for the default scaling of each activation function.
+                If unsure, consider specifying 1.
+        clamp:  Clamp the output values to `[-clamp, +clamp]`, or `None` to disable
+                the clamping (default).
+        impl:   Name of the implementation. Must be `"ref"` (default) or `"cuda"`;
+                the value is only validated, both run the reference implementation.
+    Returns:
+        Tensor of the same shape and datatype as `x`.
+    """
+    assert isinstance(x, torch.Tensor)
+    assert impl in ["ref", "cuda"]
+    return _bias_act_ref(
+        x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp
+    )
+def _get_filter_size(f):
+    if f is None:
+        return 1, 1
+    assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
+    fw = f.shape[-1]
+    fh = f.shape[0]
+    fw = int(fw)
+    fh = int(fh)
+    assert fw >= 1 and fh >= 1
+    return fw, fh
+def _get_weight_shape(w):
+    shape = [int(sz) for sz in w.shape]
+    return shape
+def _parse_scaling(scaling):
+    if isinstance(scaling, int):
+        scaling = [scaling, scaling]
+    assert isinstance(scaling, (list, tuple))
+    assert all(isinstance(x, int) for x in scaling)
+    sx, sy = scaling
+    assert sx >= 1 and sy >= 1
+    return sx, sy
+def _parse_padding(padding):
+    if isinstance(padding, int):
+        padding = [padding, padding]
+    assert isinstance(padding, (list, tuple))
+    assert all(isinstance(x, int) for x in padding)
+    if len(padding) == 2:
+        padx, pady = padding
+        padding = [padx, padx, pady, pady]
+    padx0, padx1, pady0, pady1 = padding
+    return padx0, padx1, pady0, pady1
+def setup_filter(
+    f,
+    device=torch.device("cpu"),
+    normalize=True,
+    flip_filter=False,
+    gain=1,
+    separable=None,
+):
+    r"""Convenience function to setup 2D FIR filter for `upfirdn2d()`.
+    Args:
+        f:           Torch tensor, numpy array, or python list of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable),
+                     `[]` (impulse), or
+                     `None` (identity).
+        device:      Result device (default: cpu).
+        normalize:   Normalize the filter so that it retains the magnitude
+                     for constant input signal (DC)? (default: True).
+        flip_filter: Flip the filter? (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        separable:   Return a separable filter? (default: select automatically).
+    Returns:
+        Float32 tensor of the shape
+        `[filter_height, filter_width]` (non-separable) or
+        `[filter_taps]` (separable).
+    """
+    # Validate.
+    if f is None:
+        f = 1
+    f = torch.as_tensor(f, dtype=torch.float32)
+    assert f.ndim in [0, 1, 2]
+    assert f.numel() > 0
+    if f.ndim == 0:
+        f = f[np.newaxis]
+    # Separable?
+    if separable is None:
+        separable = f.ndim == 1 and f.numel() >= 8
+    if f.ndim == 1 and not separable:
+        f = f.ger(f)
+    assert f.ndim == (1 if separable else 2)
+    # Apply normalize, flip, gain, and device.
+    if normalize:
+        f /= f.sum()
+    if flip_filter:
+        f = f.flip(list(range(f.ndim)))
+    f = f * (gain ** (f.ndim / 2))
+    f = f.to(device=device)
+    return f
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable):
+            return x
+        return tuple(repeat(x, n))
+    return parse
+to_2tuple = _ntuple(2)
+# Registry of the activation functions understood by `bias_act()`, keyed by name.
+# Each entry is an EasyDict with:
+#   func       - callable invoked as `func(x, alpha=alpha)`; entries that do not
+#                need alpha absorb it through `**_`.
+#   def_alpha  - alpha used when the caller passes `alpha=None`.
+#   def_gain   - gain used when the caller passes `gain=None`.
+#   cuda_idx, ref, has_2nd_grad - not read by `_bias_act_ref()`; presumably
+#                metadata for a fused CUDA kernel -- TODO confirm.
+activation_funcs = {
+    "linear": EasyDict(
+        func=lambda x, **_: x,
+        def_alpha=0,
+        def_gain=1,
+        cuda_idx=1,
+        ref="",
+        has_2nd_grad=False,
+    ),
+    "relu": EasyDict(
+        func=lambda x, **_: torch.nn.functional.relu(x),
+        def_alpha=0,
+        def_gain=np.sqrt(2),
+        cuda_idx=2,
+        ref="y",
+        has_2nd_grad=False,
+    ),
+    # The only entry whose func actually consumes `alpha` (the negative slope).
+    "lrelu": EasyDict(
+        func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha),
+        def_alpha=0.2,
+        def_gain=np.sqrt(2),
+        cuda_idx=3,
+        ref="y",
+        has_2nd_grad=False,
+    ),
+    "tanh": EasyDict(
+        func=lambda x, **_: torch.tanh(x),
+        def_alpha=0,
+        def_gain=1,
+        cuda_idx=4,
+        ref="y",
+        has_2nd_grad=True,
+    ),
+    "sigmoid": EasyDict(
+        func=lambda x, **_: torch.sigmoid(x),
+        def_alpha=0,
+        def_gain=1,
+        cuda_idx=5,
+        ref="y",
+        has_2nd_grad=True,
+    ),
+    "elu": EasyDict(
+        func=lambda x, **_: torch.nn.functional.elu(x),
+        def_alpha=0,
+        def_gain=1,
+        cuda_idx=6,
+        ref="y",
+        has_2nd_grad=True,
+    ),
+    "selu": EasyDict(
+        func=lambda x, **_: torch.nn.functional.selu(x),
+        def_alpha=0,
+        def_gain=1,
+        cuda_idx=7,
+        ref="y",
+        has_2nd_grad=True,
+    ),
+    "softplus": EasyDict(
+        func=lambda x, **_: torch.nn.functional.softplus(x),
+        def_alpha=0,
+        def_gain=1,
+        cuda_idx=8,
+        ref="y",
+        has_2nd_grad=True,
+    ),
+    "swish": EasyDict(
+        func=lambda x, **_: torch.sigmoid(x) * x,
+        def_alpha=0,
+        def_gain=np.sqrt(2),
+        cuda_idx=9,
+        ref="x",
+        has_2nd_grad=True,
+    ),
+}
+def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, impl="cuda"):
+    r"""Pad, upsample, filter, and downsample a batch of 2D images.
+    Performs the following sequence of operations for each channel:
+    1. Upsample the image by inserting N-1 zeros after each pixel (`up`).
+    2. Pad the image with the specified number of zeros on each side (`padding`).
+       Negative padding corresponds to cropping the image.
+    3. Convolve the image with the specified 2D FIR filter (`f`), shrinking it
+       so that the footprint of all output pixels lies within the input image.
+    4. Downsample the image by keeping every Nth pixel (`down`).
+    This sequence of operations bears close resemblance to scipy.signal.upfirdn().
+    Note: this version always delegates to `_upfirdn2d_ref()`, which is built from
+    standard PyTorch ops; `impl` is accepted but never consulted.
+    NOTE(review): which forms of `up`, `down` and `padding` are actually accepted
+    is decided by `_upfirdn2d_ref()` -- confirm there before relying on the list
+    forms described below.
+    Args:
+        x:           Float32/float64/float16 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        f:           Float32 FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        up:          Integer upsampling factor. Can be a single int or a list/tuple
+                     `[x, y]` (default: 1).
+        down:        Integer downsampling factor. Can be a single int or a list/tuple
+                     `[x, y]` (default: 1).
+        padding:     Padding with respect to the upsampled image. Can be a single number
+                     or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        flip_filter: False = convolution, True = correlation (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        impl:        Implementation name (default: `'cuda'`); ignored, see the note above.
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    # Argument validation is left to the reference implementation.
+    # assert isinstance(x, torch.Tensor)
+    # assert impl in ['ref', 'cuda']
+    return _upfirdn2d_ref(
+        x, f, up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain
+    )
+def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1):
+    """Slow reference implementation of `upfirdn2d()` using standard PyTorch ops."""
+    # Validate arguments.
+    assert isinstance(x, torch.Tensor) and x.ndim == 4
+    if f is None:
+        f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
+    assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
+    assert not f.requires_grad
+    batch_size, num_channels, in_height, in_width = x.shape
+    # upx, upy = _parse_scaling(up)
+    # downx, downy = _parse_scaling(down)
+    upx, upy = up, up
+    downx, downy = down, down
+    # padx0, padx1, pady0, pady1 = _parse_padding(padding)
+    padx0, padx1, pady0, pady1 = padding[0], padding[1], padding[2], padding[3]
+    # Upsample by inserting zeros.
+    x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1])
+    x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1])
+    x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx])
+    # Pad or crop.
+    x = torch.nn.functional.pad(
+        x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)]
+    )
+    x = x[
+        :,
+        :,
+        max(-pady0, 0) : x.shape[2] - max(-pady1, 0),
+        max(-padx0, 0) : x.shape[3] - max(-padx1, 0),
+    ]
+    # Setup filter.
+    f = f * (gain ** (f.ndim / 2))
+    f = f.to(x.dtype)
+    if not flip_filter:
+        f = f.flip(list(range(f.ndim)))
+    # Convolve with the filter.
+    f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim)
+    if f.ndim == 4:
+        x = conv2d(input=x, weight=f, groups=num_channels)
+    else:
+        x = conv2d(input=x, weight=f.unsqueeze(2), groups=num_channels)
+        x = conv2d(input=x, weight=f.unsqueeze(3), groups=num_channels)
+    # Downsample by throwing away pixels.
+    x = x[:, :, ::downy, ::downx]
+    return x
+def downsample2d(x, f, down=2, padding=0, flip_filter=False, gain=1, impl="cuda"):
+    r"""Downsample a batch of 2D images using the given 2D FIR filter.
+    By default, the result is padded so that its shape is a fraction of the input.
+    User-specified padding is applied on top of that, with negative values
+    indicating cropping. Pixels outside the image are assumed to be zero.
+    Args:
+        x:           Float32/float64/float16 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        f:           Float32 FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        down:        Integer downsampling factor. Can be a single int or a list/tuple
+                     `[x, y]` (default: 1).
+        padding:     Padding with respect to the input. Can be a single number or a
+                     list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        flip_filter: False = convolution, True = correlation (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        impl:        Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    downx, downy = _parse_scaling(down)
+    # padx0, padx1, pady0, pady1 = _parse_padding(padding)
+    padx0, padx1, pady0, pady1 = padding, padding, padding, padding
+    fw, fh = _get_filter_size(f)
+    p = [
+        padx0 + (fw - downx + 1) // 2,
+        padx1 + (fw - downx) // 2,
+        pady0 + (fh - downy + 1) // 2,
+        pady1 + (fh - downy) // 2,
+    ]
+    return upfirdn2d(
+        x, f, down=down, padding=p, flip_filter=flip_filter, gain=gain, impl=impl
+    )
+def upsample2d(x, f, up=2, padding=0, flip_filter=False, gain=1, impl="cuda"):
+    r"""Upsample a batch of 2D images using the given 2D FIR filter.
+    By default, the result is padded so that its shape is a multiple of the input.
+    User-specified padding is applied on top of that, with negative values
+    indicating cropping. Pixels outside the image are assumed to be zero.
+    Args:
+        x:           Float32/float64/float16 input tensor of the shape
+                     `[batch_size, num_channels, in_height, in_width]`.
+        f:           Float32 FIR filter of the shape
+                     `[filter_height, filter_width]` (non-separable),
+                     `[filter_taps]` (separable), or
+                     `None` (identity).
+        up:          Integer upsampling factor. Can be a single int or a list/tuple
+                     `[x, y]` (default: 1).
+        padding:     Padding with respect to the output. Can be a single number or a
+                     list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+                     (default: 0).
+        flip_filter: False = convolution, True = correlation (default: False).
+        gain:        Overall scaling factor for signal magnitude (default: 1).
+        impl:        Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    upx, upy = _parse_scaling(up)
+    # upx, upy = up, up
+    padx0, padx1, pady0, pady1 = _parse_padding(padding)
+    # padx0, padx1, pady0, pady1 = padding, padding, padding, padding
+    fw, fh = _get_filter_size(f)
+    p = [
+        padx0 + (fw + upx - 1) // 2,
+        padx1 + (fw - upx) // 2,
+        pady0 + (fh + upy - 1) // 2,
+        pady1 + (fh - upy) // 2,
+    ]
+    return upfirdn2d(
+        x,
+        f,
+        up=up,
+        padding=p,
+        flip_filter=flip_filter,
+        gain=gain * upx * upy,
+        impl=impl,
+    )
+class MinibatchStdLayer(torch.nn.Module):
+    def __init__(self, group_size, num_channels=1):
+        super().__init__()
+        self.group_size = group_size
+        self.num_channels = num_channels
+    def forward(self, x):
+        N, C, H, W = x.shape
+        G = (
+            torch.min(torch.as_tensor(self.group_size), torch.as_tensor(N))
+            if self.group_size is not None
+            else N
+        )
+        F = self.num_channels
+        c = C // F
+        y = x.reshape(
+            G, -1, F, c, H, W
+        )  # [GnFcHW] Split minibatch N into n groups of size G, and channels C into F groups of size c.
+        y = y - y.mean(dim=0)  # [GnFcHW] Subtract mean over group.
+        y = y.square().mean(dim=0)  # [nFcHW]  Calc variance over group.
+        y = (y + 1e-8).sqrt()  # [nFcHW]  Calc stddev over group.
+        y = y.mean(dim=[2, 3, 4])  # [nF]     Take average over channels and pixels.
+        y = y.reshape(-1, F, 1, 1)  # [nF11]   Add missing dimensions.
+        y = y.repeat(G, 1, H, W)  # [NFHW]   Replicate over group and pixels.
+        x = torch.cat([x, y], dim=1)  # [NCHW]   Append to input as new channels.
+        return x
+class FullyConnectedLayer(torch.nn.Module):
+    def __init__(
+        self,
+        in_features,  # Number of input features.
+        out_features,  # Number of output features.
+        bias=True,  # Apply additive bias before the activation function?
+        activation="linear",  # Activation function: 'relu', 'lrelu', etc.
+        lr_multiplier=1,  # Learning rate multiplier.
+        bias_init=0,  # Initial value for the additive bias.
+    ):
+        super().__init__()
+        self.weight = torch.nn.Parameter(
+            torch.randn([out_features, in_features]) / lr_multiplier
+        )
+        self.bias = (
+            torch.nn.Parameter(torch.full([out_features], np.float32(bias_init)))
+            if bias
+            else None
+        )
+        self.activation = activation
+        self.weight_gain = lr_multiplier / np.sqrt(in_features)
+        self.bias_gain = lr_multiplier
+    def forward(self, x):
+        w = self.weight * self.weight_gain
+        b = self.bias
+        if b is not None and self.bias_gain != 1:
+            b = b * self.bias_gain
+        if self.activation == "linear" and b is not None:
+            # out = torch.addmm(b.unsqueeze(0), x, w.t())
+            x = x.matmul(w.t())
+            out = x + b.reshape([-1 if i == x.ndim - 1 else 1 for i in range(x.ndim)])
+        else:
+            x = x.matmul(w.t())
+            out = bias_act(x, b, act=self.activation, dim=x.ndim - 1)
+        return out
+def _conv2d_wrapper(
+    x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True
+):
+    """Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations.
+    Args:
+        x:           Input tensor; the 1x1 fast path below reads `x.shape[0]`,
+                     `x.shape[2]` and `x.shape[3]`.
+        w:           Weight tensor whose shape unpacks to
+                     `(out_channels, in_channels_per_group, kh, kw)`.
+        stride:      Stride forwarded to the convolution.
+        padding:     Padding forwarded to the convolution.
+        groups:      Number of groups forwarded to the convolution.
+        transpose:   Use `conv_transpose2d()` instead of `conv2d()`.
+        flip_weight: When False, the kernel is flipped along both spatial axes
+                     before the op is applied.
+    Returns:
+        The convolved tensor. The 1x1 fast path returns it in
+        `torch.channels_last` memory format.
+    """
+    out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
+    # Flip weight if requested.
+    if (
+        not flip_weight
+    ):  # conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False).
+        w = w.flip([2, 3])
+    # Workaround performance pitfall in cuDNN 8.0.5, triggered when using
+    # 1x1 kernel + memory_format=channels_last + less than 64 channels.
+    if (
+        kw == 1
+        and kh == 1
+        and stride == 1
+        and padding in [0, [0, 0], (0, 0)]
+        and not transpose
+    ):
+        # A channel stride of 1 is what the check below uses to detect the
+        # problematic layout.
+        if x.stride()[1] == 1 and min(out_channels, in_channels_per_group) < 64:
+            if out_channels <= 4 and groups == 1:
+                # Very few outputs: express the 1x1 convolution as a matrix
+                # product over the flattened pixels.
+                in_shape = x.shape
+                x = w.squeeze(3).squeeze(2) @ x.reshape(
+                    [in_shape[0], in_channels_per_group, -1]
+                )
+                x = x.reshape([in_shape[0], out_channels, in_shape[2], in_shape[3]])
+            else:
+                # Otherwise run the convolution on contiguous copies.
+                x = x.to(memory_format=torch.contiguous_format)
+                w = w.to(memory_format=torch.contiguous_format)
+                x = conv2d(x, w, groups=groups)
+            return x.to(memory_format=torch.channels_last)
+    # Otherwise => execute the plain torch op (conv2d or conv_transpose2d).
+    op = conv_transpose2d if transpose else conv2d
+    return op(x, w, stride=stride, padding=padding, groups=groups)
+def conv2d_resample(
+    x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False
+):
+    r"""2D convolution with optional up/downsampling.
+    Padding is performed only once at the beginning, not between the operations.
+    The function picks one of several execution paths depending on the kernel
+    size and on which of `up` / `down` is greater than one; see the inline comments.
+    Args:
+        x:              Input tensor of shape
+                        `[batch_size, in_channels, in_height, in_width]`.
+        w:              Weight tensor of shape
+                        `[out_channels, in_channels//groups, kernel_height, kernel_width]`.
+        f:              Low-pass filter for up/downsampling. Must be prepared beforehand by
+                        calling setup_filter(). None = identity (default).
+        up:             Integer upsampling factor (default: 1).
+        down:           Integer downsampling factor (default: 1).
+        padding:        Padding with respect to the upsampled image (default: 0).
+                        NOTE(review): the value is copied to all four sides as-is
+                        (the parsing helper is commented out below), so only a single
+                        number appears to be supported here -- confirm.
+        groups:         Split input channels into N groups (default: 1).
+        flip_weight:    False = convolution, True = correlation (default: True).
+        flip_filter:    False = convolution, True = correlation (default: False).
+    Returns:
+        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+    """
+    # Validate arguments.
+    assert isinstance(x, torch.Tensor) and (x.ndim == 4)
+    assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
+    assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2])
+    assert isinstance(up, int) and (up >= 1)
+    assert isinstance(down, int) and (down >= 1)
+    # assert isinstance(groups, int) and (groups >= 1), f"!!!!!! groups: {groups} isinstance(groups, int)  {isinstance(groups, int)} {type(groups)}"
+    out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
+    fw, fh = _get_filter_size(f)
+    # px0, px1, py0, py1 = _parse_padding(padding)
+    px0, px1, py0, py1 = padding, padding, padding, padding
+    # Adjust padding to account for up/downsampling.
+    if up > 1:
+        px0 += (fw + up - 1) // 2
+        px1 += (fw - up) // 2
+        py0 += (fh + up - 1) // 2
+        py1 += (fh - up) // 2
+    if down > 1:
+        px0 += (fw - down + 1) // 2
+        px1 += (fw - down) // 2
+        py0 += (fh - down + 1) // 2
+        py1 += (fh - down) // 2
+    # Fast path: 1x1 convolution with downsampling only => downsample first, then convolve.
+    if kw == 1 and kh == 1 and (down > 1 and up == 1):
+        x = upfirdn2d(
+            x=x, f=f, down=down, padding=[px0, px1, py0, py1], flip_filter=flip_filter
+        )
+        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+        return x
+    # Fast path: 1x1 convolution with upsampling only => convolve first, then upsample.
+    if kw == 1 and kh == 1 and (up > 1 and down == 1):
+        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+        x = upfirdn2d(
+            x=x,
+            f=f,
+            up=up,
+            padding=[px0, px1, py0, py1],
+            gain=up**2,
+            flip_filter=flip_filter,
+        )
+        return x
+    # Fast path: downsampling only => use strided convolution.
+    if down > 1 and up == 1:
+        x = upfirdn2d(x=x, f=f, padding=[px0, px1, py0, py1], flip_filter=flip_filter)
+        x = _conv2d_wrapper(
+            x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight
+        )
+        return x
+    # Fast path: upsampling with optional downsampling => use transpose strided convolution.
+    if up > 1:
+        # Rearrange the weight into the layout conv_transpose2d() expects:
+        # the in/out channel axes are swapped (per group when groups > 1).
+        if groups == 1:
+            w = w.transpose(0, 1)
+        else:
+            w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw)
+            w = w.transpose(1, 2)
+            w = w.reshape(
+                groups * in_channels_per_group, out_channels // groups, kh, kw
+            )
+        px0 -= kw - 1
+        px1 -= kw - up
+        py0 -= kh - 1
+        py1 -= kh - up
+        # Part of the (now possibly negative) padding is handed to the transposed
+        # convolution as pxt/pyt; the remainder is applied by upfirdn2d() below.
+        pxt = max(min(-px0, -px1), 0)
+        pyt = max(min(-py0, -py1), 0)
+        x = _conv2d_wrapper(
+            x=x,
+            w=w,
+            stride=up,
+            padding=[pyt, pxt],
+            groups=groups,
+            transpose=True,
+            flip_weight=(not flip_weight),
+        )
+        x = upfirdn2d(
+            x=x,
+            f=f,
+            padding=[px0 + pxt, px1 + pxt, py0 + pyt, py1 + pyt],
+            gain=up**2,
+            flip_filter=flip_filter,
+        )
+        if down > 1:
+            # NOTE(review): no `padding` is passed, so upfirdn2d()'s scalar default
+            # of 0 is used -- confirm the reference implementation accepts a scalar.
+            x = upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
+        return x
+    # Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d.
+    if up == 1 and down == 1:
+        if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0:
+            return _conv2d_wrapper(
+                x=x, w=w, padding=[py0, px0], groups=groups, flip_weight=flip_weight
+            )
+    # Fallback: Generic reference implementation.
+    x = upfirdn2d(
+        x=x,
+        f=(f if up > 1 else None),
+        up=up,
+        padding=[px0, px1, py0, py1],
+        gain=up**2,
+        flip_filter=flip_filter,
+    )
+    x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+    if down > 1:
+        # NOTE(review): same scalar-default padding caveat as in the upsampling path.
+        x = upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
+    return x
+class Conv2dLayer(torch.nn.Module):
+    def __init__(
+        self,
+        in_channels,  # Number of input channels.
+        out_channels,  # Number of output channels.
+        kernel_size,  # Width and height of the convolution kernel.
+        bias=True,  # Apply additive bias before the activation function?
+        activation="linear",  # Activation function: 'relu', 'lrelu', etc.
+        up=1,  # Integer upsampling factor.
+        down=1,  # Integer downsampling factor.
+        resample_filter=[
+            1,
+            3,
+            3,
+            1,
+        ],  # Low-pass filter to apply when resampling activations.
+        conv_clamp=None,  # Clamp the output to +-X, None = disable clamping.
+        channels_last=False,  # Expect the input to have memory_format=channels_last?
+        trainable=True,  # Update the weights of this layer during training?
+    ):
+        super().__init__()
+        self.activation = activation
+        self.up = up
+        self.down = down
+        self.register_buffer("resample_filter", setup_filter(resample_filter))
+        self.conv_clamp = conv_clamp
+        self.padding = kernel_size // 2
+        self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2))
+        self.act_gain = activation_funcs[activation].def_gain
+        memory_format = (
+            torch.channels_last if channels_last else torch.contiguous_format
+        )
+        weight = torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(
+            memory_format=memory_format
+        )
+        bias = torch.zeros([out_channels]) if bias else None
+        if trainable:
+            self.weight = torch.nn.Parameter(weight)
+            self.bias = torch.nn.Parameter(bias) if bias is not None else None
+        else:
+            self.register_buffer("weight", weight)
+            if bias is not None:
+                self.register_buffer("bias", bias)
+            else:
+                self.bias = None
+    def forward(self, x, gain=1):
+        w = self.weight * self.weight_gain
+        x = conv2d_resample(
+            x=x,
+            w=w,
+            f=self.resample_filter,
+            up=self.up,
+            down=self.down,
+            padding=self.padding,
+        )
+        act_gain = self.act_gain * gain
+        act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None
+        out = bias_act(
+            x, self.bias, act=self.activation, gain=act_gain, clamp=act_clamp
+        )
+        return out
+def torch_gc():
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+    gc.collect()
+def set_seed(seed: int):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+def get_scheduler(sd_sampler, scheduler_config):
+    # https://github.com/huggingface/diffusers/issues/4167
+    keys_to_pop = ["use_karras_sigmas", "algorithm_type"]
+    scheduler_config = dict(scheduler_config)
+    for it in keys_to_pop:
+        scheduler_config.pop(it, None)
+    # fmt: off
+    samplers = {
+        SDSampler.dpm_plus_plus_2m: [DPMSolverMultistepScheduler],
+        SDSampler.dpm_plus_plus_2m_karras: [DPMSolverMultistepScheduler, dict(use_karras_sigmas=True)],
+        SDSampler.dpm_plus_plus_2m_sde: [DPMSolverMultistepScheduler, dict(algorithm_type="sde-dpmsolver++")],
+        SDSampler.dpm_plus_plus_2m_sde_karras: [DPMSolverMultistepScheduler, dict(algorithm_type="sde-dpmsolver++", use_karras_sigmas=True)],
+        SDSampler.dpm_plus_plus_sde: [DPMSolverSinglestepScheduler],
+        SDSampler.dpm_plus_plus_sde_karras: [DPMSolverSinglestepScheduler, dict(use_karras_sigmas=True)],
+        SDSampler.dpm2: [KDPM2DiscreteScheduler],
+        SDSampler.dpm2_karras: [KDPM2DiscreteScheduler, dict(use_karras_sigmas=True)],
+        SDSampler.dpm2_a: [KDPM2AncestralDiscreteScheduler],
+        SDSampler.dpm2_a_karras: [KDPM2AncestralDiscreteScheduler, dict(use_karras_sigmas=True)],
+        SDSampler.euler: [EulerDiscreteScheduler],
+        SDSampler.euler_a: [EulerAncestralDiscreteScheduler],
+        SDSampler.heun: [HeunDiscreteScheduler],
+        SDSampler.lms: [LMSDiscreteScheduler],
+        SDSampler.lms_karras: [LMSDiscreteScheduler, dict(use_karras_sigmas=True)],
+        SDSampler.ddim: [DDIMScheduler],
+        SDSampler.pndm: [PNDMScheduler],
+        SDSampler.uni_pc: [UniPCMultistepScheduler],
+        SDSampler.lcm: [LCMScheduler],
+    }
+    # fmt: on
+    if sd_sampler in samplers:
+        if len(samplers[sd_sampler]) == 2:
+            scheduler_cls, kwargs = samplers[sd_sampler]
+        else:
+            scheduler_cls, kwargs = samplers[sd_sampler][0], {}
+        return scheduler_cls.from_config(scheduler_config, **kwargs)
+    else:
+        raise ValueError(sd_sampler)
+def is_local_files_only(**kwargs) -> bool:
+    from huggingface_hub.constants import HF_HUB_OFFLINE
+    return HF_HUB_OFFLINE or kwargs.get("local_files_only", False)
+def handle_from_pretrained_exceptions(func, **kwargs):
+    try:
+        return func(**kwargs)
+    except ValueError as e:
+        if "You are trying to load the model files of the `variant=fp16`" in str(e):
+            logger.info("variant=fp16 not found, try revision=fp16")
+            try:
+                return func(**{**kwargs, "variant": None, "revision": "fp16"})
+            except Exception as e:
+                logger.info("revision=fp16 not found, try revision=main")
+                return func(**{**kwargs, "variant": None, "revision": "main"})
+        raise e
+    except OSError as e:
+        previous_traceback = traceback.format_exc()
+        if "RevisionNotFoundError: 404 Client Error." in previous_traceback:
+            logger.info("revision=fp16 not found, try revision=main")
+            return func(**{**kwargs, "variant": None, "revision": "main"})
+        elif "Max retries exceeded" in previous_traceback:
+            logger.exception(
+                "Fetching model from HuggingFace failed. "
+                "If this is your first time downloading the model, you may need to set up proxy in terminal."
+                "If the model has already been downloaded, you can add --local-files-only when starting."
+            )
+            exit(-1)
+        raise e
+    except Exception as e:
+        raise e
+def get_torch_dtype(device, no_half: bool):
+    device = str(device)
+    use_fp16 = not no_half
+    use_gpu = device == "cuda"
+    # https://github.com/huggingface/diffusers/issues/4480
+    # pipe.enable_attention_slicing and float16 will cause black output on mps
+    # if device in ["cuda", "mps"] and use_fp16:
+    if device in ["cuda"] and use_fp16:
+        return use_gpu, torch.float16
+    return use_gpu, torch.float32
+def enable_low_mem(pipe, enable: bool):
+    if torch.backends.mps.is_available():
+        # https://huggingface.co/docs/diffusers/v0.25.0/en/api/pipelines/stable_diffusion/image_variation#diffusers.StableDiffusionImageVariationPipeline.enable_attention_slicing
+        # CUDA: Don't enable attention slicing if you're already using `scaled_dot_product_attention` (SDPA) from PyTorch 2.0 or xFormers.
+        if enable:
+            pipe.enable_attention_slicing("max")
+        else:
+            # https://huggingface.co/docs/diffusers/optimization/mps
+            # Devices with less than 64GB of memory are recommended to use enable_attention_slicing
+            pipe.enable_attention_slicing()
+    if enable:
+        pipe.vae.enable_tiling()

iopaint/model/zits.py ADDED Viewed

	@@ -0,0 +1,476 @@

+import os
+import time
+import cv2
+import torch
+import torch.nn.functional as F
+from iopaint.helper import get_cache_path_by_url, load_jit_model, download_model
+from iopaint.schema import InpaintRequest
+import numpy as np
+from .base import InpaintModel
+# Download location and MD5 checksum for each of the four ZITS model files.
+# Every value can be overridden through the environment variable of the same
+# name; the literals below are the defaults. The pairs are consumed by
+# load_jit_model / download_model in the ZITS class.
+# Inpainting network.
+ZITS_INPAINT_MODEL_URL = os.environ.get(
+    "ZITS_INPAINT_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-inpaint-0717.pt",
+)
+ZITS_INPAINT_MODEL_MD5 = os.environ.get(
+    "ZITS_INPAINT_MODEL_MD5", "9978cc7157dc29699e42308d675b2154"
+)
+# Edge/line prediction network.
+ZITS_EDGE_LINE_MODEL_URL = os.environ.get(
+    "ZITS_EDGE_LINE_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-edge-line-0717.pt",
+)
+ZITS_EDGE_LINE_MODEL_MD5 = os.environ.get(
+    "ZITS_EDGE_LINE_MODEL_MD5", "55e31af21ba96bbf0c80603c76ea8c5f"
+)
+# Structure upsampling network (applied to edge/line maps larger than 256 px).
+ZITS_STRUCTURE_UPSAMPLE_MODEL_URL = os.environ.get(
+    "ZITS_STRUCTURE_UPSAMPLE_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-structure-upsample-0717.pt",
+)
+ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5 = os.environ.get(
+    "ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5", "3d88a07211bd41b2ec8cc0d999f29927"
+)
+# Wireframe (line detection) network.
+ZITS_WIRE_FRAME_MODEL_URL = os.environ.get(
+    "ZITS_WIRE_FRAME_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-wireframe-0717.pt",
+)
+ZITS_WIRE_FRAME_MODEL_MD5 = os.environ.get(
+    "ZITS_WIRE_FRAME_MODEL_MD5", "a9727c63a8b48b65c905d351b21ce46b"
+)
+def resize(img, height, width, center_crop=False):
+    imgh, imgw = img.shape[0:2]
+    if center_crop and imgh != imgw:
+        # center crop
+        side = np.minimum(imgh, imgw)
+        j = (imgh - side) // 2
+        i = (imgw - side) // 2
+        img = img[j : j + side, i : i + side, ...]
+    if imgh > height and imgw > width:
+        inter = cv2.INTER_AREA
+    else:
+        inter = cv2.INTER_LINEAR
+    img = cv2.resize(img, (height, width), interpolation=inter)
+    return img
+def to_tensor(img, scale=True, norm=False):
+    if img.ndim == 2:
+        img = img[:, :, np.newaxis]
+    c = img.shape[-1]
+    if scale:
+        img_t = torch.from_numpy(img).permute(2, 0, 1).float().div(255)
+    else:
+        img_t = torch.from_numpy(img).permute(2, 0, 1).float()
+    if norm:
+        mean = torch.tensor([0.5, 0.5, 0.5]).reshape(c, 1, 1)
+        std = torch.tensor([0.5, 0.5, 0.5]).reshape(c, 1, 1)
+        img_t = (img_t - mean) / std
+    return img_t
+def load_masked_position_encoding(mask):
+    ones_filter = np.ones((3, 3), dtype=np.float32)
+    d_filter1 = np.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]], dtype=np.float32)
+    d_filter2 = np.array([[0, 0, 0], [1, 1, 0], [1, 1, 0]], dtype=np.float32)
+    d_filter3 = np.array([[0, 1, 1], [0, 1, 1], [0, 0, 0]], dtype=np.float32)
+    d_filter4 = np.array([[0, 0, 0], [0, 1, 1], [0, 1, 1]], dtype=np.float32)
+    str_size = 256
+    pos_num = 128
+    ori_mask = mask.copy()
+    ori_h, ori_w = ori_mask.shape[0:2]
+    ori_mask = ori_mask / 255
+    mask = cv2.resize(mask, (str_size, str_size), interpolation=cv2.INTER_AREA)
+    mask[mask > 0] = 255
+    h, w = mask.shape[0:2]
+    mask3 = mask.copy()
+    mask3 = 1.0 - (mask3 / 255.0)
+    pos = np.zeros((h, w), dtype=np.int32)
+    direct = np.zeros((h, w, 4), dtype=np.int32)
+    i = 0
+    while np.sum(1 - mask3) > 0:
+        i += 1
+        mask3_ = cv2.filter2D(mask3, -1, ones_filter)
+        mask3_[mask3_ > 0] = 1
+        sub_mask = mask3_ - mask3
+        pos[sub_mask == 1] = i
+        m = cv2.filter2D(mask3, -1, d_filter1)
+        m[m > 0] = 1
+        m = m - mask3
+        direct[m == 1, 0] = 1
+        m = cv2.filter2D(mask3, -1, d_filter2)
+        m[m > 0] = 1
+        m = m - mask3
+        direct[m == 1, 1] = 1
+        m = cv2.filter2D(mask3, -1, d_filter3)
+        m[m > 0] = 1
+        m = m - mask3
+        direct[m == 1, 2] = 1
+        m = cv2.filter2D(mask3, -1, d_filter4)
+        m[m > 0] = 1
+        m = m - mask3
+        direct[m == 1, 3] = 1
+        mask3 = mask3_
+    abs_pos = pos.copy()
+    rel_pos = pos / (str_size / 2)  # to 0~1 maybe larger than 1
+    rel_pos = (rel_pos * pos_num).astype(np.int32)
+    rel_pos = np.clip(rel_pos, 0, pos_num - 1)
+    if ori_w != w or ori_h != h:
+        rel_pos = cv2.resize(rel_pos, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST)
+        rel_pos[ori_mask == 0] = 0
+        direct = cv2.resize(direct, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST)
+        direct[ori_mask == 0, :] = 0
+    return rel_pos, abs_pos, direct
+def load_image(img, mask, device, sigma256=3.0):
+    """
+    Args:
+        img: [H, W, C] RGB
+        mask: [H, W] 255 为 masks 区域
+        sigma256:
+    Returns:
+    """
+    h, w, _ = img.shape
+    imgh, imgw = img.shape[0:2]
+    img_256 = resize(img, 256, 256)
+    mask = (mask > 127).astype(np.uint8) * 255
+    mask_256 = cv2.resize(mask, (256, 256), interpolation=cv2.INTER_AREA)
+    mask_256[mask_256 > 0] = 255
+    mask_512 = cv2.resize(mask, (512, 512), interpolation=cv2.INTER_AREA)
+    mask_512[mask_512 > 0] = 255
+    # original skimage implemention
+    # https://scikit-image.org/docs/stable/api/skimage.feature.html#skimage.feature.canny
+    # low_threshold: Lower bound for hysteresis thresholding (linking edges). If None, low_threshold is set to 10% of dtype’s max.
+    # high_threshold: Upper bound for hysteresis thresholding (linking edges). If None, high_threshold is set to 20% of dtype’s max.
+    try:
+        import skimage
+        gray_256 = skimage.color.rgb2gray(img_256)
+        edge_256 = skimage.feature.canny(gray_256, sigma=3.0, mask=None).astype(float)
+        # cv2.imwrite("skimage_gray.jpg", (gray_256*255).astype(np.uint8))
+        # cv2.imwrite("skimage_edge.jpg", (edge_256*255).astype(np.uint8))
+    except:
+        gray_256 = cv2.cvtColor(img_256, cv2.COLOR_RGB2GRAY)
+        gray_256_blured = cv2.GaussianBlur(
+            gray_256, ksize=(7, 7), sigmaX=sigma256, sigmaY=sigma256
+        )
+        edge_256 = cv2.Canny(
+            gray_256_blured, threshold1=int(255 * 0.1), threshold2=int(255 * 0.2)
+        )
+    # cv2.imwrite("opencv_edge.jpg", edge_256)
+    # line
+    img_512 = resize(img, 512, 512)
+    rel_pos, abs_pos, direct = load_masked_position_encoding(mask)
+    batch = dict()
+    batch["images"] = to_tensor(img.copy()).unsqueeze(0).to(device)
+    batch["img_256"] = to_tensor(img_256, norm=True).unsqueeze(0).to(device)
+    batch["masks"] = to_tensor(mask).unsqueeze(0).to(device)
+    batch["mask_256"] = to_tensor(mask_256).unsqueeze(0).to(device)
+    batch["mask_512"] = to_tensor(mask_512).unsqueeze(0).to(device)
+    batch["edge_256"] = to_tensor(edge_256, scale=False).unsqueeze(0).to(device)
+    batch["img_512"] = to_tensor(img_512).unsqueeze(0).to(device)
+    batch["rel_pos"] = torch.LongTensor(rel_pos).unsqueeze(0).to(device)
+    batch["abs_pos"] = torch.LongTensor(abs_pos).unsqueeze(0).to(device)
+    batch["direct"] = torch.LongTensor(direct).unsqueeze(0).to(device)
+    batch["h"] = imgh
+    batch["w"] = imgw
+    return batch
+def to_device(data, device):
+    if isinstance(data, torch.Tensor):
+        return data.to(device)
+    if isinstance(data, dict):
+        for key in data:
+            if isinstance(data[key], torch.Tensor):
+                data[key] = data[key].to(device)
+        return data
+    if isinstance(data, list):
+        return [to_device(d, device) for d in data]
+class ZITS(InpaintModel):
+    name = "zits"
+    min_size = 256
+    pad_mod = 32
+    pad_to_square = True
+    is_erase_model = True
+    def __init__(self, device, **kwargs):
+        """
+        Args:
+            device:
+        """
+        super().__init__(device)
+        self.device = device
+        self.sample_edge_line_iterations = 1
+    def init_model(self, device, **kwargs):
+        self.wireframe = load_jit_model(
+            ZITS_WIRE_FRAME_MODEL_URL, device, ZITS_WIRE_FRAME_MODEL_MD5
+        )
+        self.edge_line = load_jit_model(
+            ZITS_EDGE_LINE_MODEL_URL, device, ZITS_EDGE_LINE_MODEL_MD5
+        )
+        self.structure_upsample = load_jit_model(
+            ZITS_STRUCTURE_UPSAMPLE_MODEL_URL, device, ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5
+        )
+        self.inpaint = load_jit_model(
+            ZITS_INPAINT_MODEL_URL, device, ZITS_INPAINT_MODEL_MD5
+        )
+    @staticmethod
+    def download():
+        download_model(ZITS_WIRE_FRAME_MODEL_URL, ZITS_WIRE_FRAME_MODEL_MD5)
+        download_model(ZITS_EDGE_LINE_MODEL_URL, ZITS_EDGE_LINE_MODEL_MD5)
+        download_model(
+            ZITS_STRUCTURE_UPSAMPLE_MODEL_URL, ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5
+        )
+        download_model(ZITS_INPAINT_MODEL_URL, ZITS_INPAINT_MODEL_MD5)
+    @staticmethod
+    def is_downloaded() -> bool:
+        model_paths = [
+            get_cache_path_by_url(ZITS_WIRE_FRAME_MODEL_URL),
+            get_cache_path_by_url(ZITS_EDGE_LINE_MODEL_URL),
+            get_cache_path_by_url(ZITS_STRUCTURE_UPSAMPLE_MODEL_URL),
+            get_cache_path_by_url(ZITS_INPAINT_MODEL_URL),
+        ]
+        return all([os.path.exists(it) for it in model_paths])
+    def wireframe_edge_and_line(self, items, enable: bool):
+        # 最终向 items 中添加 edge 和 line key
+        if not enable:
+            items["edge"] = torch.zeros_like(items["masks"])
+            items["line"] = torch.zeros_like(items["masks"])
+            return
+        start = time.time()
+        try:
+            line_256 = self.wireframe_forward(
+                items["img_512"],
+                h=256,
+                w=256,
+                masks=items["mask_512"],
+                mask_th=0.85,
+            )
+        except:
+            line_256 = torch.zeros_like(items["mask_256"])
+        print(f"wireframe_forward time: {(time.time() - start) * 1000:.2f}ms")
+        # np_line = (line[0][0].numpy() * 255).astype(np.uint8)
+        # cv2.imwrite("line.jpg", np_line)
+        start = time.time()
+        edge_pred, line_pred = self.sample_edge_line_logits(
+            context=[items["img_256"], items["edge_256"], line_256],
+            mask=items["mask_256"].clone(),
+            iterations=self.sample_edge_line_iterations,
+            add_v=0.05,
+            mul_v=4,
+        )
+        print(f"sample_edge_line_logits time: {(time.time() - start) * 1000:.2f}ms")
+        # np_edge_pred = (edge_pred[0][0].numpy() * 255).astype(np.uint8)
+        # cv2.imwrite("edge_pred.jpg", np_edge_pred)
+        # np_line_pred = (line_pred[0][0].numpy() * 255).astype(np.uint8)
+        # cv2.imwrite("line_pred.jpg", np_line_pred)
+        # exit()
+        input_size = min(items["h"], items["w"])
+        if input_size != 256 and input_size > 256:
+            while edge_pred.shape[2] < input_size:
+                edge_pred = self.structure_upsample(edge_pred)
+                edge_pred = torch.sigmoid((edge_pred + 2) * 2)
+                line_pred = self.structure_upsample(line_pred)
+                line_pred = torch.sigmoid((line_pred + 2) * 2)
+            edge_pred = F.interpolate(
+                edge_pred,
+                size=(input_size, input_size),
+                mode="bilinear",
+                align_corners=False,
+            )
+            line_pred = F.interpolate(
+                line_pred,
+                size=(input_size, input_size),
+                mode="bilinear",
+                align_corners=False,
+            )
+        # np_edge_pred = (edge_pred[0][0].numpy() * 255).astype(np.uint8)
+        # cv2.imwrite("edge_pred_upsample.jpg", np_edge_pred)
+        # np_line_pred = (line_pred[0][0].numpy() * 255).astype(np.uint8)
+        # cv2.imwrite("line_pred_upsample.jpg", np_line_pred)
+        # exit()
+        items["edge"] = edge_pred.detach()
+        items["line"] = line_pred.detach()
+    @torch.no_grad()
+    def forward(self, image, mask, config: InpaintRequest):
+        """Input images and output images have same size
+        images: [H, W, C] RGB
+        masks: [H, W]
+        return: BGR IMAGE
+        """
+        mask = mask[:, :, 0]
+        items = load_image(image, mask, device=self.device)
+        self.wireframe_edge_and_line(items, config.zits_wireframe)
+        inpainted_image = self.inpaint(
+            items["images"],
+            items["masks"],
+            items["edge"],
+            items["line"],
+            items["rel_pos"],
+            items["direct"],
+        )
+        inpainted_image = inpainted_image * 255.0
+        inpainted_image = (
+            inpainted_image.cpu().permute(0, 2, 3, 1)[0].numpy().astype(np.uint8)
+        )
+        inpainted_image = inpainted_image[:, :, ::-1]
+        # cv2.imwrite("inpainted.jpg", inpainted_image)
+        # exit()
+        return inpainted_image
+    def wireframe_forward(self, images, h, w, masks, mask_th=0.925):
+        lcnn_mean = torch.tensor([109.730, 103.832, 98.681]).reshape(1, 3, 1, 1)
+        lcnn_std = torch.tensor([22.275, 22.124, 23.229]).reshape(1, 3, 1, 1)
+        images = images * 255.0
+        # the masks value of lcnn is 127.5
+        masked_images = images * (1 - masks) + torch.ones_like(images) * masks * 127.5
+        masked_images = (masked_images - lcnn_mean) / lcnn_std
+        def to_int(x):
+            return tuple(map(int, x))
+        lines_tensor = []
+        lmap = np.zeros((h, w))
+        output_masked = self.wireframe(masked_images)
+        output_masked = to_device(output_masked, "cpu")
+        if output_masked["num_proposals"] == 0:
+            lines_masked = []
+            scores_masked = []
+        else:
+            lines_masked = output_masked["lines_pred"].numpy()
+            lines_masked = [
+                [line[1] * h, line[0] * w, line[3] * h, line[2] * w]
+                for line in lines_masked
+            ]
+            scores_masked = output_masked["lines_score"].numpy()
+        for line, score in zip(lines_masked, scores_masked):
+            if score > mask_th:
+                try:
+                    import skimage
+                    rr, cc, value = skimage.draw.line_aa(
+                        *to_int(line[0:2]), *to_int(line[2:4])
+                    )
+                    lmap[rr, cc] = np.maximum(lmap[rr, cc], value)
+                except:
+                    cv2.line(
+                        lmap,
+                        to_int(line[0:2][::-1]),
+                        to_int(line[2:4][::-1]),
+                        (1, 1, 1),
+                        1,
+                        cv2.LINE_AA,
+                    )
+        lmap = np.clip(lmap * 255, 0, 255).astype(np.uint8)
+        lines_tensor.append(to_tensor(lmap).unsqueeze(0))
+        lines_tensor = torch.cat(lines_tensor, dim=0)
+        return lines_tensor.detach().to(self.device)
+    def sample_edge_line_logits(
+        self, context, mask=None, iterations=1, add_v=0, mul_v=4
+    ):
+        [img, edge, line] = context
+        img = img * (1 - mask)
+        edge = edge * (1 - mask)
+        line = line * (1 - mask)
+        for i in range(iterations):
+            edge_logits, line_logits = self.edge_line(img, edge, line, masks=mask)
+            edge_pred = torch.sigmoid(edge_logits)
+            line_pred = torch.sigmoid((line_logits + add_v) * mul_v)
+            edge = edge + edge_pred * mask
+            edge[edge >= 0.25] = 1
+            edge[edge < 0.25] = 0
+            line = line + line_pred * mask
+            b, _, h, w = edge_pred.shape
+            edge_pred = edge_pred.reshape(b, -1, 1)
+            line_pred = line_pred.reshape(b, -1, 1)
+            mask = mask.reshape(b, -1)
+            edge_probs = torch.cat([1 - edge_pred, edge_pred], dim=-1)
+            line_probs = torch.cat([1 - line_pred, line_pred], dim=-1)
+            edge_probs[:, :, 1] += 0.5
+            line_probs[:, :, 1] += 0.5
+            edge_max_probs = edge_probs.max(dim=-1)[0] + (1 - mask) * (-100)
+            line_max_probs = line_probs.max(dim=-1)[0] + (1 - mask) * (-100)
+            indices = torch.sort(
+                edge_max_probs + line_max_probs, dim=-1, descending=True
+            )[1]
+            for ii in range(b):
+                keep = int((i + 1) / iterations * torch.sum(mask[ii, ...]))
+                assert torch.sum(mask[ii][indices[ii, :keep]]) == keep, "Error!!!"
+                mask[ii][indices[ii, :keep]] = 0
+            mask = mask.reshape(b, 1, h, w)
+            edge = edge * (1 - mask)
+            line = line * (1 - mask)
+        edge, line = edge.to(torch.float32), line.to(torch.float32)
+        return edge, line

iopaint/plugins/segment_anything/modeling/tiny_vit_sam.py ADDED Viewed

	@@ -0,0 +1,822 @@

+# --------------------------------------------------------
+# TinyViT Model Architecture
+# Copyright (c) 2022 Microsoft
+# Adapted from LeViT and Swin Transformer
+#   LeViT: (https://github.com/facebookresearch/levit)
+#   Swin: (https://github.com/microsoft/swin-transformer)
+# Build the TinyViT Model
+# --------------------------------------------------------
+import collections
+import itertools
+import math
+import warnings
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from typing import Tuple
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return x
+        return tuple(itertools.repeat(x, n))
+    return parse
+to_2tuple = _ntuple(2)
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    tensor.erfinv_()
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+    # Clamp to ensure it's in the proper range
+    tensor.clamp_(min=a, max=b)
+    return tensor
+def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+    # type: (Tensor, float, float, float, float) -> Tensor
+    r"""Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \leq \text{mean} \leq b`.
+    NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are
+    applied while sampling the normal with mean/std applied, therefore a, b args
+    should be adjusted to match the range of mean, std args.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Examples:
+        >>> w = torch.empty(3, 5)
+        >>> nn.init.trunc_normal_(w)
+    """
+    with torch.no_grad():
+        return _trunc_normal_(tensor, mean, std, a, b)
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+class TimmDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(TimmDropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob,3):0.3f}"
+class Conv2d_BN(torch.nn.Sequential):
+    def __init__(
+        self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1
+    ):
+        super().__init__()
+        self.add_module(
+            "c", torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False)
+        )
+        bn = torch.nn.BatchNorm2d(b)
+        torch.nn.init.constant_(bn.weight, bn_weight_init)
+        torch.nn.init.constant_(bn.bias, 0)
+        self.add_module("bn", bn)
+    @torch.no_grad()
+    def fuse(self):
+        c, bn = self._modules.values()
+        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
+        w = c.weight * w[:, None, None, None]
+        b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5
+        m = torch.nn.Conv2d(
+            w.size(1) * self.c.groups,
+            w.size(0),
+            w.shape[2:],
+            stride=self.c.stride,
+            padding=self.c.padding,
+            dilation=self.c.dilation,
+            groups=self.c.groups,
+        )
+        m.weight.data.copy_(w)
+        m.bias.data.copy_(b)
+        return m
+class DropPath(TimmDropPath):
+    def __init__(self, drop_prob=None):
+        super().__init__(drop_prob=drop_prob)
+        self.drop_prob = drop_prob
+    def __repr__(self):
+        msg = super().__repr__()
+        msg += f"(drop_prob={self.drop_prob})"
+        return msg
+class PatchEmbed(nn.Module):
+    def __init__(self, in_chans, embed_dim, resolution, activation):
+        super().__init__()
+        img_size: Tuple[int, int] = to_2tuple(resolution)
+        self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
+        self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        n = embed_dim
+        self.seq = nn.Sequential(
+            Conv2d_BN(in_chans, n // 2, 3, 2, 1),
+            activation(),
+            Conv2d_BN(n // 2, n, 3, 2, 1),
+        )
+    def forward(self, x):
+        return self.seq(x)
+class MBConv(nn.Module):
+    def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
+        super().__init__()
+        self.in_chans = in_chans
+        self.hidden_chans = int(in_chans * expand_ratio)
+        self.out_chans = out_chans
+        self.conv1 = Conv2d_BN(in_chans, self.hidden_chans, ks=1)
+        self.act1 = activation()
+        self.conv2 = Conv2d_BN(
+            self.hidden_chans,
+            self.hidden_chans,
+            ks=3,
+            stride=1,
+            pad=1,
+            groups=self.hidden_chans,
+        )
+        self.act2 = activation()
+        self.conv3 = Conv2d_BN(self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0)
+        self.act3 = activation()
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1(x)
+        x = self.act1(x)
+        x = self.conv2(x)
+        x = self.act2(x)
+        x = self.conv3(x)
+        x = self.drop_path(x)
+        x += shortcut
+        x = self.act3(x)
+        return x
+class PatchMerging(nn.Module):
+    def __init__(self, input_resolution, dim, out_dim, activation):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.dim = dim
+        self.out_dim = out_dim
+        self.act = activation()
+        self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
+        stride_c = 2
+        if out_dim == 320 or out_dim == 448 or out_dim == 576:
+            stride_c = 1
+        self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
+        self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
+    def forward(self, x):
+        if x.ndim == 3:
+            H, W = self.input_resolution
+            B = len(x)
+            # (B, C, H, W)
+            x = x.view(B, H, W, -1).permute(0, 3, 1, 2)
+        x = self.conv1(x)
+        x = self.act(x)
+        x = self.conv2(x)
+        x = self.act(x)
+        x = self.conv3(x)
+        x = x.flatten(2).transpose(1, 2)
+        return x
+class ConvLayer(nn.Module):
+    def __init__(
+        self,
+        dim,
+        input_resolution,
+        depth,
+        activation,
+        drop_path=0.0,
+        downsample=None,
+        use_checkpoint=False,
+        out_dim=None,
+        conv_expand_ratio=4.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # build blocks
+        self.blocks = nn.ModuleList(
+            [
+                MBConv(
+                    dim,
+                    dim,
+                    conv_expand_ratio,
+                    activation,
+                    drop_path[i] if isinstance(drop_path, list) else drop_path,
+                )
+                for i in range(depth)
+            ]
+        )
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(
+                input_resolution, dim=dim, out_dim=out_dim, activation=activation
+            )
+        else:
+            self.downsample = None
+    def forward(self, x):
+        for blk in self.blocks:
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+        if self.downsample is not None:
+            x = self.downsample(x)
+        return x
+class Mlp(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.norm = nn.LayerNorm(in_features)
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.act = act_layer()
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.norm(x)
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class Attention(torch.nn.Module):
+    def __init__(
+        self,
+        dim,
+        key_dim,
+        num_heads=8,
+        attn_ratio=4,
+        resolution=(14, 14),
+    ):
+        super().__init__()
+        # (h, w)
+        assert isinstance(resolution, tuple) and len(resolution) == 2
+        self.num_heads = num_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * num_heads
+        self.attn_ratio = attn_ratio
+        h = self.dh + nh_kd * 2
+        self.norm = nn.LayerNorm(dim)
+        self.qkv = nn.Linear(dim, h)
+        self.proj = nn.Linear(self.dh, dim)
+        points = list(itertools.product(range(resolution[0]), range(resolution[1])))
+        N = len(points)
+        attention_offsets = {}
+        idxs = []
+        for p1 in points:
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+        self.attention_biases = torch.nn.Parameter(
+            torch.zeros(num_heads, len(attention_offsets))
+        )
+        self.register_buffer(
+            "attention_bias_idxs", torch.LongTensor(idxs).view(N, N), persistent=False
+        )
+    @torch.no_grad()
+    def train(self, mode=True):
+        super().train(mode)
+        if mode and hasattr(self, "ab"):
+            del self.ab
+        else:
+            self.register_buffer(
+                "ab",
+                self.attention_biases[:, self.attention_bias_idxs],
+                persistent=False,
+            )
+    def forward(self, x):  # x (B,N,C)
+        B, N, _ = x.shape
+        # Normalization
+        x = self.norm(x)
+        qkv = self.qkv(x)
+        # (B, N, num_heads, d)
+        q, k, v = qkv.view(B, N, self.num_heads, -1).split(
+            [self.key_dim, self.key_dim, self.d], dim=3
+        )
+        # (B, num_heads, N, d)
+        q = q.permute(0, 2, 1, 3)
+        k = k.permute(0, 2, 1, 3)
+        v = v.permute(0, 2, 1, 3)
+        attn = (q @ k.transpose(-2, -1)) * self.scale + (
+            self.attention_biases[:, self.attention_bias_idxs]
+            if self.training
+            else self.ab
+        )
+        attn = attn.softmax(dim=-1)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
+        x = self.proj(x)
+        return x
+class TinyViTBlock(nn.Module):
+    r"""TinyViT Block.
+    Window attention with a residual connection, then a depthwise
+    ``Conv2d_BN`` on the token grid, then an MLP with a residual connection.
+    Args:
+        dim (int): Number of input channels.
+        input_resolution (tuple[int, int]): Input resolution.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        local_conv_size (int): the kernel size of the convolution between
+                               Attention and MLP. Default: 3
+        activation: the activation function. Default: nn.GELU
+    """
+    def __init__(
+        self,
+        dim,
+        input_resolution,
+        num_heads,
+        window_size=7,
+        mlp_ratio=4.0,
+        drop=0.0,
+        drop_path=0.0,
+        local_conv_size=3,
+        activation=nn.GELU,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.num_heads = num_heads
+        assert window_size > 0, "window_size must be greater than 0"
+        self.window_size = window_size
+        self.mlp_ratio = mlp_ratio
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        assert dim % num_heads == 0, "dim must be divisible by num_heads"
+        head_dim = dim // num_heads
+        window_resolution = (window_size, window_size)
+        # attn_ratio=1: values get the same per-head width as queries/keys.
+        self.attn = Attention(
+            dim, head_dim, num_heads, attn_ratio=1, resolution=window_resolution
+        )
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        mlp_activation = activation
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=mlp_activation,
+            drop=drop,
+        )
+        # Depthwise convolution (groups=dim) applied between attention and MLP.
+        pad = local_conv_size // 2
+        self.local_conv = Conv2d_BN(
+            dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim
+        )
+    def forward(self, x):
+        """Process tokens ``x`` of shape (B, H*W, C) and return the same shape."""
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+        res_x = x
+        if H == self.window_size and W == self.window_size:
+            # The whole feature map is a single window.
+            x = self.attn(x)
+        else:
+            x = x.view(B, H, W, C)
+            # Amount of bottom/right padding needed to reach a multiple of window_size.
+            pad_b = (self.window_size - H % self.window_size) % self.window_size
+            pad_r = (self.window_size - W % self.window_size) % self.window_size
+            padding = pad_b > 0 or pad_r > 0
+            if padding:
+                # F.pad pairs run from the last dim backwards: C (0, 0), W (0, pad_r), H (0, pad_b).
+                x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
+            pH, pW = H + pad_b, W + pad_r
+            nH = pH // self.window_size
+            nW = pW // self.window_size
+            # window partition
+            x = (
+                x.view(B, nH, self.window_size, nW, self.window_size, C)
+                .transpose(2, 3)
+                .reshape(B * nH * nW, self.window_size * self.window_size, C)
+            )
+            x = self.attn(x)
+            # window reverse
+            x = (
+                x.view(B, nH, nW, self.window_size, self.window_size, C)
+                .transpose(2, 3)
+                .reshape(B, pH, pW, C)
+            )
+            if padding:
+                # Crop the padded rows/columns away again.
+                x = x[:, :H, :W].contiguous()
+            x = x.view(B, L, C)
+        # Residual connection around the attention branch.
+        x = res_x + self.drop_path(x)
+        # Tokens -> (B, C, H, W) for the depthwise convolution, then back to tokens.
+        x = x.transpose(1, 2).reshape(B, C, H, W)
+        x = self.local_conv(x)
+        x = x.view(B, C, L).transpose(1, 2)
+        # Residual connection around the MLP branch.
+        x = x + self.drop_path(self.mlp(x))
+        return x
+    def extra_repr(self) -> str:
+        return (
+            f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
+            f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}"
+        )
+class BasicLayer(nn.Module):
+    """A basic TinyViT layer for one stage.
+    Args:
+        dim (int): Number of input channels.
+        input_resolution (tuple[int]): Input resolution.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        window_size (int): Local window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+        local_conv_size: the kernel size of the depthwise convolution between attention and MLP. Default: 3
+        activation: the activation function. Default: nn.GELU
+        out_dim: the output dimension of the layer. Default: dim
+    """
+    def __init__(
+        self,
+        dim,
+        input_resolution,
+        depth,
+        num_heads,
+        window_size,
+        mlp_ratio=4.0,
+        drop=0.0,
+        drop_path=0.0,
+        downsample=None,
+        use_checkpoint=False,
+        local_conv_size=3,
+        activation=nn.GELU,
+        out_dim=None,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # build blocks
+        self.blocks = nn.ModuleList(
+            [
+                TinyViTBlock(
+                    dim=dim,
+                    input_resolution=input_resolution,
+                    num_heads=num_heads,
+                    window_size=window_size,
+                    mlp_ratio=mlp_ratio,
+                    drop=drop,
+                    drop_path=drop_path[i]
+                    if isinstance(drop_path, list)
+                    else drop_path,
+                    local_conv_size=local_conv_size,
+                    activation=activation,
+                )
+                for i in range(depth)
+            ]
+        )
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(
+                input_resolution, dim=dim, out_dim=out_dim, activation=activation
+            )
+        else:
+            self.downsample = None
+    def forward(self, x):
+        for blk in self.blocks:
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+        if self.downsample is not None:
+            x = self.downsample(x)
+        return x
+    def extra_repr(self) -> str:
+        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
+class LayerNorm2d(nn.Module):
+    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
+class TinyViT(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        in_chans=3,
+        num_classes=1000,
+        embed_dims=[96, 192, 384, 768],
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_sizes=[7, 7, 14, 7],
+        mlp_ratio=4.0,
+        drop_rate=0.0,
+        drop_path_rate=0.1,
+        use_checkpoint=False,
+        mbconv_expand_ratio=4.0,
+        local_conv_size=3,
+        layer_lr_decay=1.0,
+    ):
+        super().__init__()
+        self.img_size = img_size
+        self.num_classes = num_classes
+        self.depths = depths
+        self.num_layers = len(depths)
+        self.mlp_ratio = mlp_ratio
+        activation = nn.GELU
+        self.patch_embed = PatchEmbed(
+            in_chans=in_chans,
+            embed_dim=embed_dims[0],
+            resolution=img_size,
+            activation=activation,
+        )
+        patches_resolution = self.patch_embed.patches_resolution
+        self.patches_resolution = patches_resolution
+        # stochastic depth
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            kwargs = dict(
+                dim=embed_dims[i_layer],
+                input_resolution=(
+                    patches_resolution[0]
+                    // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
+                    patches_resolution[1]
+                    // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
+                ),
+                #   input_resolution=(patches_resolution[0] // (2 ** i_layer),
+                #                     patches_resolution[1] // (2 ** i_layer)),
+                depth=depths[i_layer],
+                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
+                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
+                use_checkpoint=use_checkpoint,
+                out_dim=embed_dims[min(i_layer + 1, len(embed_dims) - 1)],
+                activation=activation,
+            )
+            if i_layer == 0:
+                layer = ConvLayer(
+                    conv_expand_ratio=mbconv_expand_ratio,
+                    **kwargs,
+                )
+            else:
+                layer = BasicLayer(
+                    num_heads=num_heads[i_layer],
+                    window_size=window_sizes[i_layer],
+                    mlp_ratio=self.mlp_ratio,
+                    drop=drop_rate,
+                    local_conv_size=local_conv_size,
+                    **kwargs,
+                )
+            self.layers.append(layer)
+        # Classifier head
+        self.norm_head = nn.LayerNorm(embed_dims[-1])
+        self.head = (
+            nn.Linear(embed_dims[-1], num_classes)
+            if num_classes > 0
+            else torch.nn.Identity()
+        )
+        # init weights
+        self.apply(self._init_weights)
+        self.set_layer_lr_decay(layer_lr_decay)
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                embed_dims[-1],
+                256,
+                kernel_size=1,
+                bias=False,
+            ),
+            LayerNorm2d(256),
+            nn.Conv2d(
+                256,
+                256,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            LayerNorm2d(256),
+        )
+    def set_layer_lr_decay(self, layer_lr_decay):
+        decay_rate = layer_lr_decay
+        # layers -> blocks (depth)
+        depth = sum(self.depths)
+        lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]
+        # print("LR SCALES:", lr_scales)
+        def _set_lr_scale(m, scale):
+            for p in m.parameters():
+                p.lr_scale = scale
+        self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0]))
+        i = 0
+        for layer in self.layers:
+            for block in layer.blocks:
+                block.apply(lambda x: _set_lr_scale(x, lr_scales[i]))
+                i += 1
+            if layer.downsample is not None:
+                layer.downsample.apply(lambda x: _set_lr_scale(x, lr_scales[i - 1]))
+        assert i == depth
+        for m in [self.norm_head, self.head]:
+            m.apply(lambda x: _set_lr_scale(x, lr_scales[-1]))
+        for k, p in self.named_parameters():
+            p.param_name = k
+        def _check_lr_scale(m):
+            for p in m.parameters():
+                assert hasattr(p, "lr_scale"), p.param_name
+        self.apply(_check_lr_scale)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    @torch.jit.ignore
+    def no_weight_decay_keywords(self):
+        return {"attention_biases"}
+    def forward_features(self, x):
+        # x: (N, C, H, W)
+        x = self.patch_embed(x)
+        x = self.layers[0](x)
+        start_i = 1
+        for i in range(start_i, len(self.layers)):
+            layer = self.layers[i]
+            x = layer(x)
+        B, _, C = x.size()
+        x = x.view(B, 64, 64, C)
+        x = x.permute(0, 3, 1, 2)
+        x = self.neck(x)
+        return x
+    def forward(self, x):
+        x = self.forward_features(x)
+        # x = self.norm_head(x)
+        # x = self.head(x)
+        return x

iopaint/plugins/segment_anything/modeling/transformer.py ADDED Viewed

	@@ -0,0 +1,240 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from torch import Tensor, nn
+import math
+from typing import Tuple, Type
+from .common import MLPBlock
+class TwoWayTransformer(nn.Module):
+    def __init__(
+        self,
+        depth: int,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+    ) -> None:
+        """
+        A transformer decoder that attends to an input image using
+        queries whose positional embedding is supplied.
+        Args:
+          depth (int): number of layers in the transformer
+          embedding_dim (int): the channel dimension for the input embeddings
+          num_heads (int): the number of heads for multihead attention. Must
+            divide embedding_dim
+          mlp_dim (int): the channel dimension internal to the MLP block
+          activation (nn.Module): the activation to use in the MLP block
+        """
+        super().__init__()
+        self.depth = depth
+        self.embedding_dim = embedding_dim
+        self.num_heads = num_heads
+        self.mlp_dim = mlp_dim
+        self.layers = nn.ModuleList()
+        for i in range(depth):
+            self.layers.append(
+                TwoWayAttentionBlock(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    mlp_dim=mlp_dim,
+                    activation=activation,
+                    attention_downsample_rate=attention_downsample_rate,
+                    skip_first_layer_pe=(i == 0),
+                )
+            )
+        self.final_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm_final_attn = nn.LayerNorm(embedding_dim)
+    def forward(
+        self,
+        image_embedding: Tensor,
+        image_pe: Tensor,
+        point_embedding: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+          image_embedding (torch.Tensor): image to attend to. Should be shape
+            B x embedding_dim x h x w for any h and w.
+          image_pe (torch.Tensor): the positional encoding to add to the image. Must
+            have the same shape as image_embedding.
+          point_embedding (torch.Tensor): the embedding to add to the query points.
+            Must have shape B x N_points x embedding_dim for any N_points.
+        Returns:
+          torch.Tensor: the processed point_embedding
+          torch.Tensor: the processed image_embedding
+        """
+        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+        bs, c, h, w = image_embedding.shape
+        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+        image_pe = image_pe.flatten(2).permute(0, 2, 1)
+        # Prepare queries
+        queries = point_embedding
+        keys = image_embedding
+        # Apply transformer blocks and final layernorm
+        for layer in self.layers:
+            queries, keys = layer(
+                queries=queries,
+                keys=keys,
+                query_pe=point_embedding,
+                key_pe=image_pe,
+            )
+        # Apply the final attenion layer from the points to the image
+        q = queries + point_embedding
+        k = keys + image_pe
+        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm_final_attn(queries)
+        return queries, keys
+class TwoWayAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int = 2048,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+        skip_first_layer_pe: bool = False,
+    ) -> None:
+        """
+        A transformer block with four layers: (1) self-attention of sparse
+        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+        block on sparse inputs, and (4) cross attention of dense inputs to sparse
+        inputs.
+        Arguments:
+          embedding_dim (int): the channel dimension of the embeddings
+          num_heads (int): the number of heads in the attention layers
+          mlp_dim (int): the hidden dimension of the mlp block
+          activation (nn.Module): the activation of the mlp block
+          skip_first_layer_pe (bool): skip the PE on the first layer
+        """
+        super().__init__()
+        self.self_attn = Attention(embedding_dim, num_heads)
+        self.norm1 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm2 = nn.LayerNorm(embedding_dim)
+        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
+        self.norm3 = nn.LayerNorm(embedding_dim)
+        self.norm4 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_image_to_token = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.skip_first_layer_pe = skip_first_layer_pe
+    def forward(
+        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        # Self attention block
+        if self.skip_first_layer_pe:
+            queries = self.self_attn(q=queries, k=queries, v=queries)
+        else:
+            q = queries + query_pe
+            attn_out = self.self_attn(q=q, k=q, v=queries)
+            queries = queries + attn_out
+        queries = self.norm1(queries)
+        # Cross attention block, tokens attending to image embedding
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm2(queries)
+        # MLP block
+        mlp_out = self.mlp(queries)
+        queries = queries + mlp_out
+        queries = self.norm3(queries)
+        # Cross attention block, image embedding attending to tokens
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
+        keys = keys + attn_out
+        keys = self.norm4(keys)
+        return queries, keys
+class Attention(nn.Module):
+    """
+    An attention layer that allows for downscaling the size of the embedding
+    after projection to queries, keys, and values.
+    """
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        downsample_rate: int = 1,
+    ) -> None:
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.internal_dim = embedding_dim // downsample_rate
+        self.num_heads = num_heads
+        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."
+        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+        b, n, c = x.shape
+        x = x.reshape(b, n, num_heads, c // num_heads)
+        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
+    def _recombine_heads(self, x: Tensor) -> Tensor:
+        b, n_heads, n_tokens, c_per_head = x.shape
+        x = x.transpose(1, 2)
+        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        # Input projections
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        v = self.v_proj(v)
+        # Separate into heads
+        q = self._separate_heads(q, self.num_heads)
+        k = self._separate_heads(k, self.num_heads)
+        v = self._separate_heads(v, self.num_heads)
+        # Attention
+        _, _, _, c_per_head = q.shape
+        attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
+        attn = attn / math.sqrt(c_per_head)
+        attn = torch.softmax(attn, dim=-1)
+        # Get output
+        out = attn @ v
+        out = self._recombine_heads(out)
+        out = self.out_proj(out)
+        return out

iopaint/plugins/segment_anything/utils/transforms.py ADDED Viewed

	@@ -0,0 +1,112 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torchvision.transforms.functional import resize, to_pil_image  # type: ignore
+from copy import deepcopy
+from typing import Tuple
+class ResizeLongestSide:
+    """
+    Resizes images to longest side 'target_length', as well as provides
+    methods for resizing coordinates and boxes. Provides methods for
+    transforming both numpy array and batched torch tensors.
+    """
+    def __init__(self, target_length: int) -> None:
+        self.target_length = target_length
+    def apply_image(self, image: np.ndarray) -> np.ndarray:
+        """
+        Expects a numpy array with shape HxWxC in uint8 format.
+        """
+        target_size = self.get_preprocess_shape(
+            image.shape[0], image.shape[1], self.target_length
+        )
+        return np.array(resize(to_pil_image(image), target_size))
+    def apply_coords(
+        self, coords: np.ndarray, original_size: Tuple[int, ...]
+    ) -> np.ndarray:
+        """
+        Expects a numpy array of length 2 in the final dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.target_length
+        )
+        coords = deepcopy(coords).astype(float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+    def apply_boxes(
+        self, boxes: np.ndarray, original_size: Tuple[int, ...]
+    ) -> np.ndarray:
+        """
+        Expects a numpy array shape Bx4. Requires the original image size
+        in (H, W) format.
+        """
+        boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+    def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
+        """
+        Expects batched images with shape BxCxHxW and float format. This
+        transformation may not exactly match apply_image. apply_image is
+        the transformation expected by the model.
+        """
+        # Expects an image in BCHW format. May not exactly match apply_image.
+        target_size = self.get_preprocess_shape(
+            image.shape[0], image.shape[1], self.target_length
+        )
+        return F.interpolate(
+            image, target_size, mode="bilinear", align_corners=False, antialias=True
+        )
+    def apply_coords_torch(
+        self, coords: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with length 2 in the last dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.target_length
+        )
+        coords = deepcopy(coords).to(torch.float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+    def apply_boxes_torch(
+        self, boxes: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with shape Bx4. Requires the original image
+        size in (H, W) format.
+        """
+        boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+    @staticmethod
+    def get_preprocess_shape(
+        oldh: int, oldw: int, long_side_length: int
+    ) -> Tuple[int, int]:
+        """
+        Compute the output size given input size and target long side length.
+        """
+        scale = long_side_length * 1.0 / max(oldh, oldw)
+        newh, neww = oldh * scale, oldw * scale
+        neww = int(neww + 0.5)
+        newh = int(newh + 0.5)
+        return (newh, neww)

iopaint/tests/test_sdxl.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import os
+from iopaint.tests.utils import check_device, current_dir
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+import pytest
+import torch
+from iopaint.model_manager import ModelManager
+from iopaint.schema import HDStrategy, SDSampler, FREEUConfig
+from iopaint.tests.test_model import get_config, assert_equal
+@pytest.mark.parametrize("device", ["cuda", "mps"])
+@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL])
+@pytest.mark.parametrize("sampler", [SDSampler.ddim])
+def test_sdxl(device, strategy, sampler):
+    sd_steps = check_device(device)
+    model = ModelManager(
+        name="diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+        device=torch.device(device),
+        disable_nsfw=True,
+        sd_cpu_textencoder=False,
+    )
+    cfg = get_config(
+        strategy=strategy,
+        prompt="face of a fox, sitting on a bench",
+        sd_steps=sd_steps,
+        sd_strength=1.0,
+        sd_guidance_scale=7.0,
+    )
+    cfg.sd_sampler = sampler
+    assert_equal(
+        model,
+        cfg,
+        f"sdxl_device_{device}.png",
+        img_p=current_dir / "overture-creations-5sI6fQgYIuo.png",
+        mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png",
+        fx=2,
+        fy=2,
+    )
+@pytest.mark.parametrize("device", ["cuda", "cpu"])
+@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL])
+@pytest.mark.parametrize("sampler", [SDSampler.ddim])
+def test_sdxl_cpu_text_encoder(device, strategy, sampler):
+    sd_steps = check_device(device)
+    model = ModelManager(
+        name="diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+        device=torch.device(device),
+        disable_nsfw=True,
+        sd_cpu_textencoder=True,
+    )
+    cfg = get_config(
+        strategy=strategy,
+        prompt="face of a fox, sitting on a bench",
+        sd_steps=sd_steps,
+        sd_strength=1.0,
+        sd_guidance_scale=7.0,
+    )
+    cfg.sd_sampler = sampler
+    assert_equal(
+        model,
+        cfg,
+        f"sdxl_device_{device}.png",
+        img_p=current_dir / "overture-creations-5sI6fQgYIuo.png",
+        mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png",
+        fx=2,
+        fy=2,
+    )
+@pytest.mark.parametrize("device", ["cuda", "mps"])
+@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL])
+@pytest.mark.parametrize("sampler", [SDSampler.ddim])
+def test_sdxl_lcm_lora_and_freeu(device, strategy, sampler):
+    sd_steps = check_device(device)
+    model = ModelManager(
+        name="diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+        device=torch.device(device),
+        disable_nsfw=True,
+        sd_cpu_textencoder=False,
+    )
+    cfg = get_config(
+        strategy=strategy,
+        prompt="face of a fox, sitting on a bench",
+        sd_steps=sd_steps,
+        sd_strength=1.0,
+        sd_guidance_scale=2.0,
+        sd_lcm_lora=True,
+    )
+    cfg.sd_sampler = sampler
+    name = f"device_{device}_{sampler}"
+    assert_equal(
+        model,
+        cfg,
+        f"sdxl_{name}_lcm_lora.png",
+        img_p=current_dir / "overture-creations-5sI6fQgYIuo.png",
+        mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png",
+        fx=2,
+        fy=2,
+    )
+    cfg = get_config(
+        strategy=strategy,
+        prompt="face of a fox, sitting on a bench",
+        sd_steps=sd_steps,
+        sd_guidance_scale=7.5,
+        sd_freeu=True,
+        sd_freeu_config=FREEUConfig(),
+    )
+    assert_equal(
+        model,
+        cfg,
+        f"sdxl_{name}_freeu_device_{device}.png",
+        img_p=current_dir / "overture-creations-5sI6fQgYIuo.png",
+        mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png",
+        fx=2,
+        fy=2,
+    )
+@pytest.mark.parametrize("device", ["cuda", "mps"])
+@pytest.mark.parametrize(
+    "rect",
+    [
+        [-128, -128, 1024, 1024],
+    ],
+)
+def test_sdxl_outpainting(device, rect):
+    sd_steps = check_device(device)
+    model = ModelManager(
+        name="diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+        device=torch.device(device),
+        disable_nsfw=True,
+        sd_cpu_textencoder=False,
+    )
+    cfg = get_config(
+        strategy=HDStrategy.ORIGINAL,
+        prompt="a dog sitting on a bench in the park",
+        sd_steps=sd_steps,
+        use_extender=True,
+        extender_x=rect[0],
+        extender_y=rect[1],
+        extender_width=rect[2],
+        extender_height=rect[3],
+        sd_strength=1.0,
+        sd_guidance_scale=8.0,
+        sd_sampler=SDSampler.ddim,
+    )
+    assert_equal(
+        model,
+        cfg,
+        f"sdxl_outpainting_dog_ddim_{'_'.join(map(str, rect))}_device_{device}.png",
+        img_p=current_dir / "overture-creations-5sI6fQgYIuo.png",
+        mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png",
+        fx=1.5,
+        fy=1.5,
+    )

iopaint/tests/utils.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from pathlib import Path
+import cv2
+import pytest
+import torch
+from iopaint.helper import encode_pil_to_base64
+from iopaint.schema import LDMSampler, HDStrategy, InpaintRequest, SDSampler
+from PIL import Image
+# Directory containing this module; the default image and mask paths below are built from it.
+current_dir = Path(__file__).parent.absolute().resolve()
+# Folder that assert_equal() writes model outputs into; created at import time if missing.
+save_dir = current_dir / "result"
+save_dir.mkdir(exist_ok=True, parents=True)
+def check_device(device: str) -> int:
+    if device == "cuda" and not torch.cuda.is_available():
+        pytest.skip("CUDA is not available, skip test on cuda")
+    if device == "mps" and not torch.backends.mps.is_available():
+        pytest.skip("mps is not available, skip test on mps")
+    steps = 2 if device == "cpu" else 20
+    return steps
+def assert_equal(
+    model,
+    config: InpaintRequest,
+    gt_name,
+    fx: float = 1,
+    fy: float = 1,
+    img_p=current_dir / "image.png",
+    mask_p=current_dir / "mask.png",
+):
+    img, mask = get_data(fx=fx, fy=fy, img_p=img_p, mask_p=mask_p)
+    print(f"Input image shape: {img.shape}")
+    res = model(img, mask, config)
+    ok = cv2.imwrite(
+        str(save_dir / gt_name),
+        res,
+        [int(cv2.IMWRITE_JPEG_QUALITY), 100, int(cv2.IMWRITE_PNG_COMPRESSION), 0],
+    )
+    assert ok, save_dir / gt_name
+    """
+    Note that JPEG is lossy compression, so even if it is the highest quality 100,
+    when the saved images is reloaded, a difference occurs with the original pixel value.
+    If you want to save the original images as it is, save it as PNG or BMP.
+    """
+    # gt = cv2.imread(str(current_dir / gt_name), cv2.IMREAD_UNCHANGED)
+    # assert np.array_equal(res, gt)
+def get_data(
+    fx: float = 1,
+    fy: float = 1.0,
+    img_p=current_dir / "image.png",
+    mask_p=current_dir / "mask.png",
+):
+    img = cv2.imread(str(img_p))
+    img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
+    mask = cv2.imread(str(mask_p), cv2.IMREAD_GRAYSCALE)
+    img = cv2.resize(img, None, fx=fx, fy=fy, interpolation=cv2.INTER_AREA)
+    mask = cv2.resize(mask, None, fx=fx, fy=fy, interpolation=cv2.INTER_NEAREST)
+    return img, mask
+def get_config(**kwargs):
+    data = dict(
+        sd_sampler=kwargs.get("sd_sampler", SDSampler.uni_pc),
+        ldm_steps=1,
+        ldm_sampler=LDMSampler.plms,
+        hd_strategy=kwargs.get("strategy", HDStrategy.ORIGINAL),
+        hd_strategy_crop_margin=32,
+        hd_strategy_crop_trigger_size=200,
+        hd_strategy_resize_limit=200,
+    )
+    data.update(**kwargs)
+    return InpaintRequest(image="", mask="", **data)

iopaint/web_config.py ADDED Viewed

	@@ -0,0 +1,307 @@

+import json
+import os
+from pathlib import Path
+from iopaint.schema import (
+    Device,
+    InteractiveSegModel,
+    RemoveBGModel,
+    RealESRGANModel,
+    ApiConfig,
+)
+os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
+from datetime import datetime
+from json import JSONDecodeError
+import gradio as gr
+from iopaint.download import scan_models
+from loguru import logger
+from iopaint.const import *
+# Path of the config file being edited; assigned by main() and read by save_config().
+_config_file: Path = None
+# Baseline values: load_config() uses them as-is when the file is missing or is not valid JSON,
+# and otherwise overlays the keys found in the file on top of them.
+default_configs = dict(
+    host="127.0.0.1",
+    port=8080,
+    inbrowser=True,
+    model=DEFAULT_MODEL,
+    model_dir=DEFAULT_MODEL_DIR,
+    no_half=False,
+    low_mem=False,
+    cpu_offload=False,
+    disable_nsfw_checker=False,
+    local_files_only=False,
+    cpu_textencoder=False,
+    device=Device.cuda,
+    input=None,
+    output_dir=None,
+    quality=95,
+    enable_interactive_seg=False,
+    interactive_seg_model=InteractiveSegModel.vit_b,
+    interactive_seg_device=Device.cpu,
+    enable_remove_bg=False,
+    remove_bg_model=RemoveBGModel.briaai_rmbg_1_4,
+    enable_anime_seg=False,
+    enable_realesrgan=False,
+    realesrgan_device=Device.cpu,
+    realesrgan_model=RealESRGANModel.realesr_general_x4v3,
+    enable_gfpgan=False,
+    gfpgan_device=Device.cpu,
+    enable_restoreformer=False,
+    restoreformer_device=Device.cpu,
+)
+# Configuration model handled by this module: ApiConfig with a default for ``model_dir``.
+class WebConfig(ApiConfig):
+    # Model directory; falls back to DEFAULT_MODEL_DIR when not supplied.
+    model_dir: str = DEFAULT_MODEL_DIR
+def load_config(p: Path) -> WebConfig:
+    if p.exists():
+        with open(p, "r", encoding="utf-8") as f:
+            try:
+                return WebConfig(**{**default_configs, **json.load(f)})
+            except JSONDecodeError:
+                print(f"Load config file failed, using default configs")
+                return WebConfig(**default_configs)
+    else:
+        return WebConfig(**default_configs)
+def save_config(
+    host,
+    port,
+    model,
+    model_dir,
+    no_half,
+    low_mem,
+    cpu_offload,
+    disable_nsfw_checker,
+    local_files_only,
+    cpu_textencoder,
+    device,
+    input,
+    output_dir,
+    quality,
+    enable_interactive_seg,
+    interactive_seg_model,
+    interactive_seg_device,
+    enable_remove_bg,
+    remove_bg_model,
+    enable_anime_seg,
+    enable_realesrgan,
+    realesrgan_device,
+    realesrgan_model,
+    enable_gfpgan,
+    gfpgan_device,
+    enable_restoreformer,
+    restoreformer_device,
+    inbrowser,
+):
+    config = WebConfig(**locals())
+    if str(config.input) == ".":
+        config.input = None
+    if str(config.output_dir) == ".":
+        config.output_dir = None
+    config.model = config.model.strip()
+    print(config.model_dump_json(indent=4))
+    if config.input and not os.path.exists(config.input):
+        return "[Error] Input file or directory does not exist"
+    current_time = datetime.now().strftime("%H:%M:%S")
+    msg = f"[{current_time}] Successful save config to: {str(_config_file.absolute())}"
+    logger.info(msg)
+    try:
+        with open(_config_file, "w", encoding="utf-8") as f:
+            f.write(config.model_dump_json(indent=4))
+    except Exception as e:
+        return f"Save configure file failed: {str(e)}"
+    return msg
+def change_current_model(new_model):
+    """Return ``new_model`` unchanged; main() wires this to the dropdown change events to fill the model textbox."""
+    return new_model
+def main(config_file: Path):
+    global _config_file
+    _config_file = config_file
+    init_config = load_config(config_file)
+    downloaded_models = [it.name for it in scan_models()]
+    with gr.Blocks() as demo:
+        with gr.Row():
+            with gr.Column():
+                gr.Textbox(config_file, label="Config file", interactive=False)
+            with gr.Column():
+                save_btn = gr.Button(value="Save configurations")
+                message = gr.HTML()
+        with gr.Tabs():
+            with gr.Tab("Common"):
+                with gr.Row():
+                    host = gr.Textbox(init_config.host, label="Host")
+                    port = gr.Number(init_config.port, label="Port", precision=0)
+                    inbrowser = gr.Checkbox(init_config.inbrowser, label=INBROWSER_HELP)
+                with gr.Column():
+                    model = gr.Textbox(
+                        init_config.model,
+                        label="Current Model. This is the model that will be used when the service starts. "
+                        "If the model has not been downloaded before, it will be automatically downloaded. "
+                        "You can select a model from the dropdown box below or manually enter the SD/SDXL model ID from HuggingFace, for example, runwayml/stable-diffusion-inpainting.",
+                    )
+                with gr.Row():
+                    recommend_model = gr.Dropdown(
+                        ["lama", "mat", "migan"] + DIFFUSION_MODELS,
+                        label="Recommended Models",
+                    )
+                    downloaded_model = gr.Dropdown(
+                        downloaded_models, label="Downloaded Models"
+                    )
+                device = gr.Radio(
+                    Device.values(), label="Device", value=init_config.device
+                )
+                quality = gr.Slider(
+                    value=95,
+                    label=f"Image Quality ({QUALITY_HELP})",
+                    minimum=75,
+                    maximum=100,
+                    step=1,
+                )
+                no_half = gr.Checkbox(init_config.no_half, label=f"{NO_HALF_HELP}")
+                cpu_offload = gr.Checkbox(
+                    init_config.cpu_offload, label=f"{CPU_OFFLOAD_HELP}"
+                )
+                low_mem = gr.Checkbox(init_config.low_mem, label=f"{LOW_MEM_HELP}")
+                cpu_textencoder = gr.Checkbox(
+                    init_config.cpu_textencoder, label=f"{CPU_TEXTENCODER_HELP}"
+                )
+                disable_nsfw_checker = gr.Checkbox(
+                    init_config.disable_nsfw_checker, label=f"{DISABLE_NSFW_HELP}"
+                )
+                local_files_only = gr.Checkbox(
+                    init_config.local_files_only, label=f"{LOCAL_FILES_ONLY_HELP}"
+                )
+                with gr.Column():
+                    model_dir = gr.Textbox(
+                        init_config.model_dir, label=f"{MODEL_DIR_HELP}"
+                    )
+                    input = gr.Textbox(
+                        init_config.input,
+                        label=f"Input file or directory. {INPUT_HELP}",
+                    )
+                    output_dir = gr.Textbox(
+                        init_config.output_dir,
+                        label=f"Output directory. {OUTPUT_DIR_HELP}",
+                    )
+            with gr.Tab("Plugins"):
+                with gr.Row():
+                    enable_interactive_seg = gr.Checkbox(
+                        init_config.enable_interactive_seg, label=INTERACTIVE_SEG_HELP
+                    )
+                    interactive_seg_model = gr.Radio(
+                        InteractiveSegModel.values(),
+                        label=f"Segment Anything models. {INTERACTIVE_SEG_MODEL_HELP}",
+                        value=init_config.interactive_seg_model,
+                    )
+                    interactive_seg_device = gr.Radio(
+                        Device.values(),
+                        label="Segment Anything Device",
+                        value=init_config.interactive_seg_device,
+                    )
+                with gr.Row():
+                    enable_remove_bg = gr.Checkbox(
+                        init_config.enable_remove_bg, label=REMOVE_BG_HELP
+                    )
+                    remove_bg_model = gr.Radio(
+                        RemoveBGModel.values(),
+                        label="Remove bg model",
+                        value=init_config.remove_bg_model,
+                    )
+                with gr.Row():
+                    enable_anime_seg = gr.Checkbox(
+                        init_config.enable_anime_seg, label=ANIMESEG_HELP
+                    )
+                with gr.Row():
+                    enable_realesrgan = gr.Checkbox(
+                        init_config.enable_realesrgan, label=REALESRGAN_HELP
+                    )
+                    realesrgan_device = gr.Radio(
+                        Device.values(),
+                        label="RealESRGAN Device",
+                        value=init_config.realesrgan_device,
+                    )
+                    realesrgan_model = gr.Radio(
+                        RealESRGANModel.values(),
+                        label="RealESRGAN model",
+                        value=init_config.realesrgan_model,
+                    )
+                with gr.Row():
+                    enable_gfpgan = gr.Checkbox(
+                        init_config.enable_gfpgan, label=GFPGAN_HELP
+                    )
+                    gfpgan_device = gr.Radio(
+                        Device.values(),
+                        label="GFPGAN Device",
+                        value=init_config.gfpgan_device,
+                    )
+                with gr.Row():
+                    enable_restoreformer = gr.Checkbox(
+                        init_config.enable_restoreformer, label=RESTOREFORMER_HELP
+                    )
+                    restoreformer_device = gr.Radio(
+                        Device.values(),
+                        label="RestoreFormer Device",
+                        value=init_config.restoreformer_device,
+                    )
+        downloaded_model.change(change_current_model, [downloaded_model], model)
+        recommend_model.change(change_current_model, [recommend_model], model)
+        save_btn.click(
+            save_config,
+            [
+                host,
+                port,
+                model,
+                model_dir,
+                no_half,
+                low_mem,
+                cpu_offload,
+                disable_nsfw_checker,
+                local_files_only,
+                cpu_textencoder,
+                device,
+                input,
+                output_dir,
+                quality,
+                enable_interactive_seg,
+                interactive_seg_model,
+                interactive_seg_device,
+                enable_remove_bg,
+                remove_bg_model,
+                enable_anime_seg,
+                enable_realesrgan,
+                realesrgan_device,
+                realesrgan_model,
+                enable_gfpgan,
+                gfpgan_device,
+                enable_restoreformer,
+                restoreformer_device,
+                inbrowser,
+            ],
+            message,
+        )
+    demo.launch(inbrowser=True, show_api=False)

pretrained-model/version.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 1

pretrained-model/version_diffusers_cache.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 1

utils/tools.py ADDED Viewed

	@@ -0,0 +1,505 @@

+import os
+import torch
+import yaml
+import numpy as np
+from PIL import Image
+import torch.nn.functional as F
+def pil_loader(path):
+    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
+    with open(path, 'rb') as f:
+        img = Image.open(f)
+        return img.convert('RGB')
+def default_loader(path):
+    """Load the image at ``path`` by delegating to pil_loader()."""
+    return pil_loader(path)
+def tensor_img_to_npimg(tensor_img):
+    """
+    Turn a tensor image with shape CxHxW to a numpy array image with shape HxWxC
+    :param tensor_img:
+    :return: a numpy array image with shape HxWxC
+    """
+    if not (torch.is_tensor(tensor_img) and tensor_img.ndimension() == 3):
+        raise NotImplementedError("Not supported tensor image. Only tensors with dimension CxHxW are supported.")
+    npimg = np.transpose(tensor_img.numpy(), (1, 2, 0))
+    npimg = npimg.squeeze()
+    assert isinstance(npimg, np.ndarray) and (npimg.ndim in {2, 3})
+    return npimg
+# Change the values of tensor x from range [0, 1] to [-1, 1]
+def normalize(x):
+    return x.mul_(2).add_(-1)
+def same_padding(images, ksizes, strides, rates):
+    assert len(images.size()) == 4
+    batch_size, channel, rows, cols = images.size()
+    out_rows = (rows + strides[0] - 1) // strides[0]
+    out_cols = (cols + strides[1] - 1) // strides[1]
+    effective_k_row = (ksizes[0] - 1) * rates[0] + 1
+    effective_k_col = (ksizes[1] - 1) * rates[1] + 1
+    padding_rows = max(0, (out_rows-1)*strides[0]+effective_k_row-rows)
+    padding_cols = max(0, (out_cols-1)*strides[1]+effective_k_col-cols)
+    # Pad the input
+    padding_top = int(padding_rows / 2.)
+    padding_left = int(padding_cols / 2.)
+    padding_bottom = padding_rows - padding_top
+    padding_right = padding_cols - padding_left
+    paddings = (padding_left, padding_right, padding_top, padding_bottom)
+    images = torch.nn.ZeroPad2d(paddings)(images)
+    return images
+def extract_image_patches(images, ksizes, strides, rates, padding='same'):
+    """
+    Extract patches from images and put them in the C output dimension.
+    :param padding:
+    :param images: [batch, channels, in_rows, in_cols]. A 4-D Tensor with shape
+    :param ksizes: [ksize_rows, ksize_cols]. The size of the sliding window for
+     each dimension of images
+    :param strides: [stride_rows, stride_cols]
+    :param rates: [dilation_rows, dilation_cols]
+    :return: A Tensor
+    """
+    assert len(images.size()) == 4
+    assert padding in ['same', 'valid']
+    batch_size, channel, height, width = images.size()
+    if padding == 'same':
+        images = same_padding(images, ksizes, strides, rates)
+    elif padding == 'valid':
+        pass
+    else:
+        raise NotImplementedError('Unsupported padding type: {}.\
+                Only "same" or "valid" are supported.'.format(padding))
+    unfold = torch.nn.Unfold(kernel_size=ksizes,
+                             dilation=rates,
+                             padding=0,
+                             stride=strides)
+    patches = unfold(images)
+    return patches  # [N, C*k*k, L], L is the total number of such blocks
+def random_bbox(config, batch_size):
+    """Generate a random tlhw with configuration.
+    Args:
+        config: Config should have configuration including img
+    Returns:
+        tuple: (top, left, height, width)
+    """
+    img_height, img_width, _ = config['image_shape']
+    h, w = config['mask_shape']
+    margin_height, margin_width = config['margin']
+    maxt = img_height - margin_height - h
+    maxl = img_width - margin_width - w
+    bbox_list = []
+    if config['mask_batch_same']:
+        t = np.random.randint(margin_height, maxt)
+        l = np.random.randint(margin_width, maxl)
+        bbox_list.append((t, l, h, w))
+        bbox_list = bbox_list * batch_size
+    else:
+        for i in range(batch_size):
+            t = np.random.randint(margin_height, maxt)
+            l = np.random.randint(margin_width, maxl)
+            bbox_list.append((t, l, h, w))
+    return torch.tensor(bbox_list, dtype=torch.int64)
+def test_random_bbox():
+    image_shape = [256, 256, 3]
+    mask_shape = [128, 128]
+    margin = [0, 0]
+    bbox = random_bbox(image_shape)
+    return bbox
+def bbox2mask(bboxes, height, width, max_delta_h, max_delta_w):
+    batch_size = bboxes.size(0)
+    mask = torch.zeros((batch_size, 1, height, width), dtype=torch.float32)
+    for i in range(batch_size):
+        bbox = bboxes[i]
+        delta_h = np.random.randint(max_delta_h // 2 + 1)
+        delta_w = np.random.randint(max_delta_w // 2 + 1)
+        mask[i, :, bbox[0] + delta_h:bbox[0] + bbox[2] - delta_h, bbox[1] + delta_w:bbox[1] + bbox[3] - delta_w] = 1.
+    return mask
+def test_bbox2mask():
+    image_shape = [256, 256, 3]
+    mask_shape = [128, 128]
+    margin = [0, 0]
+    max_delta_shape = [32, 32]
+    bbox = random_bbox(image_shape)
+    mask = bbox2mask(bbox, image_shape[0], image_shape[1], max_delta_shape[0], max_delta_shape[1])
+    return mask
+def local_patch(x, bbox_list):
+    assert len(x.size()) == 4
+    patches = []
+    for i, bbox in enumerate(bbox_list):
+        t, l, h, w = bbox
+        patches.append(x[i, :, t:t + h, l:l + w])
+    return torch.stack(patches, dim=0)
+def mask_image(x, bboxes, config):
+    height, width, _ = config['image_shape']
+    max_delta_h, max_delta_w = config['max_delta_shape']
+    mask = bbox2mask(bboxes, height, width, max_delta_h, max_delta_w)
+    if x.is_cuda:
+        mask = mask.cuda()
+    if config['mask_type'] == 'hole':
+        result = x * (1. - mask)
+    elif config['mask_type'] == 'mosaic':
+        # TODO: Matching the mosaic patch size and the mask size
+        mosaic_unit_size = config['mosaic_unit_size']
+        downsampled_image = F.interpolate(x, scale_factor=1. / mosaic_unit_size, mode='nearest')
+        upsampled_image = F.interpolate(downsampled_image, size=(height, width), mode='nearest')
+        result = upsampled_image * mask + x * (1. - mask)
+    else:
+        raise NotImplementedError('Not implemented mask type.')
+    return result, mask
+def spatial_discounting_mask(config):
+    """Generate spatial discounting mask constant.
+    Spatial discounting mask is first introduced in publication:
+        Generative Image Inpainting with Contextual Attention, Yu et al.
+    Args:
+        config: Config should have configuration including HEIGHT, WIDTH,
+            DISCOUNTED_MASK.
+    Returns:
+        tf.Tensor: spatial discounting mask
+    """
+    gamma = config['spatial_discounting_gamma']
+    height, width = config['mask_shape']
+    shape = [1, 1, height, width]
+    if config['discounted_mask']:
+        mask_values = np.ones((height, width))
+        for i in range(height):
+            for j in range(width):
+                mask_values[i, j] = max(
+                    gamma ** min(i, height - i),
+                    gamma ** min(j, width - j))
+        mask_values = np.expand_dims(mask_values, 0)
+        mask_values = np.expand_dims(mask_values, 0)
+    else:
+        mask_values = np.ones(shape)
+    spatial_discounting_mask_tensor = torch.tensor(mask_values, dtype=torch.float32)
+    if config['cuda']:
+        spatial_discounting_mask_tensor = spatial_discounting_mask_tensor.cuda()
+    return spatial_discounting_mask_tensor
+def reduce_mean(x, axis=None, keepdim=False):
+    if not axis:
+        axis = range(len(x.shape))
+    for i in sorted(axis, reverse=True):
+        x = torch.mean(x, dim=i, keepdim=keepdim)
+    return x
+def reduce_std(x, axis=None, keepdim=False):
+    if not axis:
+        axis = range(len(x.shape))
+    for i in sorted(axis, reverse=True):
+        x = torch.std(x, dim=i, keepdim=keepdim)
+    return x
+def reduce_sum(x, axis=None, keepdim=False):
+    if not axis:
+        axis = range(len(x.shape))
+    for i in sorted(axis, reverse=True):
+        x = torch.sum(x, dim=i, keepdim=keepdim)
+    return x
+def flow_to_image(flow):
+    """Transfer flow map to image.
+    Part of code forked from flownet.
+    For every item along axis 0, channel 0 of the last axis is read as u and
+    channel 1 as v; both are divided by the running maximum radius and
+    colour-coded with compute_color(). The results are returned as one
+    float32 array.
+    NOTE(review): u and v are slices of ``flow``; if ``flow`` is a numpy array
+    they are views, so zeroing the out-of-range entries also changes the
+    caller's array - confirm this is intended.
+    """
+    out = []
+    # maxu/minu/maxv/minv are tracked but not used in the result.
+    maxu = -999.
+    maxv = -999.
+    minu = 999.
+    minv = 999.
+    # Running maximum radius; it carries over from one batch item to the next.
+    maxrad = -1
+    for i in range(flow.shape[0]):
+        u = flow[i, :, :, 0]
+        v = flow[i, :, :, 1]
+        # Entries with a magnitude above 1e7 are treated as unknown and zeroed.
+        idxunknow = (abs(u) > 1e7) | (abs(v) > 1e7)
+        u[idxunknow] = 0
+        v[idxunknow] = 0
+        maxu = max(maxu, np.max(u))
+        minu = min(minu, np.min(u))
+        maxv = max(maxv, np.max(v))
+        minv = min(minv, np.min(v))
+        rad = np.sqrt(u ** 2 + v ** 2)
+        maxrad = max(maxrad, np.max(rad))
+        # eps avoids a division by zero when the flow is zero everywhere.
+        u = u / (maxrad + np.finfo(float).eps)
+        v = v / (maxrad + np.finfo(float).eps)
+        img = compute_color(u, v)
+        out.append(img)
+    return np.float32(np.uint8(out))
+def pt_flow_to_image(flow):
+    """Transfer flow map to image.
+    Part of code forked from flownet.
+    """
+    out = []
+    maxu = torch.tensor(-999)
+    maxv = torch.tensor(-999)
+    minu = torch.tensor(999)
+    minv = torch.tensor(999)
+    maxrad = torch.tensor(-1)
+    if torch.cuda.is_available():
+        maxu = maxu.cuda()
+        maxv = maxv.cuda()
+        minu = minu.cuda()
+        minv = minv.cuda()
+        maxrad = maxrad.cuda()
+    for i in range(flow.shape[0]):
+        u = flow[i, 0, :, :]
+        v = flow[i, 1, :, :]
+        idxunknow = (torch.abs(u) > 1e7) + (torch.abs(v) > 1e7)
+        u[idxunknow] = 0
+        v[idxunknow] = 0
+        maxu = torch.max(maxu, torch.max(u))
+        minu = torch.min(minu, torch.min(u))
+        maxv = torch.max(maxv, torch.max(v))
+        minv = torch.min(minv, torch.min(v))
+        rad = torch.sqrt((u ** 2 + v ** 2).float()).to(torch.int64)
+        maxrad = torch.max(maxrad, torch.max(rad))
+        u = u / (maxrad + torch.finfo(torch.float32).eps)
+        v = v / (maxrad + torch.finfo(torch.float32).eps)
+        # TODO: change the following to pytorch
+        img = pt_compute_color(u, v)
+        out.append(img)
+    return torch.stack(out, dim=0)
+def highlight_flow(flow):
+    """Convert flow into middlebury color code image.
+    """
+    out = []
+    s = flow.shape
+    for i in range(flow.shape[0]):
+        img = np.ones((s[1], s[2], 3)) * 144.
+        u = flow[i, :, :, 0]
+        v = flow[i, :, :, 1]
+        for h in range(s[1]):
+            for w in range(s[1]):
+                ui = u[h, w]
+                vi = v[h, w]
+                img[ui, vi, :] = 255.
+        out.append(img)
+    return np.float32(np.uint8(out))
+def pt_highlight_flow(flow):
+    """Convert flow into middlebury color code image.
+        """
+    out = []
+    s = flow.shape
+    for i in range(flow.shape[0]):
+        img = np.ones((s[1], s[2], 3)) * 144.
+        u = flow[i, :, :, 0]
+        v = flow[i, :, :, 1]
+        for h in range(s[1]):
+            for w in range(s[1]):
+                ui = u[h, w]
+                vi = v[h, w]
+                img[ui, vi, :] = 255.
+        out.append(img)
+    return np.float32(np.uint8(out))
+def compute_color(u, v):
+    """Colour-code a flow field given as two 2-D arrays ``u`` and ``v``.
+    The angle of each (u, v) vector selects a position on the colour wheel
+    from make_color_wheel() (linearly interpolated between neighbouring
+    entries) and the radius adjusts the colour. Returns an (h, w, 3) array
+    whose values are whole numbers in 0..255. NaN entries of ``u``/``v`` are
+    replaced by 0 in place and produce black pixels.
+    """
+    h, w = u.shape
+    img = np.zeros([h, w, 3])
+    nanIdx = np.isnan(u) | np.isnan(v)
+    u[nanIdx] = 0
+    v[nanIdx] = 0
+    # colorwheel = COLORWHEEL
+    colorwheel = make_color_wheel()
+    ncols = np.size(colorwheel, 0)
+    rad = np.sqrt(u ** 2 + v ** 2)
+    # Angle normalised to [-1, 1].
+    a = np.arctan2(-v, -u) / np.pi
+    # Fractional 1-based position on the wheel, in [1, ncols].
+    fk = (a + 1) / 2 * (ncols - 1) + 1
+    k0 = np.floor(fk).astype(int)
+    k1 = k0 + 1
+    # Wrap the upper neighbour back to the first wheel entry.
+    k1[k1 == ncols + 1] = 1
+    # Interpolation weight between the two neighbouring wheel entries.
+    f = fk - k0
+    for i in range(np.size(colorwheel, 1)):
+        tmp = colorwheel[:, i]
+        col0 = tmp[k0 - 1] / 255
+        col1 = tmp[k1 - 1] / 255
+        col = (1 - f) * col0 + f * col1
+        idx = rad <= 1
+        # Radius <= 1: blend towards white as the radius shrinks.
+        col[idx] = 1 - rad[idx] * (1 - col[idx])
+        notidx = np.logical_not(idx)
+        # Radius > 1: darken the colour.
+        col[notidx] *= 0.75
+        img[:, :, i] = np.uint8(np.floor(255 * col * (1 - nanIdx)))
+    return img
+def pt_compute_color(u, v):
+    h, w = u.shape
+    img = torch.zeros([3, h, w])
+    if torch.cuda.is_available():
+        img = img.cuda()
+    nanIdx = (torch.isnan(u) + torch.isnan(v)) != 0
+    u[nanIdx] = 0.
+    v[nanIdx] = 0.
+    # colorwheel = COLORWHEEL
+    colorwheel = pt_make_color_wheel()
+    if torch.cuda.is_available():
+        colorwheel = colorwheel.cuda()
+    ncols = colorwheel.size()[0]
+    rad = torch.sqrt((u ** 2 + v ** 2).to(torch.float32))
+    a = torch.atan2(-v.to(torch.float32), -u.to(torch.float32)) / np.pi
+    fk = (a + 1) / 2 * (ncols - 1) + 1
+    k0 = torch.floor(fk).to(torch.int64)
+    k1 = k0 + 1
+    k1[k1 == ncols + 1] = 1
+    f = fk - k0.to(torch.float32)
+    for i in range(colorwheel.size()[1]):
+        tmp = colorwheel[:, i]
+        col0 = tmp[k0 - 1]
+        col1 = tmp[k1 - 1]
+        col = (1 - f) * col0 + f * col1
+        idx = rad <= 1. / 255.
+        col[idx] = 1 - rad[idx] * (1 - col[idx])
+        notidx = (idx != 0)
+        col[notidx] *= 0.75
+        img[i, :, :] = col * (1 - nanIdx).to(torch.float32)
+    return img
+def make_color_wheel():
+    """Build the flow color wheel as a NumPy array.
+
+    Returns:
+        Float array of shape ``(55, 3)`` with RGB values in ``[0, 255]``.
+        The rows run through six consecutive segments (RY, YG, GC, CB, BM,
+        MR); in each one a channel is held at 255 while another ramps.
+    """
+    # (segment length, channel held at 255, ramping channel, ramp rises?)
+    segments = (
+        (15, 0, 1, True),    # RY
+        (6, 1, 0, False),    # YG
+        (4, 1, 2, True),     # GC
+        (11, 2, 1, False),   # CB
+        (13, 2, 0, True),    # BM
+        (6, 0, 2, False),    # MR
+    )
+    total = sum(length for length, _, _, _ in segments)
+    wheel = np.zeros([total, 3])
+    start = 0
+    for length, held, ramped, rising in segments:
+        stop = start + length
+        ramp = np.floor(255 * np.arange(0, length) / length)
+        wheel[start:stop, held] = 255
+        wheel[start:stop, ramped] = ramp if rising else 255 - ramp
+        start = stop
+    return wheel
+def pt_make_color_wheel():
+    """Build the flow color wheel as a torch tensor.
+
+    Returns:
+        Float32 tensor of shape ``(55, 3)`` with RGB values in ``[0, 1]``.
+        The rows run through six consecutive segments (RY, YG, GC, CB, BM,
+        MR); in each one a channel is held at 1 while another ramps.
+    """
+    # (segment length, channel held at 1, ramping channel, ramp rises?)
+    segments = (
+        (15, 0, 1, True),    # RY
+        (6, 1, 0, False),    # YG
+        (4, 1, 2, True),     # GC
+        (11, 2, 1, False),   # CB
+        (13, 2, 0, True),    # BM
+        (6, 0, 2, False),    # MR
+    )
+    total = sum(length for length, _, _, _ in segments)
+    wheel = torch.zeros([total, 3])
+    start = 0
+    for length, held, ramped, rising in segments:
+        stop = start + length
+        ramp = torch.arange(0, length, dtype=torch.float32) / length
+        wheel[start:stop, held] = 1.
+        wheel[start:stop, ramped] = ramp if rising else 1. - ramp
+        start = stop
+    return wheel
+def is_image_file(filename):
+    """Return True when ``filename`` ends with a known image extension.
+
+    The comparison ignores case.
+    """
+    known_suffixes = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif')
+    return filename.lower().endswith(known_suffixes)
+def deprocess(img):
+    """Apply ``(img + 1) / 2`` to ``img`` in place and return the result.
+
+    The in-place ``add_`` / ``div_`` calls modify the caller's tensor.
+    """
+    shifted = img.add_(1)
+    halved = shifted.div_(2)
+    return halved
+# get configs
+def get_config(config):
+    """Load the YAML file at path ``config`` and return the parsed object.
+
+    NOTE(security): ``yaml.Loader`` can construct arbitrary Python objects,
+    so only point this at configuration files you trust.
+    """
+    with open(config, 'r') as config_file:
+        parsed = yaml.load(config_file, Loader=yaml.Loader)
+    return parsed
+# Get model list for resume
+def get_model_list(dirname, key, iteration=0):
+    """Find a saved model file inside ``dirname``.
+
+    Candidates are regular files whose name contains both ``key`` and
+    ``".pt"``; they are ordered by sorting their paths as strings.
+
+    Args:
+        dirname: Directory to search.
+        key: Substring the file name must contain.
+        iteration: ``0`` selects the last candidate in sorted order; any
+            other value selects the first candidate whose file name contains
+            the iteration zero-padded to 8 digits.
+
+    Returns:
+        Path of the selected file, or ``None`` when ``dirname`` is not an
+        existing directory or holds no candidate.
+
+    Raises:
+        ValueError: If candidates exist but none matches ``iteration``.
+    """
+    # isdir also rejects a path that exists but is a regular file, which
+    # would otherwise make os.listdir raise.
+    if not os.path.isdir(dirname):
+        return None
+    gen_models = sorted(
+        os.path.join(dirname, f)
+        for f in os.listdir(dirname)
+        if key in f and ".pt" in f and os.path.isfile(os.path.join(dirname, f))
+    )
+    # The candidate list is never None; test for emptiness so a directory
+    # without candidates yields None instead of an IndexError below.
+    if not gen_models:
+        return None
+    if iteration == 0:
+        return gen_models[-1]
+    tag = '{:0>8d}'.format(iteration)
+    for model_name in gen_models:
+        # Compare against the file name only, so digits that happen to be
+        # part of dirname cannot produce a false match.
+        if tag in os.path.basename(model_name):
+            return model_name
+    raise ValueError('Not found models with this iteration')
+if __name__ == '__main__':
+    # Manual smoke check, executed only when this module is run as a script:
+    # call the bbox test helpers, print the mask's shape, then display it.
+    test_random_bbox()
+    mask = test_bbox2mask()
+    print(mask.shape)
+    # Imported here, inside the script guard, rather than at module level.
+    import matplotlib.pyplot as plt
+    plt.imshow(mask, cmap='gray')
+    plt.show()