AnyV2V

Paused

File size: 2,898 Bytes


from huggingface_hub import hf_hub_download
import torch
import PIL

class CosXLEdit():
    """
    Edit Cos Stable Diffusion XL 1.0 Base is tuned to use a Cosine-Continuous EDM VPred schedule, and then upgraded to perform instructed image editing.
    Reference: https://huggingface.co/stabilityai/cosxl
    """
    def __init__(self, device="cuda"):
        """
        Attributes:
            pipe (CosStableDiffusionXLInstructPix2PixPipeline): The InstructPix2Pix pipeline for image transformation.

        Args:
            device (str, optional): Device on which the pipeline runs. Defaults to "cuda".
        """
        from diffusers import EDMEulerScheduler
        from .cosxl.custom_pipeline import CosStableDiffusionXLInstructPix2PixPipeline
        from .cosxl.utils import set_timesteps_patched

        EDMEulerScheduler.set_timesteps = set_timesteps_patched

        try:
            edit_file = hf_hub_download(repo_id="TIGER-Lab/cosxl", filename="cosxl_edit.safetensors")
            self.pipe = CosStableDiffusionXLInstructPix2PixPipeline.from_single_file(
                edit_file, num_in_channels=8
            )
        except:
            edit_file_path = "./black_box_image_edit/cosxl/cosxl_edit.safetensors"
            self.pipe = CosStableDiffusionXLInstructPix2PixPipeline.from_single_file(
                edit_file_path, num_in_channels=8
            )
        self.pipe.scheduler = EDMEulerScheduler(sigma_min=0.002, sigma_max=120.0, sigma_data=1.0, prediction_type="v_prediction")
        self.pipe.to(device)
        self.pipe.enable_vae_tiling()
        self.pipe.enable_model_cpu_offload()

    def infer_one_image(self, src_image: PIL.Image.Image = None, src_prompt: str = None, target_prompt: str = None, instruct_prompt: str = None, seed: int = 42, negative_prompt=""):
        """
        Modifies the source image based on the provided instruction prompt.

        Args:
            src_image (PIL.Image.Image): Source image in RGB format.
            instruct_prompt (str): Caption for editing the image.
            seed (int, optional): Seed for random generator. Defaults to 42.

        Returns:
            PIL.Image.Image: The transformed image.
        """
        src_image = src_image.convert('RGB') # force it to RGB format
        generator = torch.manual_seed(seed)

        resolution = 1024
        preprocessed_image = src_image.resize((resolution, resolution))
        image = self.pipe(prompt=instruct_prompt,
                        image=preprocessed_image,
                        height=resolution,
                        width=resolution,
                        negative_prompt=negative_prompt, 
                        guidance_scale=7,
                        num_inference_steps=20,
                        generator=generator).images[0]
        image = image.resize((src_image.width, src_image.height))

        return image