Spaces: Running on Zero

Initial commit
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
- .gitattributes +3 -0
- README.md +11 -7
- __init__.py +0 -0
- config.yaml +316 -0
- dataloader/dataset_factory.py +13 -0
- dataloader/single_image_dataset.py +16 -0
- dataloader/video_data_module.py +32 -0
- diffusion_trainer/abstract_trainer.py +108 -0
- diffusion_trainer/streaming_svd.py +508 -0
- gradio_demo.py +214 -0
- i2v_enhance/i2v_enhance_interface.py +128 -0
- i2v_enhance/pipeline_i2vgen_xl.py +988 -0
- i2v_enhance/thirdparty/VFI/Trainer.py +168 -0
- i2v_enhance/thirdparty/VFI/ckpt/Put ours.pkl files here.txt +1 -0
- i2v_enhance/thirdparty/VFI/ckpt/__init__.py +0 -0
- i2v_enhance/thirdparty/VFI/config.py +49 -0
- i2v_enhance/thirdparty/VFI/dataset.py +93 -0
- i2v_enhance/thirdparty/VFI/model/__init__.py +5 -0
- i2v_enhance/thirdparty/VFI/model/feature_extractor.py +516 -0
- i2v_enhance/thirdparty/VFI/model/flow_estimation.py +141 -0
- i2v_enhance/thirdparty/VFI/model/loss.py +95 -0
- i2v_enhance/thirdparty/VFI/model/refine.py +71 -0
- i2v_enhance/thirdparty/VFI/model/warplayer.py +21 -0
- i2v_enhance/thirdparty/VFI/train.py +105 -0
- lib/__init__.py +0 -0
- lib/farancia/__init__.py +4 -0
- lib/farancia/animation.py +43 -0
- lib/farancia/config.py +1 -0
- lib/farancia/libimage/__init__.py +45 -0
- lib/farancia/libimage/iimage.py +511 -0
- lib/farancia/libimage/utils.py +8 -0
- models/cam/conditioning.py +150 -0
- models/control/controlnet.py +581 -0
- models/diffusion/discretizer.py +33 -0
- models/diffusion/video_model.py +574 -0
- models/diffusion/wrappers.py +78 -0
- models/svd/sgm/__init__.py +4 -0
- models/svd/sgm/data/__init__.py +1 -0
- models/svd/sgm/data/cifar10.py +67 -0
- models/svd/sgm/data/dataset.py +80 -0
- models/svd/sgm/data/mnist.py +85 -0
- models/svd/sgm/inference/api.py +385 -0
- models/svd/sgm/inference/helpers.py +305 -0
- models/svd/sgm/lr_scheduler.py +135 -0
- models/svd/sgm/models/__init__.py +2 -0
- models/svd/sgm/models/autoencoder.py +615 -0
- models/svd/sgm/models/diffusion.py +341 -0
- models/svd/sgm/modules/__init__.py +6 -0
- models/svd/sgm/modules/attention.py +809 -0
- models/svd/sgm/modules/autoencoding/__init__.py +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,17 @@
 ---
 title: StreamingSVD
-emoji:
-colorFrom:
-colorTo:
+emoji: 🎥
+colorFrom: yellow
+colorTo: green
 sdk: gradio
 sdk_version: 4.43.0
+suggested_hardware: a100-large
+suggested_storage: large
 app_file: app.py
-pinned: false
 license: mit
-
-
-
+tags:
+- StreamingSVD
+- long-video-generation
+- PAIR
+short_description: Image-to-Video
+disable_embedding: false
__init__.py
ADDED
File without changes
config.yaml
ADDED
@@ -0,0 +1,316 @@
# pytorch_lightning==2.2.2
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '1'
  num_nodes: 1
  precision: 16-mixed
  logger: False
model:
  class_path: diffusion_trainer.streaming_svd.StreamingSVD
  init_args:
    vfi:
      class_path: modules.params.vfi.VFIParams
      init_args:
        ckpt_path_local: checkpoint/VFI/ours.pkl
        ckpt_path_global: https://drive.google.com/file/d/1XCNoyhA1RX3m8W-XJK8H8inH47l36kxP/view?usp=sharing
    i2v_enhance:
      class_path: modules.params.i2v_enhance.I2VEnhanceParams
      init_args:
        ckpt_path_local: checkpoint/i2v_enhance/
        ckpt_path_global: ali-vilab/i2vgen-xl
    module_loader:
      class_path: modules.loader.module_loader.GenericModuleLoader
      init_args:
        pipeline_repo: stabilityai/stable-video-diffusion-img2vid-xt
        pipeline_obj: streamingt2v_pipeline
        set_prediction_type: ''
        module_names:
        - network_config
        - model
        - controlnet
        - denoiser
        - conditioner
        - first_stage_model
        - sampler
        - svd_pipeline
        module_config:
          controlnet:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.control.controlnet.ControlNet
              cls_func: from_unet
              cls_func_fast_dev_run: ''
              kwargs_diffusers: null
              model_params:
                merging_mode: addition
                zero_conv_mode: Identity
                frame_expansion: none
                downsample_controlnet_cond: true
                use_image_encoder_normalization: true
                use_controlnet_mask: false
                condition_encoder: ''
                conditioning_embedding_out_channels:
                - 32
                - 96
                - 256
                - 512
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                model: model
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          network_config:
            class_path: models.diffusion.video_model.VideoUNet
            init_args:
              in_channels: 8
              model_channels: 320
              out_channels: 4
              num_res_blocks: 2
              num_conditional_frames: null
              attention_resolutions:
              - 4
              - 2
              - 1
              dropout: 0.0
              channel_mult:
              - 1
              - 2
              - 4
              - 4
              conv_resample: true
              dims: 2
              num_classes: sequential
              use_checkpoint: False
              num_heads: -1
              num_head_channels: 64
              num_heads_upsample: -1
              use_scale_shift_norm: false
              resblock_updown: false
              transformer_depth: 1
              transformer_depth_middle: null
              context_dim: 1024
              time_downup: false
              time_context_dim: null
              extra_ff_mix_layer: true
              use_spatial_context: true
              merge_strategy: learned_with_images
              merge_factor: 0.5
              spatial_transformer_attn_type: softmax-xformers
              video_kernel_size:
              - 3
              - 1
              - 1
              use_linear_in_transformer: true
              adm_in_channels: 768
              disable_temporal_crossattention: false
              max_ddpm_temb_period: 10000
              merging_mode: attention_cross_attention
              controlnet_mode: true
              use_apm: false
          model:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.svd.sgm.modules.diffusionmodules.wrappers.OpenAIWrapper
              cls_func: ''
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                compile_model: false
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                diffusion_model: network_config
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          denoiser:
            class_path: models.svd.sgm.modules.diffusionmodules.denoiser.Denoiser
            init_args:
              scaling_config:
                target: models.svd.sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
          sampler:
            class_path: models.svd.sgm.modules.diffusionmodules.sampling.EulerEDMSampler
            init_args:
              s_churn: 0.0
              s_tmin: 0.0
              s_tmax: .inf
              s_noise: 1.0
              discretization_config:
                target: models.diffusion.discretizer.AlignYourSteps
                params:
                  sigma_max: 700.0
                  num_steps: 30
              guider_config:
                target: models.svd.sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
                params:
                  max_scale: 3.0
                  min_scale: 1.5
                  num_frames: 25
              verbose: false
              device: cuda
          conditioner:
            class_path: models.svd.sgm.modules.GeneralConditioner
            init_args:
              emb_models:
              - is_trainable: false
                input_key: cond_frames_without_noise
                target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
                params:
                  n_cond_frames: 1
                  n_copies: 1
                  open_clip_embedding_config:
                    target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                    params:
                      freeze: true
              - input_key: fps_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: motion_bucket_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: cond_frames
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
                params:
                  disable_encoder_autocast: true
                  n_cond_frames: 1
                  n_copies: 1
                  is_ae: true
                  encoder_config:
                    target: models.svd.sgm.models.autoencoder.AutoencoderKLModeOnly
                    params:
                      embed_dim: 4
                      monitor: val/rec_loss
                      ddconfig:
                        attn_type: vanilla-xformers
                        double_z: true
                        z_channels: 4
                        resolution: 256
                        in_channels: 3
                        out_ch: 3
                        ch: 128
                        ch_mult:
                        - 1
                        - 2
                        - 4
                        - 4
                        num_res_blocks: 2
                        attn_resolutions: []
                        dropout: 0.0
                      lossconfig:
                        target: torch.nn.Identity
              - input_key: cond_aug
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
          first_stage_model:
            class_path: models.svd.sgm.AutoencodingEngine
            init_args:
              encoder_config:
                target: models.svd.sgm.modules.diffusionmodules.model.Encoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
              decoder_config:
                target: models.svd.sgm.modules.autoencoding.temporal_ae.VideoDecoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                  video_kernel_size:
                  - 3
                  - 1
                  - 1
              loss_config:
                target: torch.nn.Identity
              regularizer_config:
                target: models.svd.sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
              optimizer_config: null
              lr_g_factor: 1.0
              trainable_ae_params: null
              ae_optimizer_args: null
              trainable_disc_params: null
              disc_optimizer_args: null
              disc_start_iter: 0
              diff_boost_factor: 3.0
              ckpt_engine: null
              ckpt_path: null
              additional_decode_keys: null
              ema_decay: null
              monitor: null
              input_key: jpg
          svd_pipeline:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: diffusers.StableVideoDiffusionPipeline
              cls_func: from_pretrained
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                torch_dtype: torch.float16
                variant: fp16
                use_safetensors: true
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args:
              - stabilityai/stable-video-diffusion-img2vid-xt
              dependent_modules: null
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
        root_cls: null
    diff_trainer_params:
      class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.DiffusionTrainerParams
      init_args:
        scale_factor: 0.18215
        streamingsvd_ckpt:
          class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.CheckpointDescriptor
          init_args:
            ckpt_path_local: checkpoint/StreamingSVD/model.safetensors
            ckpt_path_global: PAIR/StreamingSVD/resolve/main/model.safetensors
        disable_first_stage_autocast: true
    inference_params:
      class_path: modules.params.diffusion.inference_params.T2VInferenceParams
      init_args:
        n_autoregressive_generations: 2 # Number of autoregression for StreamingSVD
        num_conditional_frames: 7 # is this used?
        anchor_frames: '6' # Take the (Number+1)th frame as CLIP encoding for StreamingSVD
        reset_seed_per_generation: true # If true, the seed is reset on every generation
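The config above follows the PyTorch Lightning CLI layout (`class_path`/`init_args` pairs, plus `trainer` and `seed_everything` sections). The repository's actual entry point is `app.py` (listed in the README but not shown in this 50-file view); the snippet below is only a hedged sketch of how such a config is typically consumed with `LightningCLI`, using the model and data-module classes added in this commit.

# Minimal sketch (assumption, not the repo's app.py): drive prediction from a
# class_path/init_args config such as config.yaml via PyTorch Lightning's CLI.
from pytorch_lightning.cli import LightningCLI

from dataloader.video_data_module import VideoDataModule
from diffusion_trainer.streaming_svd import StreamingSVD


def main():
    # Invoked as: python this_script.py predict --config config.yaml
    # LightningCLI parses the trainer/model sections, instantiates StreamingSVD
    # from its nested init_args and calls trainer.predict(model, datamodule).
    LightningCLI(
        model_class=StreamingSVD,
        datamodule_class=VideoDataModule,
        subclass_mode_model=True,  # accept the class_path/init_args form used in config.yaml
    )


if __name__ == "__main__":
    main()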
dataloader/dataset_factory.py
ADDED
@@ -0,0 +1,13 @@
from pathlib import Path
from torch.utils.data import Dataset

from dataloader.single_image_dataset import SingleImageDataset


class SingleImageDatasetFactory():

    def __init__(self, file: Path):
        self.data_path = file

    def get_dataset(self, max_samples: int = None) -> Dataset:
        return SingleImageDataset(file=self.data_path)
dataloader/single_image_dataset.py
ADDED
@@ -0,0 +1,16 @@
import torch
import numpy as np
from torch.utils.data import Dataset


class SingleImageDataset(Dataset):

    def __init__(self, file: np.ndarray):
        super().__init__()
        self.images = [file]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        return {"image": self.images[index], "sample_id": torch.tensor(index, dtype=torch.int64)}
dataloader/video_data_module.py
ADDED
@@ -0,0 +1,32 @@
import pytorch_lightning as pl
import torch
from pytorch_lightning.utilities.types import (EVAL_DATALOADERS)
from dataloader.dataset_factory import SingleImageDatasetFactory


class VideoDataModule(pl.LightningDataModule):

    def __init__(self,
                 workers: int,
                 predict_dataset_factory: SingleImageDatasetFactory = None,
                 ) -> None:
        super().__init__()
        self.num_workers = workers

        self.video_data_module = {}
        # TODO read size from loaded unet via unet.sample_sizes
        self.predict_dataset_factory = predict_dataset_factory

    def setup(self, stage: str) -> None:
        if stage == "predict":
            self.video_data_module["predict"] = self.predict_dataset_factory.get_dataset()

    def predict_dataloader(self) -> EVAL_DATALOADERS:
        return torch.utils.data.DataLoader(self.video_data_module["predict"],
                                           batch_size=1,
                                           pin_memory=True,
                                           num_workers=self.num_workers,
                                           collate_fn=None,
                                           shuffle=False,
                                           drop_last=False)
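Taken together, the three dataloader files wrap a single start image into a one-element predict dataloader. A minimal usage sketch follows (not part of the repository; note that `SingleImageDatasetFactory` annotates `file` as a `Path`, while in practice the object stored and returned is whatever the caller passes in, here a NumPy array).

# Usage sketch (assumption): wire the factory into the data module and pull one batch.
import numpy as np

from dataloader.dataset_factory import SingleImageDatasetFactory
from dataloader.video_data_module import VideoDataModule

image = np.zeros((576, 1024, 3), dtype=np.uint8)  # placeholder H x W x C start frame

data_module = VideoDataModule(
    workers=0,
    predict_dataset_factory=SingleImageDatasetFactory(file=image),
)
data_module.setup(stage="predict")

# One batch only: the start image (collated to a tensor) and its sample id.
batch = next(iter(data_module.predict_dataloader()))
print(batch["image"].shape, batch["sample_id"])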
diffusion_trainer/abstract_trainer.py
ADDED
@@ -0,0 +1,108 @@
import os

import pytorch_lightning as pl
import torch

from typing import Any

from modules.params.diffusion.inference_params import InferenceParams
from modules.loader.module_loader import GenericModuleLoader
from modules.params.diffusion_trainer.params_streaming_diff_trainer import DiffusionTrainerParams


class AbstractTrainer(pl.LightningModule):

    def __init__(self,
                 inference_params: Any,
                 diff_trainer_params: DiffusionTrainerParams,
                 module_loader: GenericModuleLoader,
                 ):

        super().__init__()

        self.inference_params = inference_params
        self.diff_trainer_params = diff_trainer_params
        self.module_loader = module_loader

        self.on_start_once_called = False
        self._setup_methods = []

        module_loader(
            trainer=self,
            diff_trainer_params=diff_trainer_params)

    # ------ IMPLEMENTATION HOOKS -------

    def post_init(self, batch):
        '''
        Called after the LightningDataModule and LightningModule are created, but before any training/validation/prediction.
        First possible access to the 'trainer' object (e.g. to get 'device').
        '''

    def generate_output(self, batch, batch_idx, inference_params: InferenceParams):
        '''
        Called during validation to generate an output for each batch.
        Returns meta information about the produced result (where the results were stored).
        This is used for the metric evaluation.
        '''

    # ------- HELPER FUNCTIONS -------

    def _reset_random_generator(self):
        '''
        Reset the random generator to the same seed across all workers. The generator is used only for inference.
        '''
        if not hasattr(self, "random_generator"):
            self.random_generator = torch.Generator(device=self.device)
            # set seed according to 'seed_everything' in config
            seed = int(os.environ.get("PL_GLOBAL_SEED", 42))
        else:
            seed = self.random_generator.initial_seed()
        self.random_generator.manual_seed(seed)

    # ----- PREDICT HOOKS ------

    def on_predict_start(self):
        self.on_start()

    def predict_step(self, batch, batch_idx):
        self.on_inference_step(batch=batch, batch_idx=batch_idx)

    def on_predict_epoch_start(self):
        self.on_inference_epoch_start()

    # ----- CUSTOM HOOKS -----

    # Global Hooks (Called by Training, Validation and Prediction)

    def _on_start_once(self):
        '''
        Called only once by on_start, i.e. on the first call of train, validation or prediction.
        '''
        if self.on_start_once_called:
            return
        else:
            self.on_start_once_called = True
        self.post_init()

    def on_start(self):
        '''
        Called at the beginning of training, validation and prediction.
        '''
        self._on_start_once()

    # ----- Inference Hooks (called by 'validation' and 'predict') ------

    def on_inference_epoch_start(self):
        # reset seed at every inference
        self._reset_random_generator()

    def on_inference_step(self, batch, batch_idx):
        if self.inference_params.reset_seed_per_generation:
            self._reset_random_generator()
        self.generate_output(
            batch=batch, inference_params=self.inference_params, batch_idx=batch_idx)
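`AbstractTrainer` funnels Lightning's predict hooks into a small custom hook set: `on_predict_start` triggers the one-time `post_init`, `on_predict_epoch_start` reseeds the inference generator, and `predict_step` forwards each batch to `generate_output`. The toy subclass below is illustrative only (it is not in the repository); it bypasses `GenericModuleLoader` with a no-op callable and uses a hypothetical `_Params` object that carries only the attribute `on_inference_step` actually reads.

# Illustrative toy subclass (assumption): makes the hook chain visible.
import torch

from diffusion_trainer.abstract_trainer import AbstractTrainer


class EchoTrainer(AbstractTrainer):
    def post_init(self):
        # first place where self.device is guaranteed to be meaningful
        print("post_init on", self.device)

    def generate_output(self, batch, batch_idx, inference_params):
        # draw noise from the reseeded generator so repeated predict runs are reproducible
        noise = torch.randn(4, generator=self.random_generator, device=self.device)
        print(f"batch {batch_idx}: noise sum {noise.sum().item():.4f}")


class _Params:
    # hypothetical stand-in for T2VInferenceParams; only this flag is read by the hooks
    reset_seed_per_generation = True


trainer_module = EchoTrainer(
    inference_params=_Params(),
    diff_trainer_params=None,               # not used by the toy subclass
    module_loader=lambda **kwargs: None,    # no-op stand-in for GenericModuleLoader
)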
diffusion_trainer/streaming_svd.py
ADDED
@@ -0,0 +1,508 @@
from modules.loader.module_loader import GenericModuleLoader
from modules.params.diffusion_trainer.params_streaming_diff_trainer import DiffusionTrainerParams
import torch
from modules.params.diffusion.inference_params import InferenceParams
from utils import result_processor
from tqdm import tqdm
from PIL import Image, ImageFilter
from utils.inference_utils import resize_and_crop, get_padding_for_aspect_ratio
import numpy as np
from safetensors.torch import load_file as load_safetensors
import math
from einops import repeat, rearrange
from torchvision.transforms import ToTensor
from models.svd.sgm.modules.autoencoding.temporal_ae import VideoDecoder
import PIL
from modules.params.vfi import VFIParams
from modules.params.i2v_enhance import I2VEnhanceParams
from typing import List, Union
from models.diffusion.wrappers import StreamingWrapper
from diffusion_trainer.abstract_trainer import AbstractTrainer
from utils.loader import download_ckpt
import torchvision.transforms.functional as TF
from diffusers import AutoPipelineForInpainting, DEISMultistepScheduler
from transformers import BlipProcessor, BlipForConditionalGeneration


class StreamingSVD(AbstractTrainer):
    def __init__(self,
                 module_loader: GenericModuleLoader,
                 diff_trainer_params: DiffusionTrainerParams,
                 inference_params: InferenceParams,
                 vfi: VFIParams,
                 i2v_enhance: I2VEnhanceParams,
                 ):
        super().__init__(inference_params=inference_params,
                         diff_trainer_params=diff_trainer_params,
                         module_loader=module_loader,
                         )

        # network_config is wrapped by OpenAIWrapper, so we don't need a direct reference anymore.
        # This corresponds to the config yaml defined at model.module_loader.module_config.model.dependent_modules.
        del self.network_config
        self.diff_trainer_params: DiffusionTrainerParams
        self.vfi = vfi
        self.i2v_enhance = i2v_enhance

    def on_inference_epoch_start(self):
        super().on_inference_epoch_start()

        # For StreamingSVD we use a model wrapper that combines the base SVD model and the control model.
        self.inference_model = StreamingWrapper(
            diffusion_model=self.model.diffusion_model,
            controlnet=self.controlnet,
            num_frame_conditioning=self.inference_params.num_conditional_frames
        )

    def post_init(self):
        self.svd_pipeline.set_progress_bar_config(disable=True)
        if self.device.type != "cpu":
            self.svd_pipeline.enable_model_cpu_offload(gpu_id=self.device.index)

        # re-use the OpenCLIP model already loaded for the image conditioner as image_encoder_apm
        embedders = self.conditioner.embedders
        for embedder in embedders:
            if hasattr(embedder, "input_key") and embedder.input_key == "cond_frames_without_noise":
                self.image_encoder_apm = embedder.open_clip
        self.first_stage_model.to("cpu")
        self.conditioner.embedders[3].encoder.to("cpu")
        self.conditioner.embedders[0].open_clip.to("cpu")

        pipe = AutoPipelineForInpainting.from_pretrained(
            'Lykon/dreamshaper-8-inpainting', torch_dtype=torch.float16, variant="fp16", safety_checker=None, requires_safety_checker=False)

        pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config)
        pipe = pipe.to(self.device)
        pipe.enable_model_cpu_offload(gpu_id=self.device.index)
        self.inpaint_pipe = pipe

        processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-large")

        model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to(self.device)

        def blip(x): return processor.decode(model.generate(**processor(x,
                                             return_tensors='pt').to("cuda", torch.float16))[0], skip_special_tokens=True)
        self.blip = blip

    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/scripts/sampling/simple_video_sample.py
    def get_unique_embedder_keys_from_conditioner(self, conditioner):
        return list(set([x.input_key for x in conditioner.embedders]))

    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/scripts/sampling/simple_video_sample.py
    def get_batch_sgm(self, keys, value_dict, N, T, device):
        batch = {}
        batch_uc = {}

        for key in keys:
            if key == "fps_id":
                batch[key] = (
                    torch.tensor([value_dict["fps_id"]])
                    .to(device)
                    .repeat(int(math.prod(N)))
                )
            elif key == "motion_bucket_id":
                batch[key] = (
                    torch.tensor([value_dict["motion_bucket_id"]])
                    .to(device)
                    .repeat(int(math.prod(N)))
                )
            elif key == "cond_aug":
                batch[key] = repeat(
                    torch.tensor([value_dict["cond_aug"]]).to(device),
                    "1 -> b",
                    b=math.prod(N),
                )
            elif key == "cond_frames":
                batch[key] = repeat(value_dict["cond_frames"],
                                    "1 ... -> b ...", b=N[0])
            elif key == "cond_frames_without_noise":
                batch[key] = repeat(
                    value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
                )
            else:
                batch[key] = value_dict[key]

        if T is not None:
            batch["num_video_frames"] = T

        for key in batch.keys():
            if key not in batch_uc and isinstance(batch[key], torch.Tensor):
                batch_uc[key] = torch.clone(batch[key])
        return batch, batch_uc

    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/sgm/models/diffusion.py
    @torch.no_grad()
    def decode_first_stage(self, z):
        self.first_stage_model.to(self.device)

        z = 1.0 / self.diff_trainer_params.scale_factor * z
        n_samples = min(z.shape[0], 8)

        n_rounds = math.ceil(z.shape[0] / n_samples)
        all_out = []
        with torch.autocast("cuda", enabled=not self.diff_trainer_params.disable_first_stage_autocast):
            for n in range(n_rounds):
                if isinstance(self.first_stage_model.decoder, VideoDecoder):
                    kwargs = {"timesteps": len(
                        z[n * n_samples: (n + 1) * n_samples])}
                else:
                    kwargs = {}
                out = self.first_stage_model.decode(
                    z[n * n_samples: (n + 1) * n_samples], **kwargs
                )
                all_out.append(out)
        out = torch.cat(all_out, dim=0)
        self.first_stage_model.to("cpu")
        return out

    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/scripts/sampling/simple_video_sample.py
    def _generate_conditional_output(self, svd_input_frame, inference_params: InferenceParams, **params):
        C = 4
        F = 8  # spatial compression TODO read from model

        H = svd_input_frame.shape[-2]
        W = svd_input_frame.shape[-1]
        num_frames = self.sampler.guider.num_frames

        shape = (num_frames, C, H // F, W // F)
        batch_size = 1

        image = svd_input_frame[None, :]
        cond_aug = 0.02

        value_dict = {}
        value_dict["motion_bucket_id"] = 127
        value_dict["fps_id"] = 6
        value_dict["cond_aug"] = cond_aug
        value_dict["cond_frames_without_noise"] = image
        value_dict["cond_frames"] = image + cond_aug * torch.rand_like(image)

        batch, batch_uc = self.get_batch_sgm(
            self.get_unique_embedder_keys_from_conditioner(
                self.conditioner),
            value_dict,
            [1, num_frames],
            T=num_frames,
            device=self.device,
        )

        self.conditioner.embedders[3].encoder.to(self.device)
        self.conditioner.embedders[0].open_clip.to(self.device)
        c, uc = self.conditioner.get_unconditional_conditioning(
            batch,
            batch_uc=batch_uc,
            force_uc_zero_embeddings=[
                "cond_frames",
                "cond_frames_without_noise",
            ],
        )
        self.conditioner.embedders[3].encoder.to("cpu")
        self.conditioner.embedders[0].open_clip.to("cpu")

        for k in ["crossattn", "concat"]:
            uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
            uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
            c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
            c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)

        randn = torch.randn(shape, device=self.device)

        additional_model_inputs = {}
        additional_model_inputs["image_only_indicator"] = torch.zeros(2 * batch_size, num_frames).to(self.device)
        additional_model_inputs["num_video_frames"] = batch["num_video_frames"]

        # StreamingSVD inputs
        additional_model_inputs["batch_size"] = 2 * batch_size
        additional_model_inputs["num_conditional_frames"] = self.inference_params.num_conditional_frames
        additional_model_inputs["ctrl_frames"] = params["ctrl_frames"]

        self.inference_model.diffusion_model = self.inference_model.diffusion_model.to(
            self.device)
        self.inference_model.controlnet = self.inference_model.controlnet.to(
            self.device)

        c["vector"] = c["vector"].to(randn.dtype)
        uc["vector"] = uc["vector"].to(randn.dtype)

        def denoiser(input, sigma, c):
            return self.denoiser(self.inference_model, input, sigma, c, **additional_model_inputs)
        samples_z = self.sampler(denoiser, randn, cond=c, uc=uc)

        self.inference_model.diffusion_model = self.inference_model.diffusion_model.to("cpu")
        self.inference_model.controlnet = self.inference_model.controlnet.to("cpu")
        samples_x = self.decode_first_stage(samples_z)

        samples = torch.clamp(samples_x, min=-1.0, max=1.0)
        return samples

    def extract_anchor_frames(self, video, input_range, inference_params: InferenceParams):
        """
        Extracts anchor frames from the input video based on the provided inference parameters.

        Parameters:
        - video: torch.Tensor
            The input video tensor.
        - input_range: list
            The pixel value range of the input video.
        - inference_params: InferenceParams
            An object containing inference parameters. Its anchor_frames field specifies how the anchor frames are encoded:
            either a single number selecting the frame used as the anchor frame, or a range in the format "a:b" indicating
            that frames from index a up to index b (inclusive) are used as anchor frames.

        Returns:
        - torch.Tensor
            The anchor frames extracted from the input video.
        """
        video = result_processor.convert_range(video=video.clone(), input_range=input_range, output_range=[-1, 1])

        if video.shape[1] == 3 and video.shape[0] > 3:
            video = rearrange(video, "F C W H -> 1 F C W H")
        elif video.shape[0] > 3 and video.shape[-1] == 3:
            video = rearrange(video, "F W H C -> 1 F C W H")
        else:
            raise NotImplementedError(f"Unexpected video input format: {video.shape}")

        if ":" in inference_params.anchor_frames:
            anchor_frames = inference_params.anchor_frames.split(":")
            anchor_frames = [int(anchor_frame) for anchor_frame in anchor_frames]
            assert len(anchor_frames) == 2, "Anchor frames encoding wrong."
            anchor = video[:, anchor_frames[0]:anchor_frames[1]]
        else:
            anchor_frame = int(inference_params.anchor_frames)
            anchor = video[:, anchor_frame].unsqueeze(0)

        return anchor

    def extract_ctrl_frames(self, video: torch.FloatType, input_range: List[int], inference_params: InferenceParams):
        """
        Extracts control frames from the input video.

        Parameters:
        - video: torch.Tensor
            The input video tensor.
        - input_range: list
            The pixel value range of the input video.
        - inference_params: InferenceParams
            An object containing inference parameters.

        Returns:
        - torch.Tensor
            The control image encoding frames extracted from the input video.
        """
        video = result_processor.convert_range(video=video.clone(), input_range=input_range, output_range=[-1, 1])
        if video.shape[1] == 3 and video.shape[0] > 3:
            video = rearrange(video, "F C W H -> 1 F C W H")
        elif video.shape[0] > 3 and video.shape[-1] == 3:
            video = rearrange(video, "F W H C -> 1 F C W H")
        else:
            raise NotImplementedError(
                f"Unexpected video input format: {video.shape}")

        # return the last num_conditional_frames frames
        video = video[:, -inference_params.num_conditional_frames:]
        return video

    def _autoregressive_generation(self, initial_generation: Union[torch.FloatType, List[torch.FloatType]], inference_params: InferenceParams):
        """
        Perform autoregressive generation of video chunks based on the initial generation and inference parameters.

        Parameters:
        - initial_generation: torch.Tensor or list of torch.Tensor
            The initial generation or list of initial generation video chunks.
        - inference_params: InferenceParams
            An object containing inference parameters.

        Returns:
        - torch.Tensor
            The video resulting from autoregressive generation.
        """

        # input is [-1,1] float
        result_chunks = initial_generation
        if not isinstance(result_chunks, list):
            result_chunks = [result_chunks]

        # make sure the chunk layout is [F, C, W, H]
        if (result_chunks[0].shape[1] > 3) and (result_chunks[0].shape[-1] == 3):
            result_chunks = [rearrange(result_chunks[0], "F W H C -> F C W H")]

        # generate each chunk by conditioning on the previous chunks
        for _ in tqdm(list(range(inference_params.n_autoregressive_generations)), desc="StreamingSVD"):

            # extract anchor frames based on the entire video generated so far
            # note that we do not use an anchor frame in StreamingSVD (apart from the anchor frame already used by SVD)
            anchor_frames = self.extract_anchor_frames(
                video=torch.cat(result_chunks),
                inference_params=inference_params,
                input_range=[-1, 1],
            )

            # extract control frames based on the last generated chunk
            ctrl_frames = self.extract_ctrl_frames(
                video=result_chunks[-1],
                input_range=[-1, 1],
                inference_params=inference_params,
            )

            # select the anchor frame for SVD
            svd_input_frame = result_chunks[0][int(inference_params.anchor_frames)]

            # generate the next chunk
            # result is [F, C, H, W], range is [-1,1] float.
            result = self._generate_conditional_output(
                svd_input_frame=svd_input_frame,
                inference_params=inference_params,
                anchor_frames=anchor_frames,
                ctrl_frames=ctrl_frames,
            )

            # from each generation, we keep all frames except for the first <num_conditional_frames> frames
            result = result[inference_params.num_conditional_frames:]
            result_chunks.append(result)
            torch.cuda.empty_cache()

        # concat all chunks into one long video
        result_chunks = [result_processor.convert_range(chunk, output_range=[0, 255], input_range=[-1, 1]) for chunk in result_chunks]
        result = result_processor.concat_chunks(result_chunks)
        torch.cuda.empty_cache()
        return result

    def ensure_image_ratio(self, source_image: PIL.Image.Image, target_aspect_ratio=16/9):

        if source_image.width / source_image.height == target_aspect_ratio:
            return source_image, None

        image = source_image.copy().convert("RGBA")
        mask = image.split()[-1]
        image = image.convert("RGB")
        padding = get_padding_for_aspect_ratio(image)

        # mask
        mask_padded = TF.pad(mask, padding)
        mask_padded_size = mask_padded.size
        mask_padded_resized = TF.resize(mask_padded, (512, 512),
                                        interpolation=TF.InterpolationMode.NEAREST)
        mask_padded_resized = TF.invert(mask_padded_resized)

        # image
        padded_input_image = TF.pad(image, padding, padding_mode="reflect")
        resized_image = TF.resize(padded_input_image, (512, 512))

        image_tensor = (self.inpaint_pipe.image_processor.preprocess(
            resized_image).cuda().half())
        latent_tensor = self.inpaint_pipe._encode_vae_image(image_tensor, None)
        self.inpaint_pipe.scheduler.set_timesteps(999)
        noisy_latent_tensor = self.inpaint_pipe.scheduler.add_noise(
            latent_tensor,
            torch.randn_like(latent_tensor),
            self.inpaint_pipe.scheduler.timesteps[:1],
        )

        prompt = self.blip(source_image)
        if prompt.startswith("there is "):
            prompt = prompt[len("there is "):]

        output_image_normalized_size = self.inpaint_pipe(
            prompt=prompt,
            image=resized_image,
            mask_image=mask_padded_resized,
            latents=noisy_latent_tensor,
        ).images[0]

        output_image_extended_size = TF.resize(
            output_image_normalized_size, mask_padded_size[::-1])

        blured_outpainting_mask = TF.invert(mask_padded).filter(
            ImageFilter.GaussianBlur(radius=5))

        final_image = Image.composite(
            output_image_extended_size, padded_input_image, blured_outpainting_mask)
        return final_image, TF.invert(mask_padded)

    def image_to_video(self, batch, inference_params: InferenceParams, batch_idx):
        """
        Performs image-to-video generation based on the input batch and inference parameters.
        It runs SVD-XT once to generate the first chunk, then auto-regressively applies StreamingSVD.

        Parameters:
        - batch: dict
            The input batch containing the start image for generating the video.
        - inference_params: InferenceParams
            An object containing inference parameters.
        - batch_idx: int
            The index of the batch.

        Returns:
        - torch.Tensor
            The video generated from the input image.
        """
        batch_key = "image"
        assert batch_key == "image", f"Generating video from {batch_key} not implemented."
        input_image = PIL.Image.fromarray(batch[batch_key][0].cpu().numpy())
        # TODO remove conversion forth and back

        outpainted_image, _ = self.ensure_image_ratio(input_image)

        scaled_outpainted_image, expanded_size = resize_and_crop(outpainted_image)
        assert scaled_outpainted_image.width == 1024 and scaled_outpainted_image.height == 576, f"Wrong shape for file {batch[batch_key]} with shape {scaled_outpainted_image.width}:{scaled_outpainted_image.height}."

        # Generating the first chunk
        with torch.autocast(device_type="cuda", enabled=False):
            video_chunks = self.svd_pipeline(
                scaled_outpainted_image, decode_chunk_size=8).frames[0]

        video_chunks = torch.stack([ToTensor()(frame) for frame in video_chunks])
        video_chunks = video_chunks * 2.0 - 1  # [-1,1], float

        video_chunks = video_chunks.to(self.device)

        video = self._autoregressive_generation(
            initial_generation=video_chunks,
            inference_params=inference_params)

        return video, scaled_outpainted_image, expanded_size

    def generate_output(self, batch, batch_idx, inference_params: InferenceParams):
        """
        Generate the output video based on the input batch and inference parameters.

        Parameters:
        - batch: dict
            The input batch containing data for generating the output video.
        - batch_idx: int
            The index of the batch.
        - inference_params: InferenceParams
            An object containing inference parameters.

        Returns:
        - torch.Tensor
            The generated video. Note the result is also accessible via self.trainer.generated_video.
        """

        sample_id = batch["sample_id"].item()
        video, scaled_outpainted_image, expanded_size = self.image_to_video(
            batch, inference_params=inference_params, batch_idx=sample_id)

        self.trainer.generated_video = video.numpy()
        self.trainer.expanded_size = expanded_size
        self.trainer.scaled_outpainted_image = scaled_outpainted_image
        return video
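The numbers in config.yaml make the length of one StreamingSVD pass easy to check: each SVD chunk holds 25 frames (guider_config num_frames), every autoregressive step re-uses 7 of them as conditioning (num_conditional_frames) and keeps the rest, and n_autoregressive_generations controls how many extra chunks are produced before interpolation and enhancement. A quick back-of-the-envelope check, assuming those default values:

# Frame budget of the loop in _autoregressive_generation, using the defaults from
# config.yaml (assumed here; they can be changed at run time).
chunk_len = 25                  # frames produced per SVD / StreamingSVD pass
num_conditional_frames = 7      # frames re-used as conditioning and then dropped
n_autoregressive_generations = 2

total = chunk_len + n_autoregressive_generations * (chunk_len - num_conditional_frames)
print(total)  # 25 + 2 * 18 = 61 frames before VFI interpolation and enhancement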
gradio_demo.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
from utils.gradio_utils import *
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
GRADIO_CACHE = ""
|
7 |
+
|
8 |
+
parser = argparse.ArgumentParser()
|
9 |
+
parser.add_argument('--public_access', action='store_true')
|
10 |
+
args = parser.parse_args()
|
11 |
+
|
12 |
+
streaming_svd = StreamingSVD(load_argv=False)
|
13 |
+
on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
|
14 |
+
|
15 |
+
examples = [
|
16 |
+
["Experience the dance of jellyfish: float through mesmerizing swarms of jellyfish, pulsating with otherworldly grace and beauty.",
|
17 |
+
"200 - frames (recommended)", 33, None, None],
|
18 |
+
["Dive into the depths of the ocean: explore vibrant coral reefs, mysterious underwater caves, and the mesmerizing creatures that call the sea home.",
|
19 |
+
"200 - frames (recommended)", 33, None, None],
|
20 |
+
["A cute cat.",
|
21 |
+
"200 - frames (recommended)", 33, None, None],
|
22 |
+
["",
|
23 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test1.jpg", None],
|
24 |
+
["",
|
25 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test2.jpg", None],
|
26 |
+
["",
|
27 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test3.png", None],
|
28 |
+
["",
|
29 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test4.png", None],
|
30 |
+
["",
|
31 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test5.jpg", None],
|
32 |
+
["",
|
33 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test6.png", None],
|
34 |
+
["",
|
35 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test7.jpg", None],
|
36 |
+
["",
|
37 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test8.jpg", None],
|
38 |
+
["",
|
39 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test9.jpg", None],
|
40 |
+
["",
|
41 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test10.jpg", None],
|
42 |
+
["",
|
43 |
+
"200 - frames (recommended)", 33, "__assets__/gradio_cached_examples/test11.jpg", None],
|
44 |
+
]
|
45 |
+
|
46 |
+
def generate(prompt, num_frames, seed, image: np.ndarray):
|
47 |
+
if num_frames == [] or num_frames is None:
|
48 |
+
num_frames = 50
|
49 |
+
else:
|
50 |
+
num_frames = int(num_frames.split(" ")[0])
|
51 |
+
if num_frames > 200: # and on_huggingspace:
|
52 |
+
num_frames = 200
|
53 |
+
|
54 |
+
if image is None:
|
55 |
+
image = text_to_image_gradio(
|
56 |
+
prompt=prompt, streaming_svd=streaming_svd, seed=seed)
|
57 |
+
|
58 |
+
video_file_stage_one = image_to_video_vfi_gradio(
|
59 |
+
img=image, num_frames=num_frames, streaming_svd=streaming_svd, seed=seed, gradio_cache=GRADIO_CACHE)
|
60 |
+
|
61 |
+
expanded_size, orig_size, scaled_outpainted_image = retrieve_intermediate_data(video_file_stage_one)
|
62 |
+
|
63 |
+
video_file_stage_two = enhance_video_vfi_gradio(
|
64 |
+
img=scaled_outpainted_image, video=video_file_stage_one.replace("__cropped__", "__expanded__"), num_frames=24, streaming_svd=streaming_svd, seed=seed, expanded_size=expanded_size, orig_size=orig_size, gradio_cache=GRADIO_CACHE)
|
65 |
+
|
66 |
+
return image, video_file_stage_one, video_file_stage_two
|
67 |
+
|
68 |
+
|
69 |
+
def enhance(prompt, num_frames, seed, image: np.ndarray, video:str):
|
70 |
+
if num_frames == [] or num_frames is None:
|
71 |
+
num_frames = 50
|
72 |
+
else:
|
73 |
+
num_frames = int(num_frames.split(" ")[0])
|
74 |
+
if num_frames > 200: # and on_huggingspace:
|
75 |
+
num_frames = 200
|
76 |
+
|
77 |
+
# User directly applied Long Video Generation (without preview) with Flux.
|
78 |
+
if image is None:
|
79 |
+
image = text_to_image_gradio(
|
80 |
+
prompt=prompt, streaming_svd=streaming_svd, seed=seed)
|
81 |
+
|
82 |
+
# User directly applied Long Video Generation (without preview) with or without Flux.
|
83 |
+
if video is None:
|
84 |
+
video = image_to_video_gradio(
|
85 |
+
img=image, num_frames=(num_frames+1) // 2, streaming_svd=streaming_svd, seed=seed, gradio_cache=GRADIO_CACHE)
|
86 |
+
expanded_size, orig_size, scaled_outpainted_image = retrieve_intermediate_data(video)
|
87 |
+
|
88 |
+
# Here the video is path and image is numpy array
|
89 |
+
video_file_stage_two = enhance_video_vfi_gradio(
|
90 |
+
img=scaled_outpainted_image, video=video.replace("__cropped__", "__expanded__"), num_frames=num_frames, streaming_svd=streaming_svd, seed=seed, expanded_size=expanded_size, orig_size=orig_size, gradio_cache=GRADIO_CACHE)
|
91 |
+
|
92 |
+
return image, video_file_stage_two
|
93 |
+
|
94 |
+
|
95 |
+
with gr.Blocks() as demo:
|
96 |
+
GRADIO_CACHE = demo.GRADIO_CACHE
|
97 |
+
gr.HTML("""
|
98 |
+
<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
|
99 |
+
<h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
|
100 |
+
<a href="https://github.com/Picsart-AI-Research/StreamingT2V" style="color:blue;">StreamingSVD</a>
|
101 |
+
</h1>
|
102 |
+
<h2 style="font-weight: 650; font-size: 2rem; margin: 0rem">
|
103 |
+
A StreamingT2V method for high-quality long video generation
|
104 |
+
</h2>
|
105 |
+
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
|
106 |
+
Roberto Henschel<sup>1*</sup>, Levon Khachatryan<sup>1*</sup>, Daniil Hayrapetyan<sup>1*</sup>, Hayk Poghosyan<sup>1</sup>, Vahram Tadevosyan<sup>1</sup>, Zhangyang Wang<sup>1,2</sup>, Shant Navasardyan<sup>1</sup>, <a href="https://www.humphreyshi.com/" style="color:blue;">Humphrey Shi</a><sup>1,3</sup>
|
107 |
+
</h2>
|
108 |
+
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
|
109 |
+
<sup>1</sup>Picsart AI Resarch (PAIR), <sup>2</sup>UT Austin, <sup>3</sup>SHI Labs @ Georgia Tech, Oregon & UIUC
|
110 |
+
</h2>
|
111 |
+
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
|
112 |
+
*Equal Contribution
|
113 |
+
</h2>
|
114 |
+
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
|
115 |
+
[<a href="https://arxiv.org/abs/2403.14773" style="color:blue;">arXiv</a>]
|
116 |
+
[<a href="https://github.com/Picsart-AI-Research/StreamingT2V" style="color:blue;">GitHub</a>]
|
117 |
+
</h2>
|
118 |
+
<h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
|
119 |
+
<b>StreamingSVD</b> is an advanced autoregressive technique for text-to-video and image-to-video generation,
|
120 |
+
generating long hiqh-quality videos with rich motion dynamics, turning SVD into a long video generator.
|
121 |
+
Our method ensures temporal consistency throughout the video, aligns closely to the input text/image,
|
122 |
+
and maintains high frame-level image quality. Our demonstrations include successful examples of videos
|
123 |
+
up to 200 frames, spanning 8 seconds, and can be extended for even longer durations.
|
124 |
+
</h2>
|
125 |
+
</div>
|
126 |
+
""")
|
127 |
+
|
128 |
+
if on_huggingspace:
|
129 |
+
gr.HTML("""
|
130 |
+
<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
|
131 |
+
<br/>
|
132 |
+
<a href="https://huggingface.co/spaces/PAIR/StreamingT2V?duplicate=true">
|
133 |
+
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
|
134 |
+
</p>""")
|
135 |
+
|
136 |
+
with gr.Row():
|
137 |
+
with gr.Column(scale=1):
|
138 |
+
with gr.Row():
|
139 |
+
with gr.Column():
|
140 |
+
with gr.Row():
|
141 |
+
num_frames = gr.Dropdown(["50 - frames (recommended)", "80 - frames (recommended)", "140 - frames (recommended)", "200 - frames (recommended)", "500 - frames", "1000 - frames", "10000 - frames"],
|
142 |
+
label="Number of Video Frames", info="For >200 frames use local workstation!", value="50 - frames (recommended)")
|
143 |
+
with gr.Row():
|
144 |
+
prompt_stage1 = gr.Textbox(label='Text-to-Video (Enter text prompt here)',
|
145 |
+
interactive=True, max_lines=1)
|
146 |
+
with gr.Row():
|
147 |
+
image_stage1 = gr.Image(label='Image-to-Video (Upload Image here, text prompt will be ignored for I2V if entered)',
|
148 |
+
show_label=True, show_download_button=True, interactive=True, height=250)
|
149 |
+
with gr.Column():
|
150 |
+
video_stage1 = gr.Video(label='Long Video Preview', show_label=True,
|
151 |
+
interactive=False, show_download_button=True, height=203)
|
152 |
+
with gr.Row():
|
153 |
+
run_button_stage1 = gr.Button("Long Video Generation (faster preview)")
|
154 |
+
with gr.Row():
|
155 |
+
with gr.Column():
|
156 |
+
with gr.Accordion('Advanced options', open=False):
|
157 |
+
seed = gr.Slider(label='Seed', minimum=0,
|
158 |
+
maximum=65536, value=33, step=1,)
|
159 |
+
|
160 |
+
with gr.Column(scale=3):
|
161 |
+
with gr.Row():
|
162 |
+
video_stage2 = gr.Video(label='High-Quality Long Video (Preview or Full)', show_label=True,
|
163 |
+
interactive=False, show_download_button=True, height=700)
|
164 |
+
with gr.Row():
|
165 |
+
run_button_stage2 = gr.Button("Long Video Generation (full high-quality)")
|
166 |
+
|
167 |
+
inputs_t2v = [prompt_stage1, num_frames,
|
168 |
+
seed, image_stage1]
|
169 |
+
inputs_v2v = [prompt_stage1, num_frames, seed,
|
170 |
+
image_stage1, video_stage1]
|
171 |
+
|
172 |
+
run_button_stage1.click(fn=generate, inputs=inputs_t2v,
|
173 |
+
outputs=[image_stage1, video_stage1, video_stage2])
|
174 |
+
run_button_stage2.click(fn=enhance, inputs=inputs_v2v,
|
175 |
+
outputs=[image_stage1, video_stage2])
|
176 |
+
|
177 |
+
|
178 |
+
gr.Examples(examples=examples,
|
179 |
+
inputs=inputs_v2v,
|
180 |
+
outputs=[image_stage1, video_stage2],
|
181 |
+
fn=enhance,
|
182 |
+
cache_examples=True,
|
183 |
+
run_on_click=False,
|
184 |
+
)
|
185 |
+
|
186 |
+
|
187 |
+
'''
|
188 |
+
'''
|
189 |
+
gr.HTML("""
|
190 |
+
<div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
|
191 |
+
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
|
192 |
+
<b>Version: v1.0</b>
|
193 |
+
</h3>
|
194 |
+
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
|
195 |
+
<b>Caution</b>:
|
196 |
+
We would like to raise users' awareness of the potential issues and concerns of this demo.
|
197 |
+
Like previous large foundation models, StreamingSVD could be problematic in some cases, partly because we use the pretrained ModelScope model, so StreamingSVD can inherit its imperfections.
|
198 |
+
So far, we keep all features available for research testing both to show the great potential of the StreamingSVD framework and to collect important feedback to improve the model in the future.
|
199 |
+
We welcome researchers and users to report issues via the HuggingFace community discussion feature or by emailing the authors.
|
200 |
+
</h3>
|
201 |
+
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
|
202 |
+
<b>Biases and content acknowledgement</b>:
|
203 |
+
Beware that StreamingSVD may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography, and violence.
|
204 |
+
StreamingSVD in this demo is meant only for research purposes.
|
205 |
+
</h3>
|
206 |
+
</div>
|
207 |
+
""")
|
208 |
+
|
209 |
+
|
210 |
+
if on_huggingspace:
|
211 |
+
demo.queue(max_size=20)
|
212 |
+
demo.launch(debug=True)
|
213 |
+
else:
|
214 |
+
demo.queue(api_open=False).launch(share=args.public_access)
|
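For reference, the two buttons above wire a two-stage flow: `generate` produces the long low-resolution preview, and `enhance` runs the chunk-wise high-quality pass on it. A minimal headless sketch of the same flow, assuming `generate`/`enhance` from earlier in this file are in scope and keeping the positional argument order of `inputs_t2v`/`inputs_v2v` (the prompt string below is a made-up example value):

# Headless sketch of the demo's two-stage flow (illustrative only).
prompt = "A surfer riding a wave at sunset"   # example prompt, not from the repo
num_frames = "80 - frames (recommended)"      # same dropdown string the UI passes through
seed = 33
image = None                                  # or an uploaded image for image-to-video

# Stage 1: fast low-resolution preview of the long video.
image_out, video_preview, _ = generate(prompt, num_frames, seed, image)
# Stage 2: chunk-wise high-quality enhancement of the preview.
image_out, video_hq = enhance(prompt, num_frames, seed, image_out, video_preview)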
i2v_enhance/i2v_enhance_interface.py
ADDED
@@ -0,0 +1,128 @@
1 |
+
import torch
|
2 |
+
from i2v_enhance.pipeline_i2vgen_xl import I2VGenXLPipeline
|
3 |
+
from tqdm import tqdm
|
4 |
+
from PIL import Image
|
5 |
+
import numpy as np
|
6 |
+
from einops import rearrange
|
7 |
+
import i2v_enhance.thirdparty.VFI.config as cfg
|
8 |
+
from i2v_enhance.thirdparty.VFI.Trainer import Model as VFI
|
9 |
+
from pathlib import Path
|
10 |
+
from modules.params.vfi import VFIParams
|
11 |
+
from modules.params.i2v_enhance import I2VEnhanceParams
|
12 |
+
from utils.loader import download_ckpt
|
13 |
+
|
14 |
+
|
15 |
+
def vfi_init(ckpt_cfg: VFIParams, device_id=0):
|
16 |
+
cfg.MODEL_CONFIG['MODEL_ARCH'] = cfg.init_model_config(F=32, depth=[
|
17 |
+
2, 2, 2, 4, 4])
|
18 |
+
vfi = VFI(-1)
|
19 |
+
|
20 |
+
ckpt_file = Path(download_ckpt(
|
21 |
+
local_path=ckpt_cfg.ckpt_path_local, global_path=ckpt_cfg.ckpt_path_global))
|
22 |
+
|
23 |
+
vfi.load_model(ckpt_file.as_posix())
|
24 |
+
vfi.eval()
|
25 |
+
vfi.device()
|
26 |
+
assert device_id == 0, "VFI on rank!=0 not implemented yet."
|
27 |
+
return vfi
|
28 |
+
|
29 |
+
|
30 |
+
def vfi_process(video, vfi, video_len):
|
31 |
+
video = video[:(video_len//2+1)]
|
32 |
+
|
33 |
+
video = [i[:, :, :3]/255. for i in video]
|
34 |
+
video = [i[:, :, ::-1] for i in video]
|
35 |
+
video = np.stack(video, axis=0)
|
36 |
+
video = rearrange(torch.from_numpy(video),
|
37 |
+
'b h w c -> b c h w').to("cuda", torch.float32)
|
38 |
+
|
39 |
+
frames = []
|
40 |
+
for i in tqdm(range(video.shape[0]-1), desc="VFI"):
|
41 |
+
I0_ = video[i:i+1, ...]
|
42 |
+
I2_ = video[i+1:i+2, ...]
|
43 |
+
frames.append((I0_[0].detach().cpu().numpy().transpose(
|
44 |
+
1, 2, 0) * 255.0).astype(np.uint8)[:, :, ::-1])
|
45 |
+
|
46 |
+
mid = (vfi.inference(I0_, I2_, TTA=True, fast_TTA=True)[
|
47 |
+
0].detach().cpu().numpy().transpose(1, 2, 0) * 255.0).astype(np.uint8)
|
48 |
+
frames.append(mid[:, :, ::-1])
|
49 |
+
|
50 |
+
frames.append((video[-1].detach().cpu().numpy().transpose(1,
|
51 |
+
2, 0) * 255.0).astype(np.uint8)[:, :, ::-1])
|
52 |
+
if video_len % 2 == 0:
|
53 |
+
frames.append((video[-1].detach().cpu().numpy().transpose(1,
|
54 |
+
2, 0) * 255.0).astype(np.uint8)[:, :, ::-1])
|
55 |
+
|
56 |
+
del vfi
|
57 |
+
del video
|
58 |
+
torch.cuda.empty_cache()
|
59 |
+
|
60 |
+
video = [Image.fromarray(frame).resize((1280, 720)) for frame in frames]
|
61 |
+
del frames
|
62 |
+
return video
|
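A note on the bookkeeping above: `vfi_process` keeps only the first `video_len//2 + 1` input frames, inserts one interpolated frame between every adjacent pair, appends the last frame once more when `video_len` is even, and resizes everything to 1280x720. A small helper that mirrors that frame count (illustrative only, not part of the repo):

def vfi_output_length(video_len: int) -> int:
    # Mirrors the frame bookkeeping of vfi_process above (illustrative helper).
    kept = video_len // 2 + 1      # low-fps frames taken from the start of the input
    out = 2 * (kept - 1) + 1       # one original + one interpolated frame per pair, plus the final frame
    if video_len % 2 == 0:
        out += 1                   # the final frame is appended twice for even video_len
    return out

# e.g. vfi_output_length(7) == 7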
63 |
+
|
64 |
+
|
65 |
+
def i2v_enhance_init(i2vgen_cfg: I2VEnhanceParams):
|
66 |
+
generator = torch.manual_seed(8888)
|
67 |
+
try:
|
68 |
+
pipeline = I2VGenXLPipeline.from_pretrained(
|
69 |
+
i2vgen_cfg.ckpt_path_local, torch_dtype=torch.float16, variant="fp16")
|
70 |
+
except Exception as e:
|
71 |
+
pipeline = I2VGenXLPipeline.from_pretrained(
|
72 |
+
i2vgen_cfg.ckpt_path_global, torch_dtype=torch.float16, variant="fp16")
|
73 |
+
pipeline.save_pretrained(i2vgen_cfg.ckpt_path_local)
|
74 |
+
pipeline.enable_model_cpu_offload()
|
75 |
+
return pipeline, generator
|
76 |
+
|
77 |
+
|
78 |
+
def i2v_enhance_process(image, video, pipeline, generator, overlap_size, strength, chunk_size=38, use_randomized_blending=False):
|
79 |
+
prompt = "High Quality, HQ, detailed."
|
80 |
+
negative_prompt = "Distorted, blurry, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
|
81 |
+
|
82 |
+
if use_randomized_blending:
|
83 |
+
# We first need to enhance key-frames (the 1st frame of each chunk)
|
84 |
+
video_chunks = [video[i:i+chunk_size] for i in range(0, len(
|
85 |
+
video), chunk_size-overlap_size) if len(video[i:i+chunk_size]) == chunk_size]
|
86 |
+
video_short = [chunk[0] for chunk in video_chunks]
|
87 |
+
|
88 |
+
# If randomized blending is used, we must have a list of starting images (one for each chunk)
|
89 |
+
image = pipeline(
|
90 |
+
prompt=prompt,
|
91 |
+
height=720,
|
92 |
+
width=1280,
|
93 |
+
image=image,
|
94 |
+
video=video_short,
|
95 |
+
strength=strength,
|
96 |
+
overlap_size=0,
|
97 |
+
chunk_size=len(video_short),
|
98 |
+
num_frames=len(video_short),
|
99 |
+
num_inference_steps=30,
|
100 |
+
decode_chunk_size=1,
|
101 |
+
negative_prompt=negative_prompt,
|
102 |
+
guidance_scale=9.0,
|
103 |
+
generator=generator,
|
104 |
+
).frames[0]
|
105 |
+
|
106 |
+
# Remove the last few frames (< chunk_size) of the video that do not fit into one chunk.
|
107 |
+
max_idx = (chunk_size - overlap_size) * \
|
108 |
+
(len(video_chunks) - 1) + chunk_size
|
109 |
+
video = video[:max_idx]
|
110 |
+
|
111 |
+
frames = pipeline(
|
112 |
+
prompt=prompt,
|
113 |
+
height=720,
|
114 |
+
width=1280,
|
115 |
+
image=image,
|
116 |
+
video=video,
|
117 |
+
strength=strength,
|
118 |
+
overlap_size=overlap_size,
|
119 |
+
chunk_size=chunk_size,
|
120 |
+
num_frames=chunk_size,
|
121 |
+
num_inference_steps=30,
|
122 |
+
decode_chunk_size=1,
|
123 |
+
negative_prompt=negative_prompt,
|
124 |
+
guidance_scale=9.0,
|
125 |
+
generator=generator,
|
126 |
+
).frames[0]
|
127 |
+
|
128 |
+
return frames
|
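To make the chunking above concrete: with randomized blending the chunk start indices advance by `chunk_size - overlap_size`, only complete chunks are kept, and `max_idx` trims the tail that does not fill a chunk. A short sketch of that arithmetic (`overlap_size=10` is an assumed example value; `chunk_size=38` is the function default):

def blending_chunks(n_frames: int, chunk_size: int = 38, overlap_size: int = 10):
    # Reproduces the chunk bookkeeping of i2v_enhance_process (illustrative helper).
    step = chunk_size - overlap_size
    starts = [i for i in range(0, n_frames, step) if i + chunk_size <= n_frames]
    max_idx = step * (len(starts) - 1) + chunk_size   # number of frames actually enhanced
    return starts, max_idx

# Example: 100 input frames -> chunk starts [0, 28, 56], max_idx = 94 (frames 94..99 are dropped).
starts, max_idx = blending_chunks(100)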
i2v_enhance/pipeline_i2vgen_xl.py
ADDED
@@ -0,0 +1,988 @@
1 |
+
# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import inspect
|
16 |
+
from dataclasses import dataclass
|
17 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
18 |
+
|
19 |
+
import numpy as np
|
20 |
+
import PIL
|
21 |
+
import torch
|
22 |
+
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
23 |
+
|
24 |
+
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
|
25 |
+
from diffusers.models import AutoencoderKL
|
26 |
+
from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet
|
27 |
+
from diffusers.schedulers import DDIMScheduler
|
28 |
+
from diffusers.utils import (
|
29 |
+
BaseOutput,
|
30 |
+
logging,
|
31 |
+
replace_example_docstring,
|
32 |
+
)
|
33 |
+
from diffusers.utils.torch_utils import randn_tensor
|
34 |
+
from diffusers.video_processor import VideoProcessor
|
35 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
36 |
+
import random
|
37 |
+
|
38 |
+
|
39 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
40 |
+
|
41 |
+
EXAMPLE_DOC_STRING = """
|
42 |
+
Examples:
|
43 |
+
```py
|
44 |
+
>>> import torch
|
45 |
+
>>> from diffusers import I2VGenXLPipeline
|
46 |
+
>>> from diffusers.utils import export_to_gif, load_image
|
47 |
+
|
48 |
+
>>> pipeline = I2VGenXLPipeline.from_pretrained(
|
49 |
+
... "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
|
50 |
+
... )
|
51 |
+
>>> pipeline.enable_model_cpu_offload()
|
52 |
+
|
53 |
+
>>> image_url = (
|
54 |
+
... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
|
55 |
+
... )
|
56 |
+
>>> image = load_image(image_url).convert("RGB")
|
57 |
+
|
58 |
+
>>> prompt = "Papers were floating in the air on a table in the library"
|
59 |
+
>>> negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
|
60 |
+
>>> generator = torch.manual_seed(8888)
|
61 |
+
|
62 |
+
>>> frames = pipeline(
|
63 |
+
... prompt=prompt,
|
64 |
+
... image=image,
|
65 |
+
... num_inference_steps=50,
|
66 |
+
... negative_prompt=negative_prompt,
|
67 |
+
... guidance_scale=9.0,
|
68 |
+
... generator=generator,
|
69 |
+
... ).frames[0]
|
70 |
+
>>> video_path = export_to_gif(frames, "i2v.gif")
|
71 |
+
```
|
72 |
+
"""
|
73 |
+
|
74 |
+
|
75 |
+
@dataclass
|
76 |
+
class I2VGenXLPipelineOutput(BaseOutput):
|
77 |
+
r"""
|
78 |
+
Output class for image-to-video pipeline.
|
79 |
+
|
80 |
+
Args:
|
81 |
+
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
82 |
+
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
83 |
+
denoised
|
84 |
+
PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
|
85 |
+
`(batch_size, num_frames, channels, height, width)`
|
86 |
+
"""
|
87 |
+
|
88 |
+
frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
|
89 |
+
|
90 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
|
91 |
+
|
92 |
+
|
93 |
+
def retrieve_latents(
|
94 |
+
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
|
95 |
+
):
|
96 |
+
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
|
97 |
+
return encoder_output.latent_dist.sample(generator)
|
98 |
+
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
|
99 |
+
return encoder_output.latent_dist.mode()
|
100 |
+
elif hasattr(encoder_output, "latents"):
|
101 |
+
return encoder_output.latents
|
102 |
+
else:
|
103 |
+
raise AttributeError(
|
104 |
+
"Could not access latents of provided encoder_output")
|
105 |
+
|
106 |
+
|
107 |
+
class I2VGenXLPipeline(
|
108 |
+
DiffusionPipeline,
|
109 |
+
StableDiffusionMixin,
|
110 |
+
):
|
111 |
+
r"""
|
112 |
+
Pipeline for image-to-video generation as proposed in [I2VGenXL](https://i2vgen-xl.github.io/).
|
113 |
+
|
114 |
+
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
115 |
+
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
116 |
+
|
117 |
+
Args:
|
118 |
+
vae ([`AutoencoderKL`]):
|
119 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
120 |
+
text_encoder ([`CLIPTextModel`]):
|
121 |
+
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
|
122 |
+
tokenizer (`CLIPTokenizer`):
|
123 |
+
A [`~transformers.CLIPTokenizer`] to tokenize text.
|
124 |
+
unet ([`I2VGenXLUNet`]):
|
125 |
+
A [`I2VGenXLUNet`] to denoise the encoded video latents.
|
126 |
+
scheduler ([`DDIMScheduler`]):
|
127 |
+
A scheduler to be used in combination with `unet` to denoise the encoded image latents.
|
128 |
+
"""
|
129 |
+
|
130 |
+
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
|
131 |
+
|
132 |
+
def __init__(
|
133 |
+
self,
|
134 |
+
vae: AutoencoderKL,
|
135 |
+
text_encoder: CLIPTextModel,
|
136 |
+
tokenizer: CLIPTokenizer,
|
137 |
+
image_encoder: CLIPVisionModelWithProjection,
|
138 |
+
feature_extractor: CLIPImageProcessor,
|
139 |
+
unet: I2VGenXLUNet,
|
140 |
+
scheduler: DDIMScheduler,
|
141 |
+
):
|
142 |
+
super().__init__()
|
143 |
+
|
144 |
+
self.register_modules(
|
145 |
+
vae=vae,
|
146 |
+
text_encoder=text_encoder,
|
147 |
+
tokenizer=tokenizer,
|
148 |
+
image_encoder=image_encoder,
|
149 |
+
feature_extractor=feature_extractor,
|
150 |
+
unet=unet,
|
151 |
+
scheduler=scheduler,
|
152 |
+
)
|
153 |
+
self.vae_scale_factor = 2 ** (
|
154 |
+
len(self.vae.config.block_out_channels) - 1)
|
155 |
+
# `do_resize=False` as we do custom resizing.
|
156 |
+
self.video_processor = VideoProcessor(
|
157 |
+
vae_scale_factor=self.vae_scale_factor, do_resize=False)
|
158 |
+
|
159 |
+
@property
|
160 |
+
def guidance_scale(self):
|
161 |
+
return self._guidance_scale
|
162 |
+
|
163 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
164 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
165 |
+
# corresponds to doing no classifier free guidance.
|
166 |
+
@property
|
167 |
+
def do_classifier_free_guidance(self):
|
168 |
+
return self._guidance_scale > 1
|
169 |
+
|
170 |
+
def encode_prompt(
|
171 |
+
self,
|
172 |
+
prompt,
|
173 |
+
device,
|
174 |
+
num_videos_per_prompt,
|
175 |
+
negative_prompt=None,
|
176 |
+
prompt_embeds: Optional[torch.Tensor] = None,
|
177 |
+
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
178 |
+
clip_skip: Optional[int] = None,
|
179 |
+
):
|
180 |
+
r"""
|
181 |
+
Encodes the prompt into text encoder hidden states.
|
182 |
+
|
183 |
+
Args:
|
184 |
+
prompt (`str` or `List[str]`, *optional*):
|
185 |
+
prompt to be encoded
|
186 |
+
device: (`torch.device`):
|
187 |
+
torch device
|
188 |
+
num_videos_per_prompt (`int`):
|
189 |
+
number of images that should be generated per prompt
|
190 |
+
do_classifier_free_guidance (`bool`):
|
191 |
+
whether to use classifier free guidance or not
|
192 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
193 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
194 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
195 |
+
less than `1`).
|
196 |
+
prompt_embeds (`torch.Tensor`, *optional*):
|
197 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
198 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
199 |
+
negative_prompt_embeds (`torch.Tensor`, *optional*):
|
200 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
201 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
202 |
+
argument.
|
203 |
+
clip_skip (`int`, *optional*):
|
204 |
+
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
205 |
+
the output of the pre-final layer will be used for computing the prompt embeddings.
|
206 |
+
"""
|
207 |
+
if prompt is not None and isinstance(prompt, str):
|
208 |
+
batch_size = 1
|
209 |
+
elif prompt is not None and isinstance(prompt, list):
|
210 |
+
batch_size = len(prompt)
|
211 |
+
else:
|
212 |
+
batch_size = prompt_embeds.shape[0]
|
213 |
+
|
214 |
+
if prompt_embeds is None:
|
215 |
+
text_inputs = self.tokenizer(
|
216 |
+
prompt,
|
217 |
+
padding="max_length",
|
218 |
+
max_length=self.tokenizer.model_max_length,
|
219 |
+
truncation=True,
|
220 |
+
return_tensors="pt",
|
221 |
+
)
|
222 |
+
text_input_ids = text_inputs.input_ids
|
223 |
+
untruncated_ids = self.tokenizer(
|
224 |
+
prompt, padding="longest", return_tensors="pt").input_ids
|
225 |
+
|
226 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
|
227 |
+
text_input_ids, untruncated_ids
|
228 |
+
):
|
229 |
+
removed_text = self.tokenizer.batch_decode(
|
230 |
+
untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
|
231 |
+
)
|
232 |
+
logger.warning(
|
233 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
234 |
+
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
235 |
+
)
|
236 |
+
|
237 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
238 |
+
attention_mask = text_inputs.attention_mask.to(device)
|
239 |
+
else:
|
240 |
+
attention_mask = None
|
241 |
+
|
242 |
+
if clip_skip is None:
|
243 |
+
prompt_embeds = self.text_encoder(
|
244 |
+
text_input_ids.to(device), attention_mask=attention_mask)
|
245 |
+
prompt_embeds = prompt_embeds[0]
|
246 |
+
else:
|
247 |
+
prompt_embeds = self.text_encoder(
|
248 |
+
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
|
249 |
+
)
|
250 |
+
# Access the `hidden_states` first, that contains a tuple of
|
251 |
+
# all the hidden states from the encoder layers. Then index into
|
252 |
+
# the tuple to access the hidden states from the desired layer.
|
253 |
+
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
|
254 |
+
# We also need to apply the final LayerNorm here to not mess with the
|
255 |
+
# representations. The `last_hidden_states` that we typically use for
|
256 |
+
# obtaining the final prompt representations passes through the LayerNorm
|
257 |
+
# layer.
|
258 |
+
prompt_embeds = self.text_encoder.text_model.final_layer_norm(
|
259 |
+
prompt_embeds)
|
260 |
+
|
261 |
+
if self.text_encoder is not None:
|
262 |
+
prompt_embeds_dtype = self.text_encoder.dtype
|
263 |
+
elif self.unet is not None:
|
264 |
+
prompt_embeds_dtype = self.unet.dtype
|
265 |
+
else:
|
266 |
+
prompt_embeds_dtype = prompt_embeds.dtype
|
267 |
+
|
268 |
+
prompt_embeds = prompt_embeds.to(
|
269 |
+
dtype=prompt_embeds_dtype, device=device)
|
270 |
+
|
271 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
272 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
273 |
+
prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
|
274 |
+
prompt_embeds = prompt_embeds.view(
|
275 |
+
bs_embed * num_videos_per_prompt, seq_len, -1)
|
276 |
+
|
277 |
+
# get unconditional embeddings for classifier free guidance
|
278 |
+
if self.do_classifier_free_guidance and negative_prompt_embeds is None:
|
279 |
+
uncond_tokens: List[str]
|
280 |
+
if negative_prompt is None:
|
281 |
+
uncond_tokens = [""] * batch_size
|
282 |
+
elif prompt is not None and type(prompt) is not type(negative_prompt):
|
283 |
+
raise TypeError(
|
284 |
+
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
285 |
+
f" {type(prompt)}."
|
286 |
+
)
|
287 |
+
elif isinstance(negative_prompt, str):
|
288 |
+
uncond_tokens = [negative_prompt]
|
289 |
+
elif batch_size != len(negative_prompt):
|
290 |
+
raise ValueError(
|
291 |
+
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
292 |
+
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
293 |
+
" the batch size of `prompt`."
|
294 |
+
)
|
295 |
+
else:
|
296 |
+
uncond_tokens = negative_prompt
|
297 |
+
|
298 |
+
max_length = prompt_embeds.shape[1]
|
299 |
+
uncond_input = self.tokenizer(
|
300 |
+
uncond_tokens,
|
301 |
+
padding="max_length",
|
302 |
+
max_length=max_length,
|
303 |
+
truncation=True,
|
304 |
+
return_tensors="pt",
|
305 |
+
)
|
306 |
+
|
307 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
308 |
+
attention_mask = uncond_input.attention_mask.to(device)
|
309 |
+
else:
|
310 |
+
attention_mask = None
|
311 |
+
|
312 |
+
# Apply clip_skip to negative prompt embeds
|
313 |
+
if clip_skip is None:
|
314 |
+
negative_prompt_embeds = self.text_encoder(
|
315 |
+
uncond_input.input_ids.to(device),
|
316 |
+
attention_mask=attention_mask,
|
317 |
+
)
|
318 |
+
negative_prompt_embeds = negative_prompt_embeds[0]
|
319 |
+
else:
|
320 |
+
negative_prompt_embeds = self.text_encoder(
|
321 |
+
uncond_input.input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
|
322 |
+
)
|
323 |
+
# Access the `hidden_states` first, that contains a tuple of
|
324 |
+
# all the hidden states from the encoder layers. Then index into
|
325 |
+
# the tuple to access the hidden states from the desired layer.
|
326 |
+
negative_prompt_embeds = negative_prompt_embeds[-1][-(
|
327 |
+
clip_skip + 1)]
|
328 |
+
# We also need to apply the final LayerNorm here to not mess with the
|
329 |
+
# representations. The `last_hidden_states` that we typically use for
|
330 |
+
# obtaining the final prompt representations passes through the LayerNorm
|
331 |
+
# layer.
|
332 |
+
negative_prompt_embeds = self.text_encoder.text_model.final_layer_norm(
|
333 |
+
negative_prompt_embeds)
|
334 |
+
|
335 |
+
if self.do_classifier_free_guidance:
|
336 |
+
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
337 |
+
seq_len = negative_prompt_embeds.shape[1]
|
338 |
+
|
339 |
+
negative_prompt_embeds = negative_prompt_embeds.to(
|
340 |
+
dtype=prompt_embeds_dtype, device=device)
|
341 |
+
|
342 |
+
negative_prompt_embeds = negative_prompt_embeds.repeat(
|
343 |
+
1, num_videos_per_prompt, 1)
|
344 |
+
negative_prompt_embeds = negative_prompt_embeds.view(
|
345 |
+
batch_size * num_videos_per_prompt, seq_len, -1)
|
346 |
+
|
347 |
+
return prompt_embeds, negative_prompt_embeds
|
348 |
+
|
349 |
+
def _encode_image(self, image, device, num_videos_per_prompt):
|
350 |
+
dtype = next(self.image_encoder.parameters()).dtype
|
351 |
+
|
352 |
+
if not isinstance(image, torch.Tensor):
|
353 |
+
image = self.video_processor.pil_to_numpy(image)
|
354 |
+
image = self.video_processor.numpy_to_pt(image)
|
355 |
+
|
356 |
+
# Normalize the image with CLIP training stats.
|
357 |
+
image = self.feature_extractor(
|
358 |
+
images=image,
|
359 |
+
do_normalize=True,
|
360 |
+
do_center_crop=False,
|
361 |
+
do_resize=False,
|
362 |
+
do_rescale=False,
|
363 |
+
return_tensors="pt",
|
364 |
+
).pixel_values
|
365 |
+
|
366 |
+
image = image.to(device=device, dtype=dtype)
|
367 |
+
image_embeddings = self.image_encoder(image).image_embeds
|
368 |
+
image_embeddings = image_embeddings.unsqueeze(1)
|
369 |
+
|
370 |
+
# duplicate image embeddings for each generation per prompt, using mps friendly method
|
371 |
+
bs_embed, seq_len, _ = image_embeddings.shape
|
372 |
+
image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
|
373 |
+
image_embeddings = image_embeddings.view(
|
374 |
+
bs_embed * num_videos_per_prompt, seq_len, -1)
|
375 |
+
|
376 |
+
if self.do_classifier_free_guidance:
|
377 |
+
negative_image_embeddings = torch.zeros_like(image_embeddings)
|
378 |
+
image_embeddings = torch.cat(
|
379 |
+
[negative_image_embeddings, image_embeddings])
|
380 |
+
|
381 |
+
return image_embeddings
|
382 |
+
|
383 |
+
def decode_latents(self, latents, decode_chunk_size=None):
|
384 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
385 |
+
|
386 |
+
batch_size, channels, num_frames, height, width = latents.shape
|
387 |
+
latents = latents.permute(0, 2, 1, 3, 4).reshape(
|
388 |
+
batch_size * num_frames, channels, height, width)
|
389 |
+
|
390 |
+
if decode_chunk_size is not None:
|
391 |
+
frames = []
|
392 |
+
for i in range(0, latents.shape[0], decode_chunk_size):
|
393 |
+
frame = self.vae.decode(
|
394 |
+
latents[i: i + decode_chunk_size]).sample
|
395 |
+
frames.append(frame)
|
396 |
+
image = torch.cat(frames, dim=0)
|
397 |
+
else:
|
398 |
+
image = self.vae.decode(latents).sample
|
399 |
+
|
400 |
+
decode_shape = (batch_size, num_frames, -1) + image.shape[2:]
|
401 |
+
video = image[None, :].reshape(decode_shape).permute(0, 2, 1, 3, 4)
|
402 |
+
|
403 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
404 |
+
video = video.float()
|
405 |
+
return video
|
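`decode_latents` flattens the batch and frame dimensions and, when `decode_chunk_size` is set, pushes only that many frames through the VAE decoder at a time (the enhancement code calls the pipeline with `decode_chunk_size=1`), trading speed for a much smaller memory peak. A minimal sketch of the same slicing pattern with a dummy decoder (illustrative only):

import torch
import torch.nn.functional as F

def decode_in_chunks(decode_fn, latents: torch.Tensor, decode_chunk_size: int) -> torch.Tensor:
    # Decode a (num_frames, C, h, w) latent batch a few frames at a time (illustrative helper).
    frames = [decode_fn(latents[i:i + decode_chunk_size])
              for i in range(0, latents.shape[0], decode_chunk_size)]
    return torch.cat(frames, dim=0)

# Dummy "decoder" that just upsamples by the VAE scale factor of 8 used here.
fake_decode = lambda z: F.interpolate(z, scale_factor=8)
video = decode_in_chunks(fake_decode, torch.randn(38, 4, 90, 160), decode_chunk_size=1)  # -> (38, 4, 720, 1280)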
406 |
+
|
407 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
|
408 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
409 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
410 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
411 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
412 |
+
# and should be between [0, 1]
|
413 |
+
|
414 |
+
accepts_eta = "eta" in set(inspect.signature(
|
415 |
+
self.scheduler.step).parameters.keys())
|
416 |
+
extra_step_kwargs = {}
|
417 |
+
if accepts_eta:
|
418 |
+
extra_step_kwargs["eta"] = eta
|
419 |
+
|
420 |
+
# check if the scheduler accepts generator
|
421 |
+
accepts_generator = "generator" in set(
|
422 |
+
inspect.signature(self.scheduler.step).parameters.keys())
|
423 |
+
if accepts_generator:
|
424 |
+
extra_step_kwargs["generator"] = generator
|
425 |
+
return extra_step_kwargs
|
426 |
+
|
427 |
+
def check_inputs(
|
428 |
+
self,
|
429 |
+
prompt,
|
430 |
+
image,
|
431 |
+
height,
|
432 |
+
width,
|
433 |
+
negative_prompt=None,
|
434 |
+
prompt_embeds=None,
|
435 |
+
negative_prompt_embeds=None,
|
436 |
+
):
|
437 |
+
if height % 8 != 0 or width % 8 != 0:
|
438 |
+
raise ValueError(
|
439 |
+
f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
440 |
+
|
441 |
+
if prompt is not None and prompt_embeds is not None:
|
442 |
+
raise ValueError(
|
443 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
444 |
+
" only forward one of the two."
|
445 |
+
)
|
446 |
+
elif prompt is None and prompt_embeds is None:
|
447 |
+
raise ValueError(
|
448 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
449 |
+
)
|
450 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
451 |
+
raise ValueError(
|
452 |
+
f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
453 |
+
|
454 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
455 |
+
raise ValueError(
|
456 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
457 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
458 |
+
)
|
459 |
+
|
460 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
461 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
462 |
+
raise ValueError(
|
463 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
464 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
465 |
+
f" {negative_prompt_embeds.shape}."
|
466 |
+
)
|
467 |
+
|
468 |
+
if (
|
469 |
+
not isinstance(image, torch.Tensor)
|
470 |
+
and not isinstance(image, PIL.Image.Image)
|
471 |
+
and not isinstance(image, list)
|
472 |
+
):
|
473 |
+
raise ValueError(
|
474 |
+
"`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
|
475 |
+
f" {type(image)}"
|
476 |
+
)
|
477 |
+
|
478 |
+
def prepare_image_latents(
|
479 |
+
self,
|
480 |
+
image,
|
481 |
+
device,
|
482 |
+
num_frames,
|
483 |
+
num_videos_per_prompt,
|
484 |
+
):
|
485 |
+
image = image.to(device=device)
|
486 |
+
image_latents = self.vae.encode(image).latent_dist.sample()
|
487 |
+
image_latents = image_latents * self.vae.config.scaling_factor
|
488 |
+
|
489 |
+
# Add frames dimension to image latents
|
490 |
+
image_latents = image_latents.unsqueeze(2)
|
491 |
+
|
492 |
+
# Append a position mask for each subsequent frame
|
493 |
+
# after the initial image latent frame
|
494 |
+
frame_position_mask = []
|
495 |
+
for frame_idx in range(num_frames - 1):
|
496 |
+
scale = (frame_idx + 1) / (num_frames - 1)
|
497 |
+
frame_position_mask.append(
|
498 |
+
torch.ones_like(image_latents[:, :, :1]) * scale)
|
499 |
+
if frame_position_mask:
|
500 |
+
frame_position_mask = torch.cat(frame_position_mask, dim=2)
|
501 |
+
image_latents = torch.cat(
|
502 |
+
[image_latents, frame_position_mask], dim=2)
|
503 |
+
|
504 |
+
# duplicate image_latents for each generation per prompt, using mps friendly method
|
505 |
+
image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1, 1)
|
506 |
+
|
507 |
+
if self.do_classifier_free_guidance:
|
508 |
+
image_latents = torch.cat([image_latents] * 2)
|
509 |
+
|
510 |
+
return image_latents
|
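The loop above appends one constant "position mask" frame per generated frame after the first, with values growing linearly from `1/(num_frames - 1)` to `1`, so the UNet can tell how far each frame sits from the conditioning image. A tiny numeric illustration:

# Scale values appended after the image latent, as computed in the loop above.
num_frames = 5
scales = [(frame_idx + 1) / (num_frames - 1) for frame_idx in range(num_frames - 1)]
# scales == [0.25, 0.5, 0.75, 1.0]; each value fills one latent-shaped mask frame.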
511 |
+
|
512 |
+
# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
|
513 |
+
def prepare_latents(
|
514 |
+
self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
|
515 |
+
):
|
516 |
+
shape = (
|
517 |
+
batch_size,
|
518 |
+
num_channels_latents,
|
519 |
+
num_frames,
|
520 |
+
height // self.vae_scale_factor,
|
521 |
+
width // self.vae_scale_factor,
|
522 |
+
)
|
523 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
524 |
+
raise ValueError(
|
525 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
526 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
527 |
+
)
|
528 |
+
|
529 |
+
if latents is None:
|
530 |
+
latents = randn_tensor(
|
531 |
+
shape, generator=generator, device=device, dtype=dtype)
|
532 |
+
else:
|
533 |
+
latents = latents.to(device)
|
534 |
+
|
535 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
536 |
+
latents = latents * self.scheduler.init_noise_sigma
|
537 |
+
return latents
|
538 |
+
|
539 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
|
540 |
+
def get_timesteps(self, num_inference_steps, strength, device):
|
541 |
+
# get the original timestep using init_timestep
|
542 |
+
init_timestep = min(
|
543 |
+
int(num_inference_steps * strength), num_inference_steps)
|
544 |
+
|
545 |
+
t_start = max(num_inference_steps - init_timestep, 0)
|
546 |
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
|
547 |
+
if hasattr(self.scheduler, "set_begin_index"):
|
548 |
+
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
549 |
+
|
550 |
+
return timesteps, num_inference_steps - t_start
|
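`get_timesteps` implements the partial-denoising ("modified SDEdit") schedule used here: the closer `strength` is to 1, the more of the schedule is run and the less of the input video survives. A worked example with the `num_inference_steps=30` passed by `i2v_enhance_process` and the default `strength=0.97`:

# Worked example of the timestep truncation (values as used by the enhancement code).
num_inference_steps, strength = 30, 0.97
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # = 29
t_start = max(num_inference_steps - init_timestep, 0)                          # = 1
# The first step is skipped and 29 denoising steps run, starting from the
# input-video latents noised to the first remaining timestep.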
551 |
+
|
552 |
+
# Similar to image, we need to prepare the latents for the video.
|
553 |
+
def prepare_video_latents(
|
554 |
+
self, video, timestep, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
|
555 |
+
):
|
556 |
+
video = video.to(device=device, dtype=dtype)
|
557 |
+
is_long = video.shape[2] > 16
|
558 |
+
|
559 |
+
# change from (b, c, f, h, w) -> (b * f, c, w, h)
|
560 |
+
bsz, channel, frames, width, height = video.shape
|
561 |
+
video = video.permute(0, 2, 1, 3, 4).reshape(
|
562 |
+
bsz * frames, channel, width, height)
|
563 |
+
|
564 |
+
if video.shape[1] == 4:
|
565 |
+
init_latents = video
|
566 |
+
else:
|
567 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
568 |
+
raise ValueError(
|
569 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
570 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
571 |
+
)
|
572 |
+
elif isinstance(generator, list):
|
573 |
+
init_latents = [
|
574 |
+
retrieve_latents(self.vae.encode(
|
575 |
+
video[i: i + 1]), generator=generator[i])
|
576 |
+
for i in range(batch_size)
|
577 |
+
]
|
578 |
+
init_latents = torch.cat(init_latents, dim=0)
|
579 |
+
else:
|
580 |
+
if not is_long:
|
581 |
+
# 1 step encoding
|
582 |
+
init_latents = retrieve_latents(
|
583 |
+
self.vae.encode(video), generator=generator)
|
584 |
+
else:
|
585 |
+
# chunk by chunk encoding. for low-memory consumption.
|
586 |
+
video_list = torch.chunk(
|
587 |
+
video, video.shape[0] // 16, dim=0)
|
588 |
+
with torch.no_grad():
|
589 |
+
init_latents = []
|
590 |
+
for video_chunk in video_list:
|
591 |
+
video_chunk = retrieve_latents(
|
592 |
+
self.vae.encode(video_chunk), generator=generator)
|
593 |
+
init_latents.append(video_chunk)
|
594 |
+
init_latents = torch.cat(init_latents, dim=0)
|
595 |
+
# torch.cuda.empty_cache()
|
596 |
+
|
597 |
+
init_latents = self.vae.config.scaling_factor * init_latents
|
598 |
+
|
599 |
+
if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
600 |
+
raise ValueError(
|
601 |
+
f"Cannot duplicate `video` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
602 |
+
)
|
603 |
+
else:
|
604 |
+
init_latents = torch.cat([init_latents], dim=0)
|
605 |
+
|
606 |
+
shape = init_latents.shape
|
607 |
+
noise = randn_tensor(shape, generator=generator,
|
608 |
+
device=device, dtype=dtype)
|
609 |
+
|
610 |
+
latents = self.scheduler.add_noise(init_latents, noise, timestep)
|
611 |
+
latents = latents[None, :].reshape(
|
612 |
+
(bsz, frames, latents.shape[1]) + latents.shape[2:]).permute(0, 2, 1, 3, 4)
|
613 |
+
|
614 |
+
return latents
|
615 |
+
|
616 |
+
@torch.no_grad()
|
617 |
+
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
618 |
+
def __call__(
|
619 |
+
self,
|
620 |
+
prompt: Union[str, List[str]] = None,
|
621 |
+
# Now image can be either a single image or a list of images (when randomized blending is enabled).
|
622 |
+
image: Union[List[PipelineImageInput], PipelineImageInput] = None,
|
623 |
+
video: Union[List[np.ndarray], torch.Tensor] = None,
|
624 |
+
strength: float = 0.97,
|
625 |
+
overlap_size: int = 0,
|
626 |
+
chunk_size: int = 38,
|
627 |
+
height: Optional[int] = 720,
|
628 |
+
width: Optional[int] = 1280,
|
629 |
+
target_fps: Optional[int] = 38,
|
630 |
+
num_frames: int = 38,
|
631 |
+
num_inference_steps: int = 50,
|
632 |
+
guidance_scale: float = 9.0,
|
633 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
634 |
+
eta: float = 0.0,
|
635 |
+
num_videos_per_prompt: Optional[int] = 1,
|
636 |
+
decode_chunk_size: Optional[int] = 1,
|
637 |
+
generator: Optional[Union[torch.Generator,
|
638 |
+
List[torch.Generator]]] = None,
|
639 |
+
latents: Optional[torch.Tensor] = None,
|
640 |
+
prompt_embeds: Optional[torch.Tensor] = None,
|
641 |
+
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
642 |
+
output_type: Optional[str] = "pil",
|
643 |
+
return_dict: bool = True,
|
644 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
645 |
+
clip_skip: Optional[int] = 1,
|
646 |
+
):
|
647 |
+
r"""
|
648 |
+
The call function to the pipeline for image-to-video generation with [`I2VGenXLPipeline`].
|
649 |
+
|
650 |
+
Args:
|
651 |
+
prompt (`str` or `List[str]`, *optional*):
|
652 |
+
The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
|
653 |
+
image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
|
654 |
+
Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
|
655 |
+
[`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
|
656 |
+
video (`List[np.ndarray]` or `torch.Tensor`):
|
657 |
+
Video to guide video enhancement.
|
658 |
+
strength (`float`, *optional*, defaults to 0.97):
|
659 |
+
Indicates extent to transform the reference `video`. Must be between 0 and 1. `image` is used as a
|
660 |
+
starting point and more noise is added the higher the `strength`. The number of denoising steps depends
|
661 |
+
on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
|
662 |
+
process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
|
663 |
+
essentially ignores `image`.
|
664 |
+
overlap_size (`int`, *optional*, defaults to 0):
|
665 |
+
This parameter is used in randomized blending, when it is enabled.
|
666 |
+
It defines the size of overlap between neighbouring chunks.
|
667 |
+
chunk_size (`int`, *optional*, defaults to 38):
|
668 |
+
This parameter is used in randomized blending, when it is enabled.
|
669 |
+
It defines the number of frames we will enhance during each chunk of randomized blending.
|
670 |
+
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
671 |
+
The height in pixels of the generated image.
|
672 |
+
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
673 |
+
The width in pixels of the generated image.
|
674 |
+
target_fps (`int`, *optional*):
|
675 |
+
Frames per second. The rate at which the generated images shall be exported to a video after
|
676 |
+
generation. This is also used as a "micro-condition" while generation.
|
677 |
+
num_frames (`int`, *optional*):
|
678 |
+
The number of video frames to generate.
|
679 |
+
num_inference_steps (`int`, *optional*):
|
680 |
+
The number of denoising steps.
|
681 |
+
guidance_scale (`float`, *optional*, defaults to 7.5):
|
682 |
+
A higher guidance scale value encourages the model to generate images closely linked to the text
|
683 |
+
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
|
684 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
685 |
+
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
|
686 |
+
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
|
687 |
+
eta (`float`, *optional*):
|
688 |
+
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
|
689 |
+
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
690 |
+
num_videos_per_prompt (`int`, *optional*):
|
691 |
+
The number of images to generate per prompt.
|
692 |
+
decode_chunk_size (`int`, *optional*):
|
693 |
+
The number of frames to decode at a time. The higher the chunk size, the higher the temporal
|
694 |
+
consistency between frames, but also the higher the memory consumption. By default, the decoder will
|
695 |
+
decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
|
696 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
697 |
+
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
698 |
+
generation deterministic.
|
699 |
+
latents (`torch.Tensor`, *optional*):
|
700 |
+
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
|
701 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
702 |
+
tensor is generated by sampling using the supplied random `generator`.
|
703 |
+
prompt_embeds (`torch.Tensor`, *optional*):
|
704 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
|
705 |
+
provided, text embeddings are generated from the `prompt` input argument.
|
706 |
+
negative_prompt_embeds (`torch.Tensor`, *optional*):
|
707 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
|
708 |
+
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
709 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
710 |
+
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
711 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
712 |
+
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
713 |
+
plain tuple.
|
714 |
+
cross_attention_kwargs (`dict`, *optional*):
|
715 |
+
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
716 |
+
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
717 |
+
clip_skip (`int`, *optional*):
|
718 |
+
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
719 |
+
the output of the pre-final layer will be used for computing the prompt embeddings.
|
720 |
+
|
721 |
+
Examples:
|
722 |
+
|
723 |
+
Returns:
|
724 |
+
[`pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput`] or `tuple`:
|
725 |
+
If `return_dict` is `True`, [`pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput`] is
|
726 |
+
returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
|
727 |
+
"""
|
728 |
+
# 0. Default height and width to unet
|
729 |
+
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
730 |
+
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
731 |
+
|
732 |
+
# 1. Check inputs. Raise error if not correct
|
733 |
+
self.check_inputs(prompt, image, height, width,
|
734 |
+
negative_prompt, prompt_embeds, negative_prompt_embeds)
|
735 |
+
|
736 |
+
# 2. Define call parameters
|
737 |
+
if prompt is not None and isinstance(prompt, str):
|
738 |
+
batch_size = 1
|
739 |
+
elif prompt is not None and isinstance(prompt, list):
|
740 |
+
batch_size = len(prompt)
|
741 |
+
else:
|
742 |
+
batch_size = prompt_embeds.shape[0]
|
743 |
+
|
744 |
+
device = self._execution_device
|
745 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
746 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
747 |
+
# corresponds to doing no classifier free guidance.
|
748 |
+
self._guidance_scale = guidance_scale
|
749 |
+
|
750 |
+
# 3.1 Encode input text prompt
|
751 |
+
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
752 |
+
prompt,
|
753 |
+
device,
|
754 |
+
num_videos_per_prompt,
|
755 |
+
negative_prompt,
|
756 |
+
prompt_embeds=prompt_embeds,
|
757 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
758 |
+
clip_skip=clip_skip,
|
759 |
+
)
|
760 |
+
# For classifier free guidance, we need to do two forward passes.
|
761 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
762 |
+
# to avoid doing two forward passes
|
763 |
+
if self.do_classifier_free_guidance:
|
764 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
765 |
+
|
766 |
+
# 3.2 Encode image prompt
|
767 |
+
# 3.2.1 Image encodings.
|
768 |
+
# https://github.com/ali-vilab/i2vgen-xl/blob/2539c9262ff8a2a22fa9daecbfd13f0a2dbc32d0/tools/inferences/inference_i2vgen_entrance.py#L114
|
769 |
+
# Since we can now have a list of images (when randomized blending is used), we encode each image separately as before.
|
770 |
+
image_embeddings_list = []
|
771 |
+
for img in image:
|
772 |
+
cropped_image = _center_crop_wide(img, (width, width))
|
773 |
+
cropped_image = _resize_bilinear(
|
774 |
+
cropped_image, (self.feature_extractor.crop_size["width"],
|
775 |
+
self.feature_extractor.crop_size["height"])
|
776 |
+
)
|
777 |
+
image_embeddings = self._encode_image(
|
778 |
+
cropped_image, device, num_videos_per_prompt)
|
779 |
+
image_embeddings_list.append(image_embeddings)
|
780 |
+
|
781 |
+
# 3.2.2 Image latents.
|
782 |
+
# Since we can now have a list of images (when randomized blending is used), we encode each image separately as before.
|
783 |
+
image_latents_list = []
|
784 |
+
for img in image:
|
785 |
+
resized_image = _center_crop_wide(img, (width, height))
|
786 |
+
img = self.video_processor.preprocess(resized_image).to(
|
787 |
+
device=device, dtype=image_embeddings_list[0].dtype)
|
788 |
+
image_latents = self.prepare_image_latents(
|
789 |
+
img,
|
790 |
+
device=device,
|
791 |
+
num_frames=num_frames,
|
792 |
+
num_videos_per_prompt=num_videos_per_prompt,
|
793 |
+
)
|
794 |
+
image_latents_list.append(image_latents)
|
795 |
+
|
796 |
+
# 3.3 Prepare additional conditions for the UNet.
|
797 |
+
if self.do_classifier_free_guidance:
|
798 |
+
fps_tensor = torch.tensor([target_fps, target_fps]).to(device)
|
799 |
+
else:
|
800 |
+
fps_tensor = torch.tensor([target_fps]).to(device)
|
801 |
+
fps_tensor = fps_tensor.repeat(
|
802 |
+
batch_size * num_videos_per_prompt, 1).ravel()
|
803 |
+
|
804 |
+
# 3.4 Preprocess video, similar to images.
|
805 |
+
video = self.video_processor.preprocess_video(video).to(
|
806 |
+
device=device, dtype=image_embeddings_list[0].dtype)
|
807 |
+
num_images_per_prompt = 1
|
808 |
+
|
809 |
+
# 4. Prepare timesteps. This will be used for modified SDEdit approach.
|
810 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
811 |
+
timesteps, num_inference_steps = self.get_timesteps(
|
812 |
+
num_inference_steps, strength, device)
|
813 |
+
latent_timestep = timesteps[:1].repeat(
|
814 |
+
batch_size * num_images_per_prompt)
|
815 |
+
|
816 |
+
# 5. Prepare latent variables. Now we get latents for input video.
|
817 |
+
num_channels_latents = self.unet.config.in_channels
|
818 |
+
latents = self.prepare_video_latents(
|
819 |
+
video,
|
820 |
+
latent_timestep,
|
821 |
+
batch_size * num_videos_per_prompt,
|
822 |
+
num_channels_latents,
|
823 |
+
num_frames,
|
824 |
+
height,
|
825 |
+
width,
|
826 |
+
prompt_embeds.dtype,
|
827 |
+
device,
|
828 |
+
generator,
|
829 |
+
latents,
|
830 |
+
)
|
831 |
+
|
832 |
+
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
833 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
834 |
+
|
835 |
+
# 7. Denoising loop
|
836 |
+
num_warmup_steps = len(timesteps) - \
|
837 |
+
num_inference_steps * self.scheduler.order
|
838 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
839 |
+
for i, t in enumerate(timesteps):
|
840 |
+
latents_denoised = torch.empty_like(latents)
|
841 |
+
|
842 |
+
CHUNK_START = 0
|
843 |
+
# Each chunk must have a corresponding 1st frame
|
844 |
+
for idx in range(len(image_latents_list)):
|
845 |
+
latents_chunk = latents[:, :,
|
846 |
+
CHUNK_START:CHUNK_START + chunk_size]
|
847 |
+
|
848 |
+
# expand the latents if we are doing classifier free guidance
|
849 |
+
latent_model_input = torch.cat(
|
850 |
+
[latents_chunk] * 2) if self.do_classifier_free_guidance else latents_chunk
|
851 |
+
latent_model_input = self.scheduler.scale_model_input(
|
852 |
+
latent_model_input, t)
|
853 |
+
|
854 |
+
# predict the noise residual
|
855 |
+
noise_pred = self.unet(
|
856 |
+
latent_model_input,
|
857 |
+
t,
|
858 |
+
encoder_hidden_states=prompt_embeds,
|
859 |
+
fps=fps_tensor,
|
860 |
+
image_latents=image_latents_list[idx],
|
861 |
+
image_embeddings=image_embeddings_list[idx],
|
862 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
863 |
+
return_dict=False,
|
864 |
+
)[0]
|
865 |
+
|
866 |
+
# perform guidance
|
867 |
+
if self.do_classifier_free_guidance:
|
868 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(
|
869 |
+
2)
|
870 |
+
noise_pred = noise_pred_uncond + guidance_scale * \
|
871 |
+
(noise_pred_text - noise_pred_uncond)
|
872 |
+
|
873 |
+
# reshape latents_chunk
|
874 |
+
batch_size, channel, frames, width, height = latents_chunk.shape
|
875 |
+
latents_chunk = latents_chunk.permute(0, 2, 1, 3, 4).reshape(
|
876 |
+
batch_size * frames, channel, width, height)
|
877 |
+
noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(
|
878 |
+
batch_size * frames, channel, width, height)
|
879 |
+
|
880 |
+
# compute the previous noisy sample x_t -> x_t-1
|
881 |
+
latents_chunk = self.scheduler.step(
|
882 |
+
noise_pred, t, latents_chunk, **extra_step_kwargs).prev_sample
|
883 |
+
|
884 |
+
# reshape latents back
|
885 |
+
latents_chunk = latents_chunk[None, :].reshape(
|
886 |
+
batch_size, frames, channel, width, height).permute(0, 2, 1, 3, 4)
|
887 |
+
|
888 |
+
# Make sure random_offset is set correctly.
|
889 |
+
if CHUNK_START == 0:
|
890 |
+
random_offset = 0
|
891 |
+
else:
|
892 |
+
if overlap_size != 0:
|
893 |
+
random_offset = random.randint(0, overlap_size - 1)
|
894 |
+
else:
|
895 |
+
random_offset = 0
|
896 |
+
|
897 |
+
# Apply Randomized Blending.
|
898 |
+
latents_denoised[:, :, CHUNK_START + random_offset:CHUNK_START +
|
899 |
+
chunk_size] = latents_chunk[:, :, random_offset:]
|
900 |
+
CHUNK_START += chunk_size - overlap_size
|
901 |
+
|
902 |
+
latents = latents_denoised
|
903 |
+
|
904 |
+
if CHUNK_START + overlap_size > latents_denoised.shape[2]:
|
905 |
+
raise NotImplementedError(f"Video of size={latents_denoised.shape[2]} is not dividable into chunks "
|
906 |
+
f"with size={chunk_size} and overlap={overlap_size}")
|
907 |
+
|
908 |
+
# call the callback, if provided
|
909 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
910 |
+
progress_bar.update()
|
911 |
+
|
912 |
+
# 8. Post processing
|
913 |
+
if output_type == "latent":
|
914 |
+
video = latents
|
915 |
+
else:
|
916 |
+
video_tensor = self.decode_latents(
|
917 |
+
latents, decode_chunk_size=decode_chunk_size)
|
918 |
+
video = self.video_processor.postprocess_video(
|
919 |
+
video=video_tensor, output_type=output_type)
|
920 |
+
|
921 |
+
# 9. Offload all models
|
922 |
+
self.maybe_free_model_hooks()
|
923 |
+
|
924 |
+
if not return_dict:
|
925 |
+
return (video,)
|
926 |
+
|
927 |
+
return I2VGenXLPipelineOutput(frames=video)
|
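The chunk loop in the denoising step above is what realizes randomized blending: every chunk after the first is written back starting at a random offset inside the overlap region, so the seam between neighbouring chunks lands on a different frame at every denoising step and is averaged away over the trajectory. A small sketch of how the written frame ranges move (`overlap_size=10` is an assumed example value):

import random

# Which latent frames each chunk overwrites at one denoising step
# (mirrors the CHUNK_START / random_offset logic in the loop above).
chunk_size, overlap_size, num_chunks = 38, 10, 3
chunk_start = 0
for idx in range(num_chunks):
    offset = 0 if (chunk_start == 0 or overlap_size == 0) else random.randint(0, overlap_size - 1)
    print(f"chunk {idx}: writes frames {chunk_start + offset} .. {chunk_start + chunk_size - 1}")
    chunk_start += chunk_size - overlap_size
# Re-running this at the next denoising step draws new offsets, so the seams shift every step.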
928 |
+
|
929 |
+
|
930 |
+
# The following utilities are taken and adapted from
|
931 |
+
# https://github.com/ali-vilab/i2vgen-xl/blob/main/utils/transforms.py.
|
932 |
+
|
933 |
+
|
934 |
+
def _convert_pt_to_pil(image: Union[torch.Tensor, List[torch.Tensor]]):
|
935 |
+
if isinstance(image, list) and isinstance(image[0], torch.Tensor):
|
936 |
+
image = torch.cat(image, 0)
|
937 |
+
|
938 |
+
if isinstance(image, torch.Tensor):
|
939 |
+
if image.ndim == 3:
|
940 |
+
image = image.unsqueeze(0)
|
941 |
+
|
942 |
+
image_numpy = VaeImageProcessor.pt_to_numpy(image)
|
943 |
+
image_pil = VaeImageProcessor.numpy_to_pil(image_numpy)
|
944 |
+
image = image_pil
|
945 |
+
|
946 |
+
return image
|
947 |
+
|
948 |
+
|
949 |
+
def _resize_bilinear(
|
950 |
+
image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], resolution: Tuple[int, int]
|
951 |
+
):
|
952 |
+
# First convert the images to PIL in case they are float tensors (only relevant for tests now).
|
953 |
+
image = _convert_pt_to_pil(image)
|
954 |
+
|
955 |
+
if isinstance(image, list):
|
956 |
+
image = [u.resize(resolution, PIL.Image.BILINEAR) for u in image]
|
957 |
+
else:
|
958 |
+
image = image.resize(resolution, PIL.Image.BILINEAR)
|
959 |
+
return image
|
960 |
+
|
961 |
+
|
962 |
+
def _center_crop_wide(
|
963 |
+
image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], resolution: Tuple[int, int]
|
964 |
+
):
|
965 |
+
# First convert the images to PIL in case they are float tensors (only relevant for tests now).
|
966 |
+
image = _convert_pt_to_pil(image)
|
967 |
+
|
968 |
+
if isinstance(image, list):
|
969 |
+
scale = min(image[0].size[0] / resolution[0],
|
970 |
+
image[0].size[1] / resolution[1])
|
971 |
+
image = [u.resize((round(u.width // scale), round(u.height //
|
972 |
+
scale)), resample=PIL.Image.BOX) for u in image]
|
973 |
+
|
974 |
+
# center crop
|
975 |
+
x1 = (image[0].width - resolution[0]) // 2
|
976 |
+
y1 = (image[0].height - resolution[1]) // 2
|
977 |
+
image = [u.crop((x1, y1, x1 + resolution[0], y1 + resolution[1]))
|
978 |
+
for u in image]
|
979 |
+
return image
|
980 |
+
else:
|
981 |
+
scale = min(image.size[0] / resolution[0],
|
982 |
+
image.size[1] / resolution[1])
|
983 |
+
image = image.resize((round(image.width // scale),
|
984 |
+
round(image.height // scale)), resample=PIL.Image.BOX)
|
985 |
+
x1 = (image.width - resolution[0]) // 2
|
986 |
+
y1 = (image.height - resolution[1]) // 2
|
987 |
+
image = image.crop((x1, y1, x1 + resolution[0], y1 + resolution[1]))
|
988 |
+
return image
|
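
The randomized blending above stitches overlapping latent chunks by copying each freshly denoised chunk from a randomly chosen offset inside the overlap region, so chunk seams land at a different frame on every denoising step. A minimal sketch of that index arithmetic on a toy 1-D "latent" (not part of the commit; the names chunk_size, overlap_size and CHUNK_START mirror the loop above, everything else is illustrative):

import random

frames = list(range(16))        # stand-in for latents_denoised along the frame axis
chunk_size, overlap_size = 8, 4

CHUNK_START = 0
while CHUNK_START + chunk_size <= len(frames):
    # "denoised" version of the current chunk (toy: just add 100)
    latents_chunk = [f + 100 for f in frames[CHUNK_START:CHUNK_START + chunk_size]]
    # first chunk keeps its left edge; later chunks start at a random offset inside the overlap
    offset = 0 if (CHUNK_START == 0 or overlap_size == 0) else random.randint(0, overlap_size - 1)
    frames[CHUNK_START + offset:CHUNK_START + chunk_size] = latents_chunk[offset:]
    CHUNK_START += chunk_size - overlap_size
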
i2v_enhance/thirdparty/VFI/Trainer.py
ADDED
@@ -0,0 +1,168 @@
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/Trainer.py
import torch
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import AdamW
from i2v_enhance.thirdparty.VFI.model.loss import *
from i2v_enhance.thirdparty.VFI.config import *


class Model:
    def __init__(self, local_rank):
        backbonetype, multiscaletype = MODEL_CONFIG['MODEL_TYPE']
        backbonecfg, multiscalecfg = MODEL_CONFIG['MODEL_ARCH']
        self.net = multiscaletype(backbonetype(**backbonecfg), **multiscalecfg)
        self.name = MODEL_CONFIG['LOGNAME']
        self.device()

        # train
        self.optimG = AdamW(self.net.parameters(), lr=2e-4, weight_decay=1e-4)
        self.lap = LapLoss()
        if local_rank != -1:
            self.net = DDP(self.net, device_ids=[local_rank], output_device=local_rank)

    def train(self):
        self.net.train()

    def eval(self):
        self.net.eval()

    def device(self):
        self.net.to(torch.device("cuda"))

    def unload(self):
        self.net.to(torch.device("cpu"))

    def load_model(self, name=None, rank=0):
        def convert(param):
            return {
                k.replace("module.", ""): v
                for k, v in param.items()
                if "module." in k and 'attn_mask' not in k and 'HW' not in k
            }
        if rank <= 0:
            if name is None:
                name = self.name
            # self.net.load_state_dict(convert(torch.load(f'ckpt/{name}.pkl')))
            self.net.load_state_dict(convert(torch.load(f'{name}')))

    def save_model(self, rank=0):
        if rank == 0:
            torch.save(self.net.state_dict(), f'ckpt/{self.name}.pkl')

    @torch.no_grad()
    def hr_inference(self, img0, img1, TTA=False, down_scale=1.0, timestep=0.5, fast_TTA=False):
        '''
        Infer with down_scale flow
        Noting: return BxCxHxW
        '''
        def infer(imgs):
            img0, img1 = imgs[:, :3], imgs[:, 3:6]
            imgs_down = F.interpolate(imgs, scale_factor=down_scale, mode="bilinear", align_corners=False)

            flow, mask = self.net.calculate_flow(imgs_down, timestep)

            flow = F.interpolate(flow, scale_factor=1/down_scale, mode="bilinear", align_corners=False) * (1/down_scale)
            mask = F.interpolate(mask, scale_factor=1/down_scale, mode="bilinear", align_corners=False)

            af, _ = self.net.feature_bone(img0, img1)
            pred = self.net.coraseWarp_and_Refine(imgs, af, flow, mask)
            return pred

        imgs = torch.cat((img0, img1), 1)
        if fast_TTA:
            imgs_ = imgs.flip(2).flip(3)
            input = torch.cat((imgs, imgs_), 0)
            preds = infer(input)
            return (preds[0] + preds[1].flip(1).flip(2)).unsqueeze(0) / 2.

        if TTA == False:
            return infer(imgs)
        else:
            return (infer(imgs) + infer(imgs.flip(2).flip(3)).flip(2).flip(3)) / 2

    @torch.no_grad()
    def inference(self, img0, img1, TTA=False, timestep=0.5, fast_TTA=False):
        imgs = torch.cat((img0, img1), 1)
        '''
        Noting: return BxCxHxW
        '''
        if fast_TTA:
            imgs_ = imgs.flip(2).flip(3)
            input = torch.cat((imgs, imgs_), 0)
            _, _, _, preds = self.net(input, timestep=timestep)
            return (preds[0] + preds[1].flip(1).flip(2)).unsqueeze(0) / 2.

        _, _, _, pred = self.net(imgs, timestep=timestep)
        if TTA == False:
            return pred
        else:
            _, _, _, pred2 = self.net(imgs.flip(2).flip(3), timestep=timestep)
            return (pred + pred2.flip(2).flip(3)) / 2

    @torch.no_grad()
    def multi_inference(self, img0, img1, TTA=False, down_scale=1.0, time_list=[], fast_TTA=False):
        '''
        Run backbone once, get multi frames at different timesteps
        Noting: return a list of [CxHxW]
        '''
        assert len(time_list) > 0, 'Time_list should not be empty!'
        def infer(imgs):
            img0, img1 = imgs[:, :3], imgs[:, 3:6]
            af, mf = self.net.feature_bone(img0, img1)
            imgs_down = None
            if down_scale != 1.0:
                imgs_down = F.interpolate(imgs, scale_factor=down_scale, mode="bilinear", align_corners=False)
                afd, mfd = self.net.feature_bone(imgs_down[:, :3], imgs_down[:, 3:6])

            pred_list = []
            for timestep in time_list:
                if imgs_down is None:
                    flow, mask = self.net.calculate_flow(imgs, timestep, af, mf)
                else:
                    flow, mask = self.net.calculate_flow(imgs_down, timestep, afd, mfd)
                    flow = F.interpolate(flow, scale_factor=1/down_scale, mode="bilinear", align_corners=False) * (1/down_scale)
                    mask = F.interpolate(mask, scale_factor=1/down_scale, mode="bilinear", align_corners=False)

                pred = self.net.coraseWarp_and_Refine(imgs, af, flow, mask)
                pred_list.append(pred)

            return pred_list

        imgs = torch.cat((img0, img1), 1)
        if fast_TTA:
            imgs_ = imgs.flip(2).flip(3)
            input = torch.cat((imgs, imgs_), 0)
            preds_lst = infer(input)
            return [(preds_lst[i][0] + preds_lst[i][1].flip(1).flip(2))/2 for i in range(len(time_list))]

        preds = infer(imgs)
        if TTA is False:
            return [preds[i][0] for i in range(len(time_list))]
        else:
            flip_pred = infer(imgs.flip(2).flip(3))
            return [(preds[i][0] + flip_pred[i][0].flip(1).flip(2))/2 for i in range(len(time_list))]

    def update(self, imgs, gt, learning_rate=0, training=True):
        for param_group in self.optimG.param_groups:
            param_group['lr'] = learning_rate
        if training:
            self.train()
        else:
            self.eval()

        if training:
            flow, mask, merged, pred = self.net(imgs)
            loss_l1 = (self.lap(pred, gt)).mean()

            for merge in merged:
                loss_l1 += (self.lap(merge, gt)).mean() * 0.5

            self.optimG.zero_grad()
            loss_l1.backward()
            self.optimG.step()
            return pred, loss_l1
        else:
            with torch.no_grad():
                flow, mask, merged, pred = self.net(imgs)
                return pred, 0
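
A minimal sketch of how this wrapper is typically driven for frame interpolation (not part of the commit; it needs a CUDA device, and the checkpoint path and the 1x3xHxW inputs in [0, 1] are assumptions). multi_inference runs the feature backbone once and decodes several intermediate timesteps:

import torch
from i2v_enhance.thirdparty.VFI.Trainer import Model

vfi = Model(local_rank=-1)                                   # -1: no DistributedDataParallel
vfi.load_model("i2v_enhance/thirdparty/VFI/ckpt/ours.pkl")   # assumed checkpoint location
vfi.eval()

img0 = torch.rand(1, 3, 512, 512, device="cuda")  # first frame, values in [0, 1]
img1 = torch.rand(1, 3, 512, 512, device="cuda")  # second frame
# three intermediate frames at t = 0.25, 0.5, 0.75; each entry is a CxHxW tensor
mids = vfi.multi_inference(img0, img1, TTA=False, down_scale=1.0,
                           time_list=[0.25, 0.5, 0.75], fast_TTA=False)
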
i2v_enhance/thirdparty/VFI/ckpt/Put ours.pkl files here.txt
ADDED
@@ -0,0 +1 @@
Here is the link to all EMA-VFI models: https://drive.google.com/drive/folders/16jUa3HkQ85Z5lb5gce1yoaWkP-rdCd0o
i2v_enhance/thirdparty/VFI/ckpt/__init__.py
ADDED
File without changes
i2v_enhance/thirdparty/VFI/config.py
ADDED
@@ -0,0 +1,49 @@
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/config.py
from functools import partial
import torch.nn as nn

from i2v_enhance.thirdparty.VFI.model import feature_extractor
from i2v_enhance.thirdparty.VFI.model import flow_estimation

'''==========Model config=========='''
def init_model_config(F=32, W=7, depth=[2, 2, 2, 4, 4]):
    '''This function should not be modified'''
    return {
        'embed_dims': [F, 2*F, 4*F, 8*F, 16*F],
        'motion_dims': [0, 0, 0, 8*F//depth[-2], 16*F//depth[-1]],
        'num_heads': [8*F//32, 16*F//32],
        'mlp_ratios': [4, 4],
        'qkv_bias': True,
        'norm_layer': partial(nn.LayerNorm, eps=1e-6),
        'depths': depth,
        'window_sizes': [W, W]
    }, {
        'embed_dims': [F, 2*F, 4*F, 8*F, 16*F],
        'motion_dims': [0, 0, 0, 8*F//depth[-2], 16*F//depth[-1]],
        'depths': depth,
        'num_heads': [8*F//32, 16*F//32],
        'window_sizes': [W, W],
        'scales': [4, 8, 16],
        'hidden_dims': [4*F, 4*F],
        'c': F
    }

MODEL_CONFIG = {
    'LOGNAME': 'ours',
    'MODEL_TYPE': (feature_extractor, flow_estimation),
    'MODEL_ARCH': init_model_config(
        F = 32,
        W = 7,
        depth = [2, 2, 2, 4, 4]
    )
}

# MODEL_CONFIG = {
#     'LOGNAME': 'ours_small',
#     'MODEL_TYPE': (feature_extractor, flow_estimation),
#     'MODEL_ARCH': init_model_config(
#         F = 16,
#         W = 7,
#         depth = [2, 2, 2, 2, 2]
#     )
# }
i2v_enhance/thirdparty/VFI/dataset.py
ADDED
@@ -0,0 +1,93 @@
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/dataset.py
import cv2
import os
import torch
import numpy as np
import random
from torch.utils.data import Dataset
from config import *

cv2.setNumThreads(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class VimeoDataset(Dataset):
    def __init__(self, dataset_name, path, batch_size=32, model="RIFE"):
        self.batch_size = batch_size
        self.dataset_name = dataset_name
        self.model = model
        self.h = 256
        self.w = 448
        self.data_root = path
        self.image_root = os.path.join(self.data_root, 'sequences')
        train_fn = os.path.join(self.data_root, 'tri_trainlist.txt')
        test_fn = os.path.join(self.data_root, 'tri_testlist.txt')
        with open(train_fn, 'r') as f:
            self.trainlist = f.read().splitlines()
        with open(test_fn, 'r') as f:
            self.testlist = f.read().splitlines()
        self.load_data()

    def __len__(self):
        return len(self.meta_data)

    def load_data(self):
        if self.dataset_name != 'test':
            self.meta_data = self.trainlist
        else:
            self.meta_data = self.testlist

    def aug(self, img0, gt, img1, h, w):
        ih, iw, _ = img0.shape
        x = np.random.randint(0, ih - h + 1)
        y = np.random.randint(0, iw - w + 1)
        img0 = img0[x:x+h, y:y+w, :]
        img1 = img1[x:x+h, y:y+w, :]
        gt = gt[x:x+h, y:y+w, :]
        return img0, gt, img1

    def getimg(self, index):
        imgpath = os.path.join(self.image_root, self.meta_data[index])
        imgpaths = [imgpath + '/im1.png', imgpath + '/im2.png', imgpath + '/im3.png']

        img0 = cv2.imread(imgpaths[0])
        gt = cv2.imread(imgpaths[1])
        img1 = cv2.imread(imgpaths[2])
        return img0, gt, img1

    def __getitem__(self, index):
        img0, gt, img1 = self.getimg(index)

        if 'train' in self.dataset_name:
            img0, gt, img1 = self.aug(img0, gt, img1, 256, 256)
            if random.uniform(0, 1) < 0.5:
                img0 = img0[:, :, ::-1]
                img1 = img1[:, :, ::-1]
                gt = gt[:, :, ::-1]
            if random.uniform(0, 1) < 0.5:
                img1, img0 = img0, img1
            if random.uniform(0, 1) < 0.5:
                img0 = img0[::-1]
                img1 = img1[::-1]
                gt = gt[::-1]
            if random.uniform(0, 1) < 0.5:
                img0 = img0[:, ::-1]
                img1 = img1[:, ::-1]
                gt = gt[:, ::-1]

            p = random.uniform(0, 1)
            if p < 0.25:
                img0 = cv2.rotate(img0, cv2.ROTATE_90_CLOCKWISE)
                gt = cv2.rotate(gt, cv2.ROTATE_90_CLOCKWISE)
                img1 = cv2.rotate(img1, cv2.ROTATE_90_CLOCKWISE)
            elif p < 0.5:
                img0 = cv2.rotate(img0, cv2.ROTATE_180)
                gt = cv2.rotate(gt, cv2.ROTATE_180)
                img1 = cv2.rotate(img1, cv2.ROTATE_180)
            elif p < 0.75:
                img0 = cv2.rotate(img0, cv2.ROTATE_90_COUNTERCLOCKWISE)
                gt = cv2.rotate(gt, cv2.ROTATE_90_COUNTERCLOCKWISE)
                img1 = cv2.rotate(img1, cv2.ROTATE_90_COUNTERCLOCKWISE)

        img0 = torch.from_numpy(img0.copy()).permute(2, 0, 1)
        img1 = torch.from_numpy(img1.copy()).permute(2, 0, 1)
        gt = torch.from_numpy(gt.copy()).permute(2, 0, 1)
        return torch.cat((img0, img1, gt), 0)
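
For reference, each sample is a single 9-channel uint8 tensor with img0, img1 and the ground-truth middle frame stacked along the channel axis. A minimal sketch of unpacking it (not part of the commit; the Vimeo90K path is a placeholder):

from torch.utils.data import DataLoader

dataset = VimeoDataset('train', '/path/to/vimeo_triplet')   # placeholder path
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

batch = next(iter(loader))                 # B x 9 x 256 x 256, uint8
imgs, gt = batch[:, 0:6] / 255., batch[:, 6:] / 255.
img0, img1 = imgs[:, :3], imgs[:, 3:6]     # the two input frames; gt is the middle frame
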
i2v_enhance/thirdparty/VFI/model/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .feature_extractor import feature_extractor
from .flow_estimation import MultiScaleFlow as flow_estimation


__all__ = ['feature_extractor', 'flow_estimation']
i2v_enhance/thirdparty/VFI/model/feature_extractor.py
ADDED
@@ -0,0 +1,516 @@
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/model/feature_extractor.py
import torch
import torch.nn as nn
import math
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

def window_partition(x, window_size):
    B, H, W, C = x.shape
    x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C)
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0]*window_size[1], C)
    )
    return windows


def window_reverse(windows, window_size, H, W):
    nwB, N, C = windows.shape
    windows = windows.view(-1, window_size[0], window_size[1], C)
    B = int(nwB / (H * W / window_size[0] / window_size[1]))
    x = windows.view(
        B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1
    )
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


def pad_if_needed(x, size, window_size):
    n, h, w, c = size
    pad_h = math.ceil(h / window_size[0]) * window_size[0] - h
    pad_w = math.ceil(w / window_size[1]) * window_size[1] - w
    if pad_h > 0 or pad_w > 0:  # center-pad the feature on H and W axes
        img_mask = torch.zeros((1, h+pad_h, w+pad_w, 1))  # 1 H W 1
        h_slices = (
            slice(0, pad_h//2),
            slice(pad_h//2, h+pad_h//2),
            slice(h+pad_h//2, None),
        )
        w_slices = (
            slice(0, pad_w//2),
            slice(pad_w//2, w+pad_w//2),
            slice(w+pad_w//2, None),
        )
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(
            img_mask, window_size
        )  # nW, window_size*window_size, 1
        mask_windows = mask_windows.squeeze(-1)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(
            attn_mask != 0, float(-100.0)
        ).masked_fill(attn_mask == 0, float(0.0))
        return nn.functional.pad(
            x,
            (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2),
        ), attn_mask
    return x, None


def depad_if_needed(x, size, window_size):
    n, h, w, c = size
    pad_h = math.ceil(h / window_size[0]) * window_size[0] - h
    pad_w = math.ceil(w / window_size[1]) * window_size[1] - w
    if pad_h > 0 or pad_w > 0:  # remove the center-padding on feature
        return x[:, pad_h // 2 : pad_h // 2 + h, pad_w // 2 : pad_w // 2 + w, :].contiguous()
    return x


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)
        self.relu = nn.ReLU(inplace=True)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = self.fc1(x)
        x = self.dwconv(x, H, W)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class InterFrameAttention(nn.Module):
    def __init__(self, dim, motion_dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.motion_dim = motion_dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.cor_embed = nn.Linear(2, motion_dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.motion_proj = nn.Linear(motion_dim, motion_dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x1, x2, cor, H, W, mask=None):
        B, N, C = x1.shape
        B, N, C_c = cor.shape
        q = self.q(x1).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        kv = self.kv(x2).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        cor_embed_ = self.cor_embed(cor)
        cor_embed = cor_embed_.reshape(B, N, self.num_heads, self.motion_dim // self.num_heads).permute(0, 2, 1, 3)
        k, v = kv[0], kv[1]
        attn = (q @ k.transpose(-2, -1)) * self.scale

        if mask is not None:
            nW = mask.shape[0]  # mask: nW, N, N
            attn = attn.view(B // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
                1
            ).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = attn.softmax(dim=-1)
        else:
            attn = attn.softmax(dim=-1)

        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        c_reverse = (attn @ cor_embed).transpose(1, 2).reshape(B, N, -1)
        motion = self.motion_proj(c_reverse-cor_embed_)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, motion


class MotionFormerBlock(nn.Module):
    def __init__(self, dim, motion_dim, num_heads, window_size=0, shift_size=0, mlp_ratio=4., bidirectional=True, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,):
        super().__init__()
        self.window_size = window_size
        if not isinstance(self.window_size, (tuple, list)):
            self.window_size = to_2tuple(window_size)
        self.shift_size = shift_size
        if not isinstance(self.shift_size, (tuple, list)):
            self.shift_size = to_2tuple(shift_size)
        self.bidirectional = bidirectional
        self.norm1 = norm_layer(dim)
        self.attn = InterFrameAttention(
            dim,
            motion_dim,
            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, cor, H, W, B):
        x = x.view(2*B, H, W, -1)
        x_pad, mask = pad_if_needed(x, x.size(), self.window_size)
        cor_pad, _ = pad_if_needed(cor, cor.size(), self.window_size)

        if self.shift_size[0] or self.shift_size[1]:
            _, H_p, W_p, C = x_pad.shape
            x_pad = torch.roll(x_pad, shifts=(-self.shift_size[0], -self.shift_size[1]), dims=(1, 2))
            cor_pad = torch.roll(cor_pad, shifts=(-self.shift_size[0], -self.shift_size[1]), dims=(1, 2))

            if hasattr(self, 'HW') and self.HW.item() == H_p * W_p:
                shift_mask = self.attn_mask
            else:
                shift_mask = torch.zeros((1, H_p, W_p, 1))  # 1 H W 1
                h_slices = (slice(0, -self.window_size[0]),
                            slice(-self.window_size[0], -self.shift_size[0]),
                            slice(-self.shift_size[0], None))
                w_slices = (slice(0, -self.window_size[1]),
                            slice(-self.window_size[1], -self.shift_size[1]),
                            slice(-self.shift_size[1], None))
                cnt = 0
                for h in h_slices:
                    for w in w_slices:
                        shift_mask[:, h, w, :] = cnt
                        cnt += 1

                mask_windows = window_partition(shift_mask, self.window_size).squeeze(-1)
                shift_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
                shift_mask = shift_mask.masked_fill(shift_mask != 0,
                                                    float(-100.0)).masked_fill(shift_mask == 0,
                                                                               float(0.0))

                if mask is not None:
                    shift_mask = shift_mask.masked_fill(mask != 0,
                                                        float(-100.0))
                self.register_buffer("attn_mask", shift_mask)
                self.register_buffer("HW", torch.Tensor([H_p*W_p]))
        else:
            shift_mask = mask

        if shift_mask is not None:
            shift_mask = shift_mask.to(x_pad.device)

        _, Hw, Ww, C = x_pad.shape
        x_win = window_partition(x_pad, self.window_size)
        cor_win = window_partition(cor_pad, self.window_size)

        nwB = x_win.shape[0]
        x_norm = self.norm1(x_win)

        x_reverse = torch.cat([x_norm[nwB//2:], x_norm[:nwB//2]])
        x_appearence, x_motion = self.attn(x_norm, x_reverse, cor_win, H, W, shift_mask)
        x_norm = x_norm + self.drop_path(x_appearence)

        x_back = x_norm
        x_back_win = window_reverse(x_back, self.window_size, Hw, Ww)
        x_motion = window_reverse(x_motion, self.window_size, Hw, Ww)

        if self.shift_size[0] or self.shift_size[1]:
            x_back_win = torch.roll(x_back_win, shifts=(self.shift_size[0], self.shift_size[1]), dims=(1, 2))
            x_motion = torch.roll(x_motion, shifts=(self.shift_size[0], self.shift_size[1]), dims=(1, 2))

        x = depad_if_needed(x_back_win, x.size(), self.window_size).view(2*B, H * W, -1)
        x_motion = depad_if_needed(x_motion, cor.size(), self.window_size).view(2*B, H * W, -1)

        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
        return x, x_motion


class ConvBlock(nn.Module):
    def __init__(self, in_dim, out_dim, depths=2, act_layer=nn.PReLU):
        super().__init__()
        layers = []
        for i in range(depths):
            if i == 0:
                layers.append(nn.Conv2d(in_dim, out_dim, 3, 1, 1))
            else:
                layers.append(nn.Conv2d(out_dim, out_dim, 3, 1, 1))
            layers.extend([
                act_layer(out_dim),
            ])
        self.conv = nn.Sequential(*layers)

    def _init_weights(self, m):
        if isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.conv(x)
        return x


class OverlapPatchEmbed(nn.Module):
    def __init__(self, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()
        patch_size = to_2tuple(patch_size)

        self.patch_size = patch_size
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
                              padding=(patch_size[0] // 2, patch_size[1] // 2))
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, H, W


class CrossScalePatchEmbed(nn.Module):
    def __init__(self, in_dims=[16, 32, 64], embed_dim=768):
        super().__init__()
        base_dim = in_dims[0]

        layers = []
        for i in range(len(in_dims)):
            for j in range(2 ** i):
                layers.append(nn.Conv2d(in_dims[-1-i], base_dim, 3, 2**(i+1), 1+j, 1+j))
        self.layers = nn.ModuleList(layers)
        self.proj = nn.Conv2d(base_dim * len(layers), embed_dim, 1, 1)
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, xs):
        ys = []
        k = 0
        for i in range(len(xs)):
            for _ in range(2 ** i):
                ys.append(self.layers[k](xs[-1-i]))
                k += 1
        x = self.proj(torch.cat(ys, 1))
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, H, W


class MotionFormer(nn.Module):
    def __init__(self, in_chans=3, embed_dims=[32, 64, 128, 256, 512], motion_dims=64, num_heads=[8, 16],
                 mlp_ratios=[4, 4], qkv_bias=True, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[2, 2, 2, 6, 2], window_sizes=[11, 11], **kwarg):
        super().__init__()
        self.depths = depths
        self.num_stages = len(embed_dims)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        cur = 0

        self.conv_stages = self.num_stages - len(num_heads)

        for i in range(self.num_stages):
            if i == 0:
                block = ConvBlock(in_chans, embed_dims[i], depths[i])
            else:
                if i < self.conv_stages:
                    patch_embed = nn.Sequential(
                        nn.Conv2d(embed_dims[i-1], embed_dims[i], 3, 2, 1),
                        nn.PReLU(embed_dims[i])
                    )
                    block = ConvBlock(embed_dims[i], embed_dims[i], depths[i])
                else:
                    if i == self.conv_stages:
                        patch_embed = CrossScalePatchEmbed(embed_dims[:i],
                                                           embed_dim=embed_dims[i])
                    else:
                        patch_embed = OverlapPatchEmbed(patch_size=3,
                                                        stride=2,
                                                        in_chans=embed_dims[i - 1],
                                                        embed_dim=embed_dims[i])

                    block = nn.ModuleList([MotionFormerBlock(
                        dim=embed_dims[i], motion_dim=motion_dims[i], num_heads=num_heads[i-self.conv_stages], window_size=window_sizes[i-self.conv_stages],
                        shift_size=0 if (j % 2) == 0 else window_sizes[i-self.conv_stages] // 2,
                        mlp_ratio=mlp_ratios[i-self.conv_stages], qkv_bias=qkv_bias, qk_scale=qk_scale,
                        drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer)
                        for j in range(depths[i])])

                    norm = norm_layer(embed_dims[i])
                    setattr(self, f"norm{i + 1}", norm)
                setattr(self, f"patch_embed{i + 1}", patch_embed)
            cur += depths[i]

            setattr(self, f"block{i + 1}", block)

        self.cor = {}

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def get_cor(self, shape, device):
        k = (str(shape), str(device))
        if k not in self.cor:
            tenHorizontal = torch.linspace(-1.0, 1.0, shape[2], device=device).view(
                1, 1, 1, shape[2]).expand(shape[0], -1, shape[1], -1).permute(0, 2, 3, 1)
            tenVertical = torch.linspace(-1.0, 1.0, shape[1], device=device).view(
                1, 1, shape[1], 1).expand(shape[0], -1, -1, shape[2]).permute(0, 2, 3, 1)
            self.cor[k] = torch.cat([tenHorizontal, tenVertical], -1).to(device)
        return self.cor[k]

    def forward(self, x1, x2):
        B = x1.shape[0]
        x = torch.cat([x1, x2], 0)
        motion_features = []
        appearence_features = []
        xs = []
        for i in range(self.num_stages):
            motion_features.append([])
            patch_embed = getattr(self, f"patch_embed{i + 1}", None)
            block = getattr(self, f"block{i + 1}", None)
            norm = getattr(self, f"norm{i + 1}", None)
            if i < self.conv_stages:
                if i > 0:
                    x = patch_embed(x)
                x = block(x)
                xs.append(x)
            else:
                if i == self.conv_stages:
                    x, H, W = patch_embed(xs)
                else:
                    x, H, W = patch_embed(x)
                cor = self.get_cor((x.shape[0], H, W), x.device)
                for blk in block:
                    x, x_motion = blk(x, cor, H, W, B)
                    motion_features[i].append(x_motion.reshape(2*B, H, W, -1).permute(0, 3, 1, 2).contiguous())
                x = norm(x)
                x = x.reshape(2*B, H, W, -1).permute(0, 3, 1, 2).contiguous()
                motion_features[i] = torch.cat(motion_features[i], 1)
            appearence_features.append(x)
        return appearence_features, motion_features


class DWConv(nn.Module):
    def __init__(self, dim):
        super(DWConv, self).__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.transpose(1, 2).reshape(B, C, H, W)
        x = self.dwconv(x)
        x = x.reshape(B, C, -1).transpose(1, 2)

        return x


def feature_extractor(**kargs):
    model = MotionFormer(**kargs)
    return model
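
A minimal sketch of what the backbone returns (not part of the commit; the CPU-sized random inputs are purely illustrative). Both frames are concatenated along the batch axis, so every returned feature map has batch size 2B, and motion features only exist for the two transformer stages:

import torch
from i2v_enhance.thirdparty.VFI.config import MODEL_CONFIG

backbone_cls, _ = MODEL_CONFIG['MODEL_TYPE']
backbone_cfg, _ = MODEL_CONFIG['MODEL_ARCH']
net = backbone_cls(**backbone_cfg).eval()

x1 = torch.rand(1, 3, 256, 256)   # frame 0
x2 = torch.rand(1, 3, 256, 256)   # frame 1
with torch.no_grad():
    af, mf = net(x1, x2)
# af: 5 appearance maps at 1/1, 1/2, 1/4, 1/8, 1/16 resolution, each with batch 2*B
# mf: empty lists for the 3 conv stages, concatenated motion tensors for the 2 MotionFormer stages
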
i2v_enhance/thirdparty/VFI/model/flow_estimation.py
ADDED
@@ -0,0 +1,141 @@
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/model/flow_estimation
import torch
import torch.nn as nn
import torch.nn.functional as F

from .warplayer import warp
from .refine import *

def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    return nn.Sequential(
        nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
                  padding=padding, dilation=dilation, bias=True),
        nn.PReLU(out_planes)
    )


class Head(nn.Module):
    def __init__(self, in_planes, scale, c, in_else=17):
        super(Head, self).__init__()
        self.upsample = nn.Sequential(nn.PixelShuffle(2), nn.PixelShuffle(2))
        self.scale = scale
        self.conv = nn.Sequential(
            conv(in_planes*2 // (4*4) + in_else, c),
            conv(c, c),
            conv(c, 5),
        )

    def forward(self, motion_feature, x, flow):  # /16 /8 /4
        motion_feature = self.upsample(motion_feature)  # /4 /2 /1
        if self.scale != 4:
            x = F.interpolate(x, scale_factor=4. / self.scale, mode="bilinear", align_corners=False)
        if flow != None:
            if self.scale != 4:
                flow = F.interpolate(flow, scale_factor=4. / self.scale, mode="bilinear", align_corners=False) * 4. / self.scale
            x = torch.cat((x, flow), 1)
        x = self.conv(torch.cat([motion_feature, x], 1))
        if self.scale != 4:
            x = F.interpolate(x, scale_factor=self.scale // 4, mode="bilinear", align_corners=False)
            flow = x[:, :4] * (self.scale // 4)
        else:
            flow = x[:, :4]
        mask = x[:, 4:5]
        return flow, mask


class MultiScaleFlow(nn.Module):
    def __init__(self, backbone, **kargs):
        super(MultiScaleFlow, self).__init__()
        self.flow_num_stage = len(kargs['hidden_dims'])
        self.feature_bone = backbone
        self.block = nn.ModuleList([Head(kargs['motion_dims'][-1-i] * kargs['depths'][-1-i] + kargs['embed_dims'][-1-i],
                                         kargs['scales'][-1-i],
                                         kargs['hidden_dims'][-1-i],
                                         6 if i == 0 else 17)
                                    for i in range(self.flow_num_stage)])
        self.unet = Unet(kargs['c'] * 2)

    def warp_features(self, xs, flow):
        y0 = []
        y1 = []
        B = xs[0].size(0) // 2
        for x in xs:
            y0.append(warp(x[:B], flow[:, 0:2]))
            y1.append(warp(x[B:], flow[:, 2:4]))
            flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False, recompute_scale_factor=False) * 0.5
        return y0, y1

    def calculate_flow(self, imgs, timestep, af=None, mf=None):
        img0, img1 = imgs[:, :3], imgs[:, 3:6]
        B = img0.size(0)
        flow, mask = None, None
        # appearence_features & motion_features
        if (af is None) or (mf is None):
            af, mf = self.feature_bone(img0, img1)
        for i in range(self.flow_num_stage):
            t = torch.full(mf[-1-i][:B].shape, timestep, dtype=torch.float).cuda()
            if flow != None:
                warped_img0 = warp(img0, flow[:, :2])
                warped_img1 = warp(img1, flow[:, 2:4])
                flow_, mask_ = self.block[i](
                    torch.cat([t*mf[-1-i][:B], (1-t)*mf[-1-i][B:], af[-1-i][:B], af[-1-i][B:]], 1),
                    torch.cat((img0, img1, warped_img0, warped_img1, mask), 1),
                    flow
                )
                flow = flow + flow_
                mask = mask + mask_
            else:
                flow, mask = self.block[i](
                    torch.cat([t*mf[-1-i][:B], (1-t)*mf[-1-i][B:], af[-1-i][:B], af[-1-i][B:]], 1),
                    torch.cat((img0, img1), 1),
                    None
                )

        return flow, mask

    def coraseWarp_and_Refine(self, imgs, af, flow, mask):
        img0, img1 = imgs[:, :3], imgs[:, 3:6]
        warped_img0 = warp(img0, flow[:, :2])
        warped_img1 = warp(img1, flow[:, 2:4])
        c0, c1 = self.warp_features(af, flow)
        tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1)
        res = tmp[:, :3] * 2 - 1
        mask_ = torch.sigmoid(mask)
        merged = warped_img0 * mask_ + warped_img1 * (1 - mask_)
        pred = torch.clamp(merged + res, 0, 1)
        return pred


    # Actually consist of 'calculate_flow' and 'coraseWarp_and_Refine'
    def forward(self, x, timestep=0.5):
        img0, img1 = x[:, :3], x[:, 3:6]
        B = x.size(0)
        flow_list = []
        merged = []
        mask_list = []
        warped_img0 = img0
        warped_img1 = img1
        flow = None
        # appearence_features & motion_features
        af, mf = self.feature_bone(img0, img1)
        for i in range(self.flow_num_stage):
            t = torch.full(mf[-1-i][:B].shape, timestep, dtype=torch.float).cuda()
            if flow != None:
                flow_d, mask_d = self.block[i](torch.cat([t*mf[-1-i][:B], (1-timestep)*mf[-1-i][B:], af[-1-i][:B], af[-1-i][B:]], 1),
                                               torch.cat((img0, img1, warped_img0, warped_img1, mask), 1), flow)
                flow = flow + flow_d
                mask = mask + mask_d
            else:
                flow, mask = self.block[i](torch.cat([t*mf[-1-i][:B], (1-t)*mf[-1-i][B:], af[-1-i][:B], af[-1-i][B:]], 1),
                                           torch.cat((img0, img1), 1), None)
            mask_list.append(torch.sigmoid(mask))
            flow_list.append(flow)
            warped_img0 = warp(img0, flow[:, :2])
            warped_img1 = warp(img1, flow[:, 2:4])
            merged.append(warped_img0 * mask_list[i] + warped_img1 * (1 - mask_list[i]))

        c0, c1 = self.warp_features(af, flow)
        tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1)
        res = tmp[:, :3] * 2 - 1
        pred = torch.clamp(merged[-1] + res, 0, 1)
        return flow_list, mask_list, merged, pred
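
A minimal sketch of assembling the full interpolation network from the config and running one forward pass (not part of the commit; it needs a CUDA device because the timestep tensor is created with .cuda(), and the 256x256 input size is an assumption):

import torch
from i2v_enhance.thirdparty.VFI.config import MODEL_CONFIG

backbone_cls, flow_cls = MODEL_CONFIG['MODEL_TYPE']
backbone_cfg, flow_cfg = MODEL_CONFIG['MODEL_ARCH']
net = flow_cls(backbone_cls(**backbone_cfg), **flow_cfg).cuda().eval()

x = torch.rand(1, 6, 256, 256, device="cuda")   # img0 and img1 stacked along channels
with torch.no_grad():
    flow_list, mask_list, merged, pred = net(x, timestep=0.5)
# pred: 1x3x256x256 estimate of the frame halfway between img0 and img1;
# flow_list/mask_list/merged hold the coarse-to-fine intermediate results
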
i2v_enhance/thirdparty/VFI/model/loss.py
ADDED
@@ -0,0 +1,95 @@
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/model/loss.py
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def gauss_kernel(channels=3):
    kernel = torch.tensor([[1., 4., 6., 4., 1],
                           [4., 16., 24., 16., 4.],
                           [6., 24., 36., 24., 6.],
                           [4., 16., 24., 16., 4.],
                           [1., 4., 6., 4., 1.]])
    kernel /= 256.
    kernel = kernel.repeat(channels, 1, 1, 1)
    kernel = kernel.to(device)
    return kernel

def downsample(x):
    return x[:, :, ::2, ::2]

def upsample(x):
    cc = torch.cat([x, torch.zeros(x.shape[0], x.shape[1], x.shape[2], x.shape[3]).to(device)], dim=3)
    cc = cc.view(x.shape[0], x.shape[1], x.shape[2]*2, x.shape[3])
    cc = cc.permute(0, 1, 3, 2)
    cc = torch.cat([cc, torch.zeros(x.shape[0], x.shape[1], x.shape[3], x.shape[2]*2).to(device)], dim=3)
    cc = cc.view(x.shape[0], x.shape[1], x.shape[3]*2, x.shape[2]*2)
    x_up = cc.permute(0, 1, 3, 2)
    return conv_gauss(x_up, 4*gauss_kernel(channels=x.shape[1]))

def conv_gauss(img, kernel):
    img = torch.nn.functional.pad(img, (2, 2, 2, 2), mode='reflect')
    out = torch.nn.functional.conv2d(img, kernel, groups=img.shape[1])
    return out

def laplacian_pyramid(img, kernel, max_levels=3):
    current = img
    pyr = []
    for level in range(max_levels):
        filtered = conv_gauss(current, kernel)
        down = downsample(filtered)
        up = upsample(down)
        diff = current-up
        pyr.append(diff)
        current = down
    return pyr

class LapLoss(torch.nn.Module):
    def __init__(self, max_levels=5, channels=3):
        super(LapLoss, self).__init__()
        self.max_levels = max_levels
        self.gauss_kernel = gauss_kernel(channels=channels)

    def forward(self, input, target):
        pyr_input = laplacian_pyramid(img=input, kernel=self.gauss_kernel, max_levels=self.max_levels)
        pyr_target = laplacian_pyramid(img=target, kernel=self.gauss_kernel, max_levels=self.max_levels)
        return sum(torch.nn.functional.l1_loss(a, b) for a, b in zip(pyr_input, pyr_target))

class Ternary(nn.Module):
    def __init__(self, device):
        super(Ternary, self).__init__()
        patch_size = 7
        out_channels = patch_size * patch_size
        self.w = np.eye(out_channels).reshape(
            (patch_size, patch_size, 1, out_channels))
        self.w = np.transpose(self.w, (3, 2, 0, 1))
        self.w = torch.tensor(self.w).float().to(device)

    def transform(self, img):
        patches = F.conv2d(img, self.w, padding=3, bias=None)
        transf = patches - img
        transf_norm = transf / torch.sqrt(0.81 + transf**2)
        return transf_norm

    def rgb2gray(self, rgb):
        r, g, b = rgb[:, 0:1, :, :], rgb[:, 1:2, :, :], rgb[:, 2:3, :, :]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return gray

    def hamming(self, t1, t2):
        dist = (t1 - t2) ** 2
        dist_norm = torch.mean(dist / (0.1 + dist), 1, True)
        return dist_norm

    def valid_mask(self, t, padding):
        n, _, h, w = t.size()
        inner = torch.ones(n, 1, h - 2 * padding, w - 2 * padding).type_as(t)
        mask = F.pad(inner, [padding] * 4)
        return mask

    def forward(self, img0, img1):
        img0 = self.transform(self.rgb2gray(img0))
        img1 = self.transform(self.rgb2gray(img1))
        return self.hamming(img0, img1) * self.valid_mask(img0, 1)
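
LapLoss above is an L1 loss summed over the levels of a Laplacian pyramid. A minimal sketch (not part of the commit; the tensors must live on the same module-level device as the Gaussian kernel):

import torch

criterion = LapLoss(max_levels=5, channels=3)
pred = torch.rand(2, 3, 128, 128, device=device)
gt = torch.rand(2, 3, 128, 128, device=device)
loss = criterion(pred, gt)   # scalar: sum of per-level L1 distances
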
i2v_enhance/thirdparty/VFI/model/refine.py
ADDED
@@ -0,0 +1,71 @@
import torch
import torch.nn as nn
import math
from timm.models.layers import trunc_normal_

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    return nn.Sequential(
        nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
                  padding=padding, dilation=dilation, bias=True),
        nn.PReLU(out_planes)
    )

def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
    return nn.Sequential(
        torch.nn.ConvTranspose2d(in_channels=in_planes, out_channels=out_planes, kernel_size=4, stride=2, padding=1, bias=True),
        nn.PReLU(out_planes)
    )

class Conv2(nn.Module):
    def __init__(self, in_planes, out_planes, stride=2):
        super(Conv2, self).__init__()
        self.conv1 = conv(in_planes, out_planes, 3, stride, 1)
        self.conv2 = conv(out_planes, out_planes, 3, 1, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return x

class Unet(nn.Module):
    def __init__(self, c, out=3):
        super(Unet, self).__init__()
        self.down0 = Conv2(17+c, 2*c)
        self.down1 = Conv2(4*c, 4*c)
        self.down2 = Conv2(8*c, 8*c)
        self.down3 = Conv2(16*c, 16*c)
        self.up0 = deconv(32*c, 8*c)
        self.up1 = deconv(16*c, 4*c)
        self.up2 = deconv(8*c, 2*c)
        self.up3 = deconv(4*c, c)
        self.conv = nn.Conv2d(c, out, 3, 1, 1)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, img0, img1, warped_img0, warped_img1, mask, flow, c0, c1):
        s0 = self.down0(torch.cat((img0, img1, warped_img0, warped_img1, mask, flow, c0[0], c1[0]), 1))
        s1 = self.down1(torch.cat((s0, c0[1], c1[1]), 1))
        s2 = self.down2(torch.cat((s1, c0[2], c1[2]), 1))
        s3 = self.down3(torch.cat((s2, c0[3], c1[3]), 1))
        x = self.up0(torch.cat((s3, c0[4], c1[4]), 1))
        x = self.up1(torch.cat((x, s2), 1))
        x = self.up2(torch.cat((x, s1), 1))
        x = self.up3(torch.cat((x, s0), 1))
        x = self.conv(x)
        return torch.sigmoid(x)
i2v_enhance/thirdparty/VFI/model/warplayer.py
ADDED
@@ -0,0 +1,21 @@
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/model/warplayer.py
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
backwarp_tenGrid = {}

def warp(tenInput, tenFlow):
    k = (str(tenFlow.device), str(tenFlow.size()))
    if k not in backwarp_tenGrid:
        tenHorizontal = torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=device).view(
            1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, tenFlow.shape[2], -1)
        tenVertical = torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=device).view(
            1, 1, tenFlow.shape[2], 1).expand(tenFlow.shape[0], -1, -1, tenFlow.shape[3])
        backwarp_tenGrid[k] = torch.cat(
            [tenHorizontal, tenVertical], 1).to(device)

    tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
                         tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0)], 1)

    g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1)
    return torch.nn.functional.grid_sample(input=tenInput, grid=g, mode='bilinear', padding_mode='border', align_corners=True)
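
warp backward-warps an image with a dense flow field by adding the normalized flow to a cached identity grid and bilinearly resampling. A minimal sanity check (not part of the commit; tensors are placed on the same module-level device as the cached grid):

import torch

img = torch.rand(1, 3, 64, 64, device=device)
zero_flow = torch.zeros(1, 2, 64, 64, device=device)   # (dx, dy) displacement per pixel
warped = warp(img, zero_flow)
# with zero flow the sampling grid is the identity, so warped matches img up to float error
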
i2v_enhance/thirdparty/VFI/train.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from https://github.com/MCG-NJU/EMA-VFI/blob/main/train.py
|
2 |
+
import os
|
3 |
+
import cv2
|
4 |
+
import math
|
5 |
+
import time
|
6 |
+
import torch
|
7 |
+
import torch.distributed as dist
|
8 |
+
import numpy as np
|
9 |
+
import random
|
10 |
+
import argparse
|
11 |
+
|
12 |
+
from Trainer import Model
|
13 |
+
from dataset import VimeoDataset
|
14 |
+
from torch.utils.data import DataLoader
|
15 |
+
from torch.utils.tensorboard import SummaryWriter
|
16 |
+
from torch.utils.data.distributed import DistributedSampler
|
17 |
+
from config import *
|
18 |
+
|
19 |
+
device = torch.device("cuda")
|
20 |
+
exp = os.path.abspath('.').split('/')[-1]
|
21 |
+
|
22 |
+
def get_learning_rate(step):
|
23 |
+
if step < 2000:
|
24 |
+
mul = step / 2000
|
25 |
+
return 2e-4 * mul
|
26 |
+
else:
|
27 |
+
mul = np.cos((step - 2000) / (300 * args.step_per_epoch - 2000) * math.pi) * 0.5 + 0.5
|
28 |
+
return (2e-4 - 2e-5) * mul + 2e-5
|
29 |
+
|
30 |
+
def train(model, local_rank, batch_size, data_path):
|
31 |
+
if local_rank == 0:
|
32 |
+
writer = SummaryWriter('log/train_EMAVFI')
|
33 |
+
step = 0
|
34 |
+
nr_eval = 0
|
35 |
+
best = 0
|
36 |
+
dataset = VimeoDataset('train', data_path)
|
37 |
+
sampler = DistributedSampler(dataset)
|
38 |
+
train_data = DataLoader(dataset, batch_size=batch_size, num_workers=8, pin_memory=True, drop_last=True, sampler=sampler)
|
39 |
+
args.step_per_epoch = train_data.__len__()
|
40 |
+
dataset_val = VimeoDataset('test', data_path)
|
41 |
+
val_data = DataLoader(dataset_val, batch_size=batch_size, pin_memory=True, num_workers=8)
|
42 |
+
print('training...')
|
43 |
+
time_stamp = time.time()
|
44 |
+
for epoch in range(300):
|
45 |
+
sampler.set_epoch(epoch)
|
46 |
+
for i, imgs in enumerate(train_data):
|
47 |
+
data_time_interval = time.time() - time_stamp
|
48 |
+
time_stamp = time.time()
|
49 |
+
imgs = imgs.to(device, non_blocking=True) / 255.
|
50 |
+
imgs, gt = imgs[:, 0:6], imgs[:, 6:]
|
51 |
+
learning_rate = get_learning_rate(step)
|
52 |
+
_, loss = model.update(imgs, gt, learning_rate, training=True)
|
53 |
+
train_time_interval = time.time() - time_stamp
|
54 |
+
time_stamp = time.time()
|
55 |
+
if step % 200 == 1 and local_rank == 0:
|
56 |
+
writer.add_scalar('learning_rate', learning_rate, step)
|
57 |
+
writer.add_scalar('loss', loss, step)
|
58 |
+
if local_rank == 0:
|
59 |
+
print('epoch:{} {}/{} time:{:.2f}+{:.2f} loss:{:.4e}'.format(epoch, i, args.step_per_epoch, data_time_interval, train_time_interval, loss))
|
60 |
+
step += 1
|
61 |
+
nr_eval += 1
|
62 |
+
if nr_eval % 3 == 0:
|
63 |
+
evaluate(model, val_data, nr_eval, local_rank)
|
64 |
+
model.save_model(local_rank)
|
65 |
+
|
66 |
+
dist.barrier()
|
67 |
+
|
68 |
+
def evaluate(model, val_data, nr_eval, local_rank):
|
69 |
+
if local_rank == 0:
|
70 |
+
writer_val = SummaryWriter('log/validate_EMAVFI')
|
71 |
+
|
72 |
+
psnr = []
|
73 |
+
for _, imgs in enumerate(val_data):
|
74 |
+
imgs = imgs.to(device, non_blocking=True) / 255.
|
75 |
+
imgs, gt = imgs[:, 0:6], imgs[:, 6:]
|
76 |
+
with torch.no_grad():
|
77 |
+
pred, _ = model.update(imgs, gt, training=False)
|
78 |
+
for j in range(gt.shape[0]):
|
79 |
+
psnr.append(-10 * math.log10(((gt[j] - pred[j]) * (gt[j] - pred[j])).mean().cpu().item()))
|
80 |
+
|
81 |
+
psnr = np.array(psnr).mean()
|
82 |
+
if local_rank == 0:
|
83 |
+
print(str(nr_eval), psnr)
|
84 |
+
writer_val.add_scalar('psnr', psnr, nr_eval)
|
85 |
+
|
86 |
+
if __name__ == "__main__":
|
87 |
+
parser = argparse.ArgumentParser()
|
88 |
+
parser.add_argument('--local_rank', default=0, type=int, help='local rank')
|
89 |
+
parser.add_argument('--world_size', default=4, type=int, help='world size')
|
90 |
+
parser.add_argument('--batch_size', default=8, type=int, help='batch size')
|
91 |
+
parser.add_argument('--data_path', type=str, help='data path of vimeo90k')
|
92 |
+
args = parser.parse_args()
|
93 |
+
torch.distributed.init_process_group(backend="nccl", world_size=args.world_size)
|
94 |
+
torch.cuda.set_device(args.local_rank)
|
95 |
+
if args.local_rank == 0 and not os.path.exists('log'):
|
96 |
+
os.mkdir('log')
|
97 |
+
seed = 1234
|
98 |
+
random.seed(seed)
|
99 |
+
np.random.seed(seed)
|
100 |
+
torch.manual_seed(seed)
|
101 |
+
torch.cuda.manual_seed_all(seed)
|
102 |
+
torch.backends.cudnn.benchmark = True
|
103 |
+
model = Model(args.local_rank)
|
104 |
+
train(model, args.local_rank, args.batch_size, args.data_path)
|
105 |
+
|
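The schedule in get_learning_rate above is a linear warmup over the first 2000 steps followed by a cosine decay from 2e-4 down to 2e-5 over the remaining 300 epochs. Below is a standalone sketch of that schedule with a hypothetical steps-per-epoch value (the script itself takes it from the dataloader length).

import math
import numpy as np

STEP_PER_EPOCH = 1000  # hypothetical; train.py sets this to len(train_data)

def lr_schedule(step, warmup=2000, epochs=300, lr_max=2e-4, lr_min=2e-5):
    if step < warmup:
        return lr_max * step / warmup                               # linear warmup
    total = epochs * STEP_PER_EPOCH - warmup
    mul = np.cos((step - warmup) / total * math.pi) * 0.5 + 0.5     # 1 -> 0 over training
    return (lr_max - lr_min) * mul + lr_min                         # cosine decay to lr_min

for s in (0, 1000, 2000, 150_000, 300 * STEP_PER_EPOCH):
    print(f"step {s:>7d}: lr = {lr_schedule(s):.2e}")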
lib/__init__.py
ADDED
File without changes
|
lib/farancia/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
1 |
+
from .libimage import IImage
|
2 |
+
|
3 |
+
from os.path import dirname, pardir, realpath
|
4 |
+
import os
|
lib/farancia/animation.py
ADDED
@@ -0,0 +1,43 @@
|
1 |
+
import matplotlib.pyplot as plt
|
2 |
+
from matplotlib import animation
|
3 |
+
|
4 |
+
|
5 |
+
class Animation:
|
6 |
+
JS = 0
|
7 |
+
HTML = 1
|
8 |
+
ANIMATION_MODE = HTML
|
9 |
+
|
10 |
+
def __init__(self, frames, fps=30):
|
11 |
+
"""_summary_
|
12 |
+
|
13 |
+
Args:
|
14 |
+
frames (np.ndarray): Frame stack of shape (F, H, W, C) with uint8 values.
|
15 |
+
"""
|
16 |
+
self.frames = frames
|
17 |
+
self.fps = fps
|
18 |
+
self.anim_obj = None
|
19 |
+
self.anim_str = None
|
20 |
+
|
21 |
+
def render(self):
|
22 |
+
size = (self.frames.shape[2], self.frames.shape[1])
|
23 |
+
self.fig = plt.figure(figsize=size, dpi=1)
|
24 |
+
plt.axis('off')
|
25 |
+
img = plt.imshow(self.frames[0], cmap='gray', vmin=0, vmax=255)
|
26 |
+
self.fig.subplots_adjust(0, 0, 1, 1)
|
27 |
+
self.anim_obj = animation.FuncAnimation(
|
28 |
+
self.fig,
|
29 |
+
lambda i: img.set_data(self.frames[i, :, :, :]),
|
30 |
+
frames=self.frames.shape[0],
|
31 |
+
interval=1000 / self.fps
|
32 |
+
)
|
33 |
+
plt.close()
|
34 |
+
if Animation.ANIMATION_MODE == Animation.HTML:
|
35 |
+
self.anim_str = self.anim_obj.to_html5_video()
|
36 |
+
elif Animation.ANIMATION_MODE == Animation.JS:
|
37 |
+
self.anim_str = self.anim_obj.to_jshtml()
|
38 |
+
return self.anim_obj
|
39 |
+
|
40 |
+
def _repr_html_(self):
|
41 |
+
if self.anim_obj is None:
|
42 |
+
self.render()
|
43 |
+
return self.anim_str
|
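A short usage sketch for the Animation helper above, assuming a notebook-like environment and that matplotlib can find a video writer (to_html5_video needs ffmpeg); the frames here are random uint8 noise purely for illustration.

import numpy as np
from lib.farancia.animation import Animation

frames = np.random.randint(0, 255, size=(16, 64, 64, 3), dtype=np.uint8)  # (F, H, W, C)
anim = Animation(frames, fps=8)
anim.render()                 # builds the matplotlib FuncAnimation
html = anim.anim_str          # HTML5 <video> markup, since ANIMATION_MODE == HTML
print(len(html), "characters of HTML")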
lib/farancia/config.py
ADDED
@@ -0,0 +1 @@
|
1 |
+
IMG_THUMBSIZE = None
|
lib/farancia/libimage/__init__.py
ADDED
@@ -0,0 +1,45 @@
|
1 |
+
from .iimage import IImage
|
2 |
+
|
3 |
+
import math
|
4 |
+
import numpy as np
|
5 |
+
import warnings
|
6 |
+
|
7 |
+
# ========= STATIC FUNCTIONS =============
|
8 |
+
def find_max_h(images):
|
9 |
+
return max([x.size[1] for x in images])
|
10 |
+
def find_max_w(images):
|
11 |
+
return max([x.size[0] for x in images])
|
12 |
+
def find_max_size(images):
|
13 |
+
return find_max_w(images), find_max_h(images)
|
14 |
+
|
15 |
+
|
16 |
+
def stack(images, axis = 0):
|
17 |
+
return IImage(np.concatenate([x.data for x in images], axis))
|
18 |
+
def tstack(images):
|
19 |
+
w,h = find_max_size(images)
|
20 |
+
images = [x.pad2wh(w,h) for x in images]
|
21 |
+
return IImage(np.concatenate([x.data for x in images], 0))
|
22 |
+
def hstack(images):
|
23 |
+
h = find_max_h(images)
|
24 |
+
images = [x.pad2wh(h = h) for x in images]
|
25 |
+
return IImage(np.concatenate([x.data for x in images], 2))
|
26 |
+
def vstack(images):
|
27 |
+
w = find_max_w(images)
|
28 |
+
images = [x.pad2wh(w = w) for x in images]
|
29 |
+
return IImage(np.concatenate([x.data for x in images], 1))
|
30 |
+
|
31 |
+
def grid(images, nrows = None, ncols = None):
|
32 |
+
combined = stack(images)
|
33 |
+
if nrows is not None:
|
34 |
+
ncols = math.ceil(combined.data.shape[0] / nrows)
|
35 |
+
elif ncols is not None:
|
36 |
+
nrows = math.ceil(combined.data.shape[0] / ncols)
|
37 |
+
else:
|
38 |
+
warnings.warn("No dimensions specified, creating a grid with 5 columns (default)")
|
39 |
+
ncols = 5
|
40 |
+
nrows = math.ceil(combined.data.shape[0] / ncols)
|
41 |
+
|
42 |
+
pad = nrows * ncols - combined.data.shape[0]
|
43 |
+
data = np.pad(combined.data, ((0,pad),(0,0),(0,0),(0,0)))
|
44 |
+
rows = [np.concatenate(x,1,dtype=np.uint8) for x in np.array_split(data, nrows)]
|
45 |
+
return IImage(np.concatenate(rows, 0, dtype = np.uint8)[None])
|
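These helpers operate on the IImage wrapper defined in iimage.py below; a small sketch of how they compose, with random data and hypothetical sizes (assumes the repository's image dependencies are installed).

import numpy as np
from lib.farancia import libimage
from lib.farancia.libimage import IImage

a = IImage(np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8))  # (B, H, W, C)
b = IImage(np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8))

row = libimage.hstack([a, b])                 # side by side   -> (1, 64, 128, 3)
col = libimage.vstack([a, b])                 # top to bottom  -> (1, 128, 64, 3)
batch = libimage.stack([a, b])                # along batch    -> (2, 64, 64, 3)
sheet = libimage.grid([a, b, a, b], ncols=2)  # 2x2 contact sheet in a single image
print(row.shape, col.shape, batch.shape, sheet.shape)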
lib/farancia/libimage/iimage.py
ADDED
@@ -0,0 +1,511 @@
|
1 |
+
import io
|
2 |
+
import math
|
3 |
+
import os
|
4 |
+
import PIL.Image
|
5 |
+
import numpy as np
|
6 |
+
import imageio.v3 as iio
|
7 |
+
import warnings
|
8 |
+
from torchvision.utils import flow_to_image
|
9 |
+
|
10 |
+
import torch
|
11 |
+
import torchvision.transforms.functional as TF
|
12 |
+
from scipy.ndimage import binary_dilation, binary_erosion
|
13 |
+
import cv2
|
14 |
+
|
15 |
+
from ..animation import Animation
|
16 |
+
from .. import config
|
17 |
+
from .. import libimage
|
18 |
+
import re
|
19 |
+
|
20 |
+
|
21 |
+
def torch2np(x, vmin=-1, vmax=1):
|
22 |
+
if x.ndim != 4:
|
23 |
+
# raise Exception("Please only use (B,C,H,W) torch tensors!")
|
24 |
+
warnings.warn(
|
25 |
+
"Warning! Shape of the image was not provided in (B,C,H,W) format, the shape was inferred automatically!")
|
26 |
+
if x.ndim == 3:
|
27 |
+
x = x[None]
|
28 |
+
if x.ndim == 2:
|
29 |
+
x = x[None, None]
|
30 |
+
assert x.shape[1] == 3 or x.shape[1] == 1
|
31 |
+
x = x.detach().cpu().float()
|
32 |
+
if x.dtype == torch.uint8:
|
33 |
+
return x.numpy().astype(np.uint8)
|
34 |
+
elif vmin is not None and vmax is not None:
|
35 |
+
x = (255 * (x.clip(vmin, vmax) - vmin) / (vmax - vmin))
|
36 |
+
x = x.permute(0, 2, 3, 1).to(torch.uint8)
|
37 |
+
return x.numpy()
|
38 |
+
else:
|
39 |
+
raise NotImplementedError()
|
40 |
+
|
41 |
+
|
42 |
+
class IImage:
|
43 |
+
'''
|
44 |
+
Generic media storage. Can store both images and videos.
|
45 |
+
Stores data as a numpy array by default.
|
46 |
+
Can be viewed in a jupyter notebook.
|
47 |
+
'''
|
48 |
+
@staticmethod
|
49 |
+
def open(path):
|
50 |
+
|
51 |
+
iio_obj = iio.imopen(path, 'r')
|
52 |
+
data = iio_obj.read()
|
53 |
+
try:
|
54 |
+
# .properties() does not work for images but for gif files
|
55 |
+
if not iio_obj.properties().is_batch:
|
56 |
+
data = data[None]
|
57 |
+
except AttributeError as e:
|
58 |
+
# this one works for gif files
|
59 |
+
if not "duration" in iio_obj.metadata():
|
60 |
+
data = data[None]
|
61 |
+
if data.ndim == 3:
|
62 |
+
data = data[..., None]
|
63 |
+
image = IImage(data)
|
64 |
+
image.link = os.path.abspath(path)
|
65 |
+
return image
|
66 |
+
|
67 |
+
@staticmethod
|
68 |
+
def flow_field(flow):
|
69 |
+
flow_images = flow_to_image(flow)
|
70 |
+
return IImage(flow_images, vmin=0, vmax=255)
|
71 |
+
|
72 |
+
@staticmethod
|
73 |
+
def normalized(x, dims=[-1, -2]):
|
74 |
+
x = (x - x.amin(dims, True)) / \
|
75 |
+
(x.amax(dims, True) - x.amin(dims, True))
|
76 |
+
return IImage(x, 0)
|
77 |
+
|
78 |
+
def numpy(self): return self.data
|
79 |
+
|
80 |
+
def torch(self, vmin=-1, vmax=1):
|
81 |
+
if self.data.ndim == 3:
|
82 |
+
data = self.data.transpose(2, 0, 1) / 255.
|
83 |
+
else:
|
84 |
+
data = self.data.transpose(0, 3, 1, 2) / 255.
|
85 |
+
return vmin + torch.from_numpy(data).float().to(self.device) * (vmax - vmin)
|
86 |
+
|
87 |
+
def cuda(self):
|
88 |
+
self.device = 'cuda'
|
89 |
+
return self
|
90 |
+
|
91 |
+
def cpu(self):
|
92 |
+
self.device = 'cpu'
|
93 |
+
return self
|
94 |
+
|
95 |
+
def pil(self):
|
96 |
+
ans = []
|
97 |
+
for x in self.data:
|
98 |
+
if x.shape[-1] == 1:
|
99 |
+
x = x[..., 0]
|
100 |
+
|
101 |
+
ans.append(PIL.Image.fromarray(x))
|
102 |
+
if len(ans) == 1:
|
103 |
+
return ans[0]
|
104 |
+
return ans
|
105 |
+
|
106 |
+
def is_iimage(self):
|
107 |
+
return True
|
108 |
+
|
109 |
+
@property
|
110 |
+
def shape(self): return self.data.shape
|
111 |
+
@property
|
112 |
+
def size(self): return (self.data.shape[-2], self.data.shape[-3])
|
113 |
+
|
114 |
+
def setFps(self, fps):
|
115 |
+
self.fps = fps
|
116 |
+
self.generate_display()
|
117 |
+
return self
|
118 |
+
|
119 |
+
def __init__(self, x, vmin=-1, vmax=1, fps=None):
|
120 |
+
|
121 |
+
if isinstance(x, PIL.Image.Image):
|
122 |
+
self.data = np.array(x)
|
123 |
+
if self.data.ndim == 2:
|
124 |
+
self.data = self.data[..., None] # (H,W,C)
|
125 |
+
self.data = self.data[None] # (B,H,W,C)
|
126 |
+
elif isinstance(x, IImage):
|
127 |
+
self.data = x.data.copy() # Simple Copy
|
128 |
+
elif isinstance(x, np.ndarray):
|
129 |
+
self.data = x.copy().astype(np.uint8)
|
130 |
+
if self.data.ndim == 2:
|
131 |
+
self.data = self.data[None, ..., None]
|
132 |
+
if self.data.ndim == 3:
|
133 |
+
warnings.warn(
|
134 |
+
"Inferred dimensions for a 3D array as (H,W,C), but could've been (B,H,W)")
|
135 |
+
self.data = self.data[None]
|
136 |
+
elif isinstance(x, torch.Tensor):
|
137 |
+
assert x.min() >= vmin and x.max(
|
138 |
+
) <= vmax, f"input data was [{x.min()},{x.max()}], but expected [{vmin},{vmax}]"
|
139 |
+
self.data = torch2np(x, vmin, vmax)
|
140 |
+
self.display_str = None
|
141 |
+
self.device = 'cpu'
|
142 |
+
self.fps = fps if fps is not None else (
|
143 |
+
1 if len(self.data) < 10 else 30)
|
144 |
+
self.link = None
|
145 |
+
|
146 |
+
def generate_display(self):
|
147 |
+
if config.IMG_THUMBSIZE is not None:
|
148 |
+
if self.size[1] < self.size[0]:
|
149 |
+
thumb = self.resize(
|
150 |
+
(self.size[1]*config.IMG_THUMBSIZE//self.size[0], config.IMG_THUMBSIZE))
|
151 |
+
else:
|
152 |
+
thumb = self.resize(
|
153 |
+
(config.IMG_THUMBSIZE, self.size[0]*config.IMG_THUMBSIZE//self.size[1]))
|
154 |
+
else:
|
155 |
+
thumb = self
|
156 |
+
if self.is_video():
|
157 |
+
self.anim = Animation(thumb.data, fps=self.fps)
|
158 |
+
self.anim.render()
|
159 |
+
self.display_str = self.anim.anim_str
|
160 |
+
else:
|
161 |
+
b = io.BytesIO()
|
162 |
+
data = thumb.data[0]
|
163 |
+
if data.shape[-1] == 1:
|
164 |
+
data = data[..., 0]
|
165 |
+
PIL.Image.fromarray(data).save(b, "PNG")
|
166 |
+
self.display_str = b.getvalue()
|
167 |
+
return self.display_str
|
168 |
+
|
169 |
+
def resize(self, size, *args, **kwargs):
|
170 |
+
if size is None:
|
171 |
+
return self
|
172 |
+
use_small_edge_when_int = kwargs.pop('use_small_edge_when_int', False)
|
173 |
+
|
174 |
+
# Backward compatibility
|
175 |
+
resample = kwargs.pop('filter', PIL.Image.BICUBIC)
|
176 |
+
resample = kwargs.pop('resample', resample)
|
177 |
+
|
178 |
+
if isinstance(size, int):
|
179 |
+
if use_small_edge_when_int:
|
180 |
+
h, w = self.data.shape[1:3]
|
181 |
+
aspect_ratio = h / w
|
182 |
+
size = (max(size, int(size * aspect_ratio)),
|
183 |
+
max(size, int(size / aspect_ratio)))
|
184 |
+
else:
|
185 |
+
h, w = self.data.shape[1:3]
|
186 |
+
aspect_ratio = h / w
|
187 |
+
size = (min(size, int(size * aspect_ratio)),
|
188 |
+
min(size, int(size / aspect_ratio)))
|
189 |
+
|
190 |
+
if self.size == size[::-1]:
|
191 |
+
return self
|
192 |
+
return libimage.stack([IImage(x.pil().resize(size[::-1], *args, resample=resample, **kwargs)) for x in self])
|
193 |
+
# return IImage(TF.resize(self.cpu().torch(0), size, *args, **kwargs), 0)
|
194 |
+
|
195 |
+
def pad(self, padding, *args, **kwargs):
|
196 |
+
return IImage(TF.pad(self.torch(0), padding=padding, *args, **kwargs), 0)
|
197 |
+
|
198 |
+
def padx(self, multiplier, *args, **kwargs):
|
199 |
+
size = np.array(self.size)
|
200 |
+
padding = np.concatenate(
|
201 |
+
[[0, 0], np.ceil(size / multiplier).astype(int) * multiplier - size])
|
202 |
+
return self.pad(list(padding), *args, **kwargs)
|
203 |
+
|
204 |
+
def pad2wh(self, w=0, h=0, **kwargs):
|
205 |
+
cw, ch = self.size
|
206 |
+
return self.pad([0, 0, max(0, w - cw), max(0, h-ch)], **kwargs)
|
207 |
+
|
208 |
+
def pad2square(self, *args, **kwargs):
|
209 |
+
if self.size[0] > self.size[1]:
|
210 |
+
dx = self.size[0] - self.size[1]
|
211 |
+
return self.pad([0, dx//2, 0, dx-dx//2], *args, **kwargs)
|
212 |
+
elif self.size[0] < self.size[1]:
|
213 |
+
dx = self.size[1] - self.size[0]
|
214 |
+
return self.pad([dx//2, 0, dx-dx//2, 0], *args, **kwargs)
|
215 |
+
return self
|
216 |
+
|
217 |
+
def crop2square(self, *args, **kwargs):
|
218 |
+
if self.size[0] > self.size[1]:
|
219 |
+
dx = self.size[0] - self.size[1]
|
220 |
+
return self.crop([dx//2, 0, self.size[1], self.size[1]], *args, **kwargs)
|
221 |
+
elif self.size[0] < self.size[1]:
|
222 |
+
dx = self.size[1] - self.size[0]
|
223 |
+
return self.crop([0, dx//2, self.size[0], self.size[0]], *args, **kwargs)
|
224 |
+
return self
|
225 |
+
|
226 |
+
def alpha(self):
|
227 |
+
return IImage(self.data[..., -1, None], fps=self.fps)
|
228 |
+
|
229 |
+
def rgb(self):
|
230 |
+
return IImage(self.pil().convert('RGB'), fps=self.fps)
|
231 |
+
|
232 |
+
def png(self):
|
233 |
+
return IImage(np.concatenate([self.data, 255 * np.ones_like(self.data)[..., :1]], -1))
|
234 |
+
|
235 |
+
def grid(self, nrows=None, ncols=None):
|
236 |
+
if nrows is not None:
|
237 |
+
ncols = math.ceil(self.data.shape[0] / nrows)
|
238 |
+
elif ncols is not None:
|
239 |
+
nrows = math.ceil(self.data.shape[0] / ncols)
|
240 |
+
else:
|
241 |
+
warnings.warn(
|
242 |
+
"No dimensions specified, creating a grid with 5 columns (default)")
|
243 |
+
ncols = 5
|
244 |
+
nrows = math.ceil(self.data.shape[0] / ncols)
|
245 |
+
|
246 |
+
pad = nrows * ncols - self.data.shape[0]
|
247 |
+
data = np.pad(self.data, ((0, pad), (0, 0), (0, 0), (0, 0)))
|
248 |
+
rows = [np.concatenate(x, 1, dtype=np.uint8)
|
249 |
+
for x in np.array_split(data, nrows)]
|
250 |
+
return IImage(np.concatenate(rows, 0, dtype=np.uint8)[None])
|
251 |
+
|
252 |
+
def hstack(self):
|
253 |
+
return IImage(np.concatenate(self.data, 1, dtype=np.uint8)[None])
|
254 |
+
|
255 |
+
def vstack(self):
|
256 |
+
return IImage(np.concatenate(self.data, 0, dtype=np.uint8)[None])
|
257 |
+
|
258 |
+
def vsplit(self, number_of_splits):
|
259 |
+
return IImage(np.concatenate(np.split(self.data, number_of_splits, 1)))
|
260 |
+
|
261 |
+
def hsplit(self, number_of_splits):
|
262 |
+
return IImage(np.concatenate(np.split(self.data, number_of_splits, 2)))
|
263 |
+
|
264 |
+
def heatmap(self, resize=None, cmap=cv2.COLORMAP_JET):
|
265 |
+
data = np.stack([cv2.cvtColor(cv2.applyColorMap(
|
266 |
+
x, cmap), cv2.COLOR_BGR2RGB) for x in self.data])
|
267 |
+
return IImage(data).resize(resize, use_small_edge_when_int=True)
|
268 |
+
|
269 |
+
def display(self):
|
270 |
+
try:
|
271 |
+
display(self)
|
272 |
+
except:
|
273 |
+
print("No display")
|
274 |
+
return self
|
275 |
+
|
276 |
+
def dilate(self, iterations=1, *args, **kwargs):
|
277 |
+
if iterations == 0:
|
278 |
+
return IImage(self.data)
|
279 |
+
return IImage((binary_dilation(self.data, iterations=iterations, *args, *kwargs)*255.).astype(np.uint8))
|
280 |
+
|
281 |
+
def erode(self, iterations=1, *args, **kwargs):
|
282 |
+
return IImage((binary_erosion(self.data, iterations=iterations, *args, *kwargs)*255.).astype(np.uint8))
|
283 |
+
|
284 |
+
def hull(self):
|
285 |
+
convex_hulls = []
|
286 |
+
for frame in self.data:
|
287 |
+
contours, hierarchy = cv2.findContours(
|
288 |
+
frame, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
289 |
+
contours = [x.astype(np.int32) for x in contours]
|
290 |
+
mask_contours = [cv2.convexHull(np.concatenate(contours))]
|
291 |
+
canvas = np.zeros(self.data[0].shape, np.uint8)
|
292 |
+
convex_hull = cv2.drawContours(
|
293 |
+
canvas, mask_contours, -1, (255, 0, 0), -1)
|
294 |
+
convex_hulls.append(convex_hull)
|
295 |
+
return IImage(np.array(convex_hulls))
|
296 |
+
|
297 |
+
def is_video(self):
|
298 |
+
return self.data.shape[0] > 1
|
299 |
+
|
300 |
+
def __getitem__(self, idx):
|
301 |
+
return IImage(self.data[None, idx], fps=self.fps)
|
302 |
+
# if self.is_video(): return IImage(self.data[idx], fps = self.fps)
|
303 |
+
# return self
|
304 |
+
|
305 |
+
def _repr_png_(self):
|
306 |
+
if self.is_video():
|
307 |
+
return None
|
308 |
+
if self.display_str is None:
|
309 |
+
self.generate_display()
|
310 |
+
return self.display_str
|
311 |
+
|
312 |
+
def _repr_html_(self):
|
313 |
+
if not self.is_video():
|
314 |
+
return None
|
315 |
+
if self.display_str is None:
|
316 |
+
self.generate_display()
|
317 |
+
return self.display_str
|
318 |
+
|
319 |
+
def save(self, path):
|
320 |
+
_, ext = os.path.splitext(path)
|
321 |
+
if self.is_video():
|
322 |
+
# if ext in ['.jpg', '.png']:
|
323 |
+
if self.display_str is None:
|
324 |
+
self.generate_display()
|
325 |
+
if ext == ".apng":
|
326 |
+
self.anim.anim_obj.save(path, writer="pillow")
|
327 |
+
else:
|
328 |
+
self.anim.anim_obj.save(path)
|
329 |
+
else:
|
330 |
+
data = self.data if self.data.ndim == 3 else self.data[0]
|
331 |
+
if data.shape[-1] == 1:
|
332 |
+
data = data[:, :, 0]
|
333 |
+
PIL.Image.fromarray(data).save(path)
|
334 |
+
return self
|
335 |
+
|
336 |
+
def to_html(self, width='auto', root_path='/'):
|
337 |
+
if self.display_str is None:
|
338 |
+
self.generate_display()
|
339 |
+
# print (self.display_str)
|
340 |
+
html_tag = bytes2html(self.display_str, width=width)
|
341 |
+
if self.link is not None:
|
342 |
+
link = os.path.relpath(self.link, root_path)
|
343 |
+
return f'<a href="{link}" >{html_tag}</a>'
|
344 |
+
return html_tag
|
345 |
+
|
346 |
+
def write(self, text, center=(0, 25), font_scale=0.8, color=(255, 255, 255), thickness=2):
|
347 |
+
if not isinstance(text, list):
|
348 |
+
text = [text for _ in self.data]
|
349 |
+
data = np.stack([cv2.putText(x.copy(), t, center, cv2.FONT_HERSHEY_COMPLEX,
|
350 |
+
font_scale, color, thickness) for x, t in zip(self.data, text)])
|
351 |
+
return IImage(data)
|
352 |
+
|
353 |
+
def append_text(self, text, padding, font_scale=0.8, color=(255, 255, 255), thickness=2, scale_factor=0.9, center=(0, 0), fill=0):
|
354 |
+
|
355 |
+
assert np.count_nonzero(padding) == 1
|
356 |
+
axis_padding = np.nonzero(padding)[0][0]
|
357 |
+
scale_padding = padding[axis_padding]
|
358 |
+
|
359 |
+
y_0 = 0
|
360 |
+
x_0 = 0
|
361 |
+
if axis_padding == 0:
|
362 |
+
width = scale_padding
|
363 |
+
y_max = self.shape[1]
|
364 |
+
elif axis_padding == 1:
|
365 |
+
width = self.shape[2]
|
366 |
+
y_max = scale_padding
|
367 |
+
elif axis_padding == 2:
|
368 |
+
x_0 = self.shape[2]
|
369 |
+
width = scale_padding
|
370 |
+
y_max = self.shape[1]
|
371 |
+
elif axis_padding == 3:
|
372 |
+
width = self.shape[2]
|
373 |
+
y_0 = self.shape[1]
|
374 |
+
y_max = self.shape[1]+scale_padding
|
375 |
+
|
376 |
+
width -= center[0]
|
377 |
+
x_0 += center[0]
|
378 |
+
y_0 += center[1]
|
379 |
+
|
380 |
+
self = self.pad(padding, fill=fill)
|
381 |
+
|
382 |
+
def wrap_text(text, width, _font_scale):
|
383 |
+
allowed_separator = ' |-|_|/|\n'
|
384 |
+
words = re.split(allowed_separator, text)
|
385 |
+
# words = text.split()
|
386 |
+
lines = []
|
387 |
+
current_line = words[0]
|
388 |
+
sep_list = []
|
389 |
+
start_idx = 0
|
390 |
+
for start_word in words[:-1]:
|
391 |
+
pos = text.find(start_word, start_idx)
|
392 |
+
pos += len(start_word)
|
393 |
+
sep_list.append(text[pos])
|
394 |
+
start_idx = pos+1
|
395 |
+
|
396 |
+
for word, separator in zip(words[1:], sep_list):
|
397 |
+
if cv2.getTextSize(current_line + separator + word, cv2.FONT_HERSHEY_COMPLEX, _font_scale, thickness)[0][0] <= width:
|
398 |
+
current_line += separator + word
|
399 |
+
else:
|
400 |
+
if cv2.getTextSize(current_line, cv2.FONT_HERSHEY_COMPLEX, _font_scale, thickness)[0][0] <= width:
|
401 |
+
lines.append(current_line)
|
402 |
+
current_line = word
|
403 |
+
else:
|
404 |
+
return []
|
405 |
+
|
406 |
+
if cv2.getTextSize(current_line, cv2.FONT_HERSHEY_COMPLEX, _font_scale, thickness)[0][0] <= width:
|
407 |
+
lines.append(current_line)
|
408 |
+
else:
|
409 |
+
return []
|
410 |
+
return lines
|
411 |
+
|
412 |
+
def wrap_text_and_scale(text, width, _font_scale, y_0, y_max):
|
413 |
+
height = y_max+1
|
414 |
+
while height > y_max:
|
415 |
+
text_lines = wrap_text(text, width, _font_scale)
|
416 |
+
if len(text) > 0 and len(text_lines) == 0:
|
417 |
+
|
418 |
+
height = y_max+1
|
419 |
+
else:
|
420 |
+
line_height = cv2.getTextSize(
|
421 |
+
text_lines[0], cv2.FONT_HERSHEY_COMPLEX, _font_scale, thickness)[0][1]
|
422 |
+
height = line_height * len(text_lines) + y_0
|
423 |
+
|
424 |
+
# scale font if out of frame
|
425 |
+
if height > y_max:
|
426 |
+
_font_scale = _font_scale * scale_factor
|
427 |
+
|
428 |
+
return text_lines, line_height, _font_scale
|
429 |
+
|
430 |
+
result = []
|
431 |
+
if not isinstance(text, list):
|
432 |
+
text = [text for _ in self.data]
|
433 |
+
else:
|
434 |
+
assert len(text) == len(self.data)
|
435 |
+
|
436 |
+
for x, t in zip(self.data, text):
|
437 |
+
x = x.copy()
|
438 |
+
text_lines, line_height, _font_scale = wrap_text_and_scale(
|
439 |
+
t, width, font_scale, y_0, y_max)
|
440 |
+
y = line_height
|
441 |
+
for line in text_lines:
|
442 |
+
x = cv2.putText(
|
443 |
+
x, line, (x_0, y_0+y), cv2.FONT_HERSHEY_COMPLEX, _font_scale, color, thickness)
|
444 |
+
y += line_height
|
445 |
+
result.append(x)
|
446 |
+
data = np.stack(result)
|
447 |
+
|
448 |
+
return IImage(data)
|
449 |
+
|
450 |
+
# ========== OPERATORS =============
|
451 |
+
|
452 |
+
def __or__(self, other):
|
453 |
+
# TODO: fix for variable sizes
|
454 |
+
return IImage(np.concatenate([self.data, other.data], 2))
|
455 |
+
|
456 |
+
def __truediv__(self, other):
|
457 |
+
# TODO: fix for variable sizes
|
458 |
+
return IImage(np.concatenate([self.data, other.data], 1))
|
459 |
+
|
460 |
+
def __and__(self, other):
|
461 |
+
return IImage(np.concatenate([self.data, other.data], 0))
|
462 |
+
|
463 |
+
def __add__(self, other):
|
464 |
+
return IImage(0.5 * self.data + 0.5 * other.data)
|
465 |
+
|
466 |
+
def __mul__(self, other):
|
467 |
+
if isinstance(other, IImage):
|
468 |
+
return IImage(self.data / 255. * other.data)
|
469 |
+
return IImage(self.data * other / 255.)
|
470 |
+
|
471 |
+
def __xor__(self, other):
|
472 |
+
return IImage(0.5 * self.data + 0.5 * other.data + 0.5 * self.data * (other.data.sum(-1, keepdims=True) == 0))
|
473 |
+
|
474 |
+
def __invert__(self):
|
475 |
+
return IImage(255 - self.data)
|
476 |
+
__rmul__ = __mul__
|
477 |
+
|
478 |
+
def bbox(self):
|
479 |
+
return [cv2.boundingRect(x) for x in self.data]
|
480 |
+
|
481 |
+
def fill_bbox(self, bbox_list, fill=255):
|
482 |
+
data = self.data.copy()
|
483 |
+
for bbox in bbox_list:
|
484 |
+
x, y, w, h = bbox
|
485 |
+
data[:, y:y+h, x:x+w, :] = fill
|
486 |
+
return IImage(data)
|
487 |
+
|
488 |
+
def crop(self, bbox):
|
489 |
+
assert len(bbox) in [2, 4]
|
490 |
+
if len(bbox) == 2:
|
491 |
+
x, y = 0, 0
|
492 |
+
w, h = bbox
|
493 |
+
elif len(bbox) == 4:
|
494 |
+
x, y, w, h = bbox
|
495 |
+
return IImage(self.data[:, y:y+h, x:x+w, :])
|
496 |
+
|
497 |
+
# def alpha(self):
|
498 |
+
# return BetterImage(self.img.split()[-1])
|
499 |
+
# def resize(self, size, *args, **kwargs):
|
500 |
+
# if size is None: return self
|
501 |
+
# return BetterImage(TF.resize(self.img, size, *args, **kwargs))
|
502 |
+
# def pad(self, *args):
|
503 |
+
# return BetterImage(TF.pad(self.img, *args))
|
504 |
+
# def padx(self, mult):
|
505 |
+
# size = np.array(self.img.size)
|
506 |
+
# padding = np.concatenate([[0,0],np.ceil(size / mult).astype(int) * mult - size])
|
507 |
+
# return self.pad(list(padding))
|
508 |
+
# def crop(self, *args):
|
509 |
+
# return BetterImage(self.img.crop(*args))
|
510 |
+
# def torch(self, min = -1., max = 1.):
|
511 |
+
# return (max - min) * TF.to_tensor(self.img)[None] + min
|
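IImage keeps its data as a uint8 (B, H, W, C) numpy array and converts on demand; a short round-trip sketch with random data and hypothetical sizes, assuming torch, torchvision, imageio, opencv and scipy are installed.

import numpy as np
from lib.farancia import IImage

img = IImage(np.random.randint(0, 255, (1, 120, 160, 3), dtype=np.uint8))
print(img.shape, img.size)         # (1, 120, 160, 3) and (160, 120) as (W, H)

small = img.resize(64)             # int size: fit inside 64x64, aspect ratio preserved
t = small.torch(vmin=-1, vmax=1)   # float tensor in [-1, 1], shape (1, 3, 48, 64)
back = IImage(t)                   # wraps the tensor again (values must lie in [-1, 1])
back.save("/tmp/iimage_demo.png")  # hypothetical output path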
lib/farancia/libimage/utils.py
ADDED
@@ -0,0 +1,8 @@
|
1 |
+
from IPython.display import Image as IpyImage
|
2 |
+
|
3 |
+
def bytes2html(data, width='auto'):
|
4 |
+
img_obj = IpyImage(data=data, format='JPG')
|
5 |
+
for bundle in img_obj._repr_mimebundle_():
|
6 |
+
for mimetype, b64value in bundle.items():
|
7 |
+
if mimetype.startswith('image/'):
|
8 |
+
return f'<img src="data:{mimetype};base64,{b64value}" style="width: {width}; max-width: 100%">'
|
models/cam/conditioning.py
ADDED
@@ -0,0 +1,150 @@
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from einops import rearrange
|
4 |
+
from diffusers.models.attention_processor import Attention
|
5 |
+
|
6 |
+
|
7 |
+
class CrossAttention(nn.Module):
|
8 |
+
"""
|
9 |
+
CrossAttention module implements per-pixel temporal attention to fuse the conditional attention module with the base module.
|
10 |
+
|
11 |
+
Args:
|
12 |
+
input_channels (int): Number of input channels.
|
13 |
+
attention_head_dim (int): Dimension of attention head.
|
14 |
+
norm_num_groups (int): Number of groups for GroupNorm normalization (default is 32).
|
15 |
+
|
16 |
+
Attributes:
|
17 |
+
attention (Attention): Attention module for computing attention scores.
|
18 |
+
norm (torch.nn.GroupNorm): Group normalization layer.
|
19 |
+
proj_in (nn.Linear): Linear layer for projecting input data.
|
20 |
+
proj_out (nn.Linear): Linear layer for projecting output data.
|
21 |
+
dropout (nn.Dropout): Dropout layer for regularization.
|
22 |
+
|
23 |
+
Methods:
|
24 |
+
forward(hidden_state, encoder_hidden_states, num_frames, num_conditional_frames):
|
25 |
+
Forward pass of the CrossAttention module.
|
26 |
+
|
27 |
+
"""
|
28 |
+
|
29 |
+
def __init__(self, input_channels, attention_head_dim, norm_num_groups=32):
|
30 |
+
super().__init__()
|
31 |
+
self.attention = Attention(
|
32 |
+
query_dim=input_channels, cross_attention_dim=input_channels, heads=input_channels//attention_head_dim, dim_head=attention_head_dim, bias=False, upcast_attention=False)
|
33 |
+
self.norm = torch.nn.GroupNorm(
|
34 |
+
num_groups=norm_num_groups, num_channels=input_channels, eps=1e-6, affine=True)
|
35 |
+
self.proj_in = nn.Linear(input_channels, input_channels)
|
36 |
+
self.proj_out = nn.Linear(input_channels, input_channels)
|
37 |
+
self.dropout = nn.Dropout(p=0.25)
|
38 |
+
|
39 |
+
def forward(self, hidden_state, encoder_hidden_states, num_frames, num_conditional_frames):
|
40 |
+
"""
|
41 |
+
The input hidden state is normalized, then projected using a linear layer.
|
42 |
+
Multi-head cross attention is computed between the hidden state (latent of noisy video) and encoder hidden states (CLIP image encoder).
|
43 |
+
The output is projected using a linear layer.
|
44 |
+
We apply dropout to the newly generated frames (without the control frames).
|
45 |
+
|
46 |
+
Args:
|
47 |
+
hidden_state (torch.Tensor): Input hidden state tensor.
|
48 |
+
encoder_hidden_states (torch.Tensor): Encoder hidden states tensor.
|
49 |
+
num_frames (int): Number of frames.
|
50 |
+
num_conditional_frames (int): Number of conditional frames.
|
51 |
+
|
52 |
+
Returns:
|
53 |
+
output (torch.Tensor): Output tensor after processing with attention mechanism.
|
54 |
+
|
55 |
+
"""
|
56 |
+
h, w = hidden_state.shape[2], hidden_state.shape[3]
|
57 |
+
hidden_state_norm = rearrange(
|
58 |
+
hidden_state, "(B F) C H W -> B C F H W", F=num_frames)
|
59 |
+
hidden_state_norm = self.norm(hidden_state_norm)
|
60 |
+
hidden_state_norm = rearrange(
|
61 |
+
hidden_state_norm, "B C F H W -> (B H W) F C")
|
62 |
+
|
63 |
+
hidden_state_norm = self.proj_in(hidden_state_norm)
|
64 |
+
|
65 |
+
attn = self.attention(hidden_state_norm,
|
66 |
+
encoder_hidden_states=encoder_hidden_states,
|
67 |
+
attention_mask=None,
|
68 |
+
)
|
69 |
+
# proj_out
|
70 |
+
|
71 |
+
residual = self.proj_out(attn) # (B H W) F C
|
72 |
+
hidden_state = rearrange(
|
73 |
+
hidden_state, "(B F) ... -> B F ...", F=num_frames)
|
74 |
+
hidden_state = torch.cat([hidden_state[:, :num_conditional_frames], self.dropout(
|
75 |
+
hidden_state[:, num_conditional_frames:])], dim=1)
|
76 |
+
hidden_state = rearrange(hidden_state, "B F ... -> (B F) ... ")
|
77 |
+
|
78 |
+
residual = rearrange(
|
79 |
+
residual, "(B H W) F C -> (B F) C H W", H=h, W=w)
|
80 |
+
output = hidden_state + residual
|
81 |
+
return output
|
82 |
+
|
83 |
+
|
84 |
+
class ConditionalModel(nn.Module):
|
85 |
+
"""
|
86 |
+
ConditionalModel module performs the fusion of the conditional attention module with the base model.
|
87 |
+
|
88 |
+
Args:
|
89 |
+
input_channels (int): Number of input channels.
|
90 |
+
conditional_model (str): Type of conditional model to use. Currently only "cross_attention" is implemented.
|
91 |
+
attention_head_dim (int): Dimension of attention head (default is 64).
|
92 |
+
|
93 |
+
Attributes:
|
94 |
+
temporal_transformer (CrossAttention): CrossAttention module for temporal transformation.
|
95 |
+
conditional_model (str): Type of conditional model used.
|
96 |
+
|
97 |
+
Methods:
|
98 |
+
forward(sample, conditioning, num_frames=None, num_conditional_frames=None):
|
99 |
+
Forward pass of the ConditionalModel module.
|
100 |
+
|
101 |
+
"""
|
102 |
+
|
103 |
+
def __init__(self, input_channels, conditional_model: str, attention_head_dim=64):
|
104 |
+
super().__init__()
|
105 |
+
|
106 |
+
if conditional_model == "cross_attention":
|
107 |
+
self.temporal_transformer = CrossAttention(
|
108 |
+
input_channels=input_channels, attention_head_dim=attention_head_dim)
|
109 |
+
else:
|
110 |
+
raise NotImplementedError(
|
111 |
+
f"mode {conditional_model} not implemented")
|
112 |
+
|
113 |
+
nn.init.zeros_(self.temporal_transformer.proj_out.weight)
|
114 |
+
nn.init.zeros_(self.temporal_transformer.proj_out.bias)
|
115 |
+
self.conditional_model = conditional_model
|
116 |
+
|
117 |
+
def forward(self, sample, conditioning, num_frames=None, num_conditional_frames=None):
|
118 |
+
"""
|
119 |
+
Forward pass of the ConditionalModel module.
|
120 |
+
|
121 |
+
Args:
|
122 |
+
sample (torch.Tensor): Input sample tensor.
|
123 |
+
conditioning (torch.Tensor): Conditioning tensor containing the encoding of the conditional frames.
|
124 |
+
num_frames (int): Number of frames in the sample.
|
125 |
+
num_conditional_frames (int): Number of conditional frames.
|
126 |
+
|
127 |
+
Returns:
|
128 |
+
sample (torch.Tensor): Transformed sample tensor.
|
129 |
+
|
130 |
+
"""
|
131 |
+
sample = rearrange(sample, "(B F) ... -> B F ...", F=num_frames)
|
132 |
+
batch_size = sample.shape[0]
|
133 |
+
conditioning = rearrange(
|
134 |
+
conditioning, "(B F) ... -> B F ...", B=batch_size)
|
135 |
+
|
136 |
+
assert conditioning.ndim == 5
|
137 |
+
assert sample.ndim == 5
|
138 |
+
|
139 |
+
conditioning = rearrange(conditioning, "B F C H W -> (B H W) F C")
|
140 |
+
|
141 |
+
sample = rearrange(sample, "B F C H W -> (B F) C H W")
|
142 |
+
|
143 |
+
sample = self.temporal_transformer(
|
144 |
+
sample, encoder_hidden_states=conditioning, num_frames=num_frames, num_conditional_frames=num_conditional_frames)
|
145 |
+
|
146 |
+
return sample
|
147 |
+
|
148 |
+
|
149 |
+
if __name__ == "__main__":
|
150 |
+
model = CrossAttention(input_channels=320, attention_head_dim=32)
|
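Extending the __main__ stub above into a shape-level sketch of the fusion (tiny hypothetical sizes; requires diffusers and einops): the sample holds all F frames of the noisy latent, the conditioning holds the encodings of the first F_cond frames, and because proj_out is zero-initialized the module starts out as an identity on the sample.

import torch
from models.cam.conditioning import ConditionalModel

B, F, F_cond, C, H, W = 1, 4, 1, 320, 8, 8        # hypothetical sizes
model = ConditionalModel(input_channels=C, conditional_model="cross_attention")
model.eval()                                       # disable the dropout on newly generated frames

sample = torch.randn(B * F, C, H, W)               # noisy video latent, frames flattened into batch
conditioning = torch.randn(B * F_cond, C, H, W)    # encoded conditional (control) frames

out = model(sample, conditioning, num_frames=F, num_conditional_frames=F_cond)
print(out.shape)                                   # torch.Size([4, 320, 8, 8])
assert torch.allclose(out, sample)                 # zero-initialized proj_out => zero residual at init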
models/control/controlnet.py
ADDED
@@ -0,0 +1,581 @@
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from typing import List, Optional, Union
|
4 |
+
from models.svd.sgm.util import default
|
5 |
+
from models.svd.sgm.modules.video_attention import SpatialVideoTransformer
|
6 |
+
from models.svd.sgm.modules.diffusionmodules.openaimodel import *
|
7 |
+
from models.diffusion.video_model import VideoResBlock, VideoUNet
|
8 |
+
from einops import repeat, rearrange
|
9 |
+
from models.svd.sgm.modules.diffusionmodules.wrappers import OpenAIWrapper
|
10 |
+
|
11 |
+
|
12 |
+
class Merger(nn.Module):
|
13 |
+
"""
|
14 |
+
Merges the controlnet latents with the conditioning embedding (encoding of control frames).
|
15 |
+
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, merge_mode: str = "addition", input_channels=0, frame_expansion="last_frame") -> None:
|
19 |
+
super().__init__()
|
20 |
+
self.merge_mode = merge_mode
|
21 |
+
self.frame_expansion = frame_expansion
|
22 |
+
|
23 |
+
def forward(self, x, condition_signal, num_video_frames, num_video_frames_conditional):
|
24 |
+
x = rearrange(x, "(B F) C H W -> B F C H W", F=num_video_frames)
|
25 |
+
|
26 |
+
condition_signal = rearrange(
|
27 |
+
condition_signal, "(B F) C H W -> B F C H W", B=x.shape[0])
|
28 |
+
|
29 |
+
if x.shape[1] - condition_signal.shape[1] > 0:
|
30 |
+
if self.frame_expansion == "last_frame":
|
31 |
+
fillup_latent = repeat(
|
32 |
+
condition_signal[:, -1], "B C H W -> B F C H W", F=x.shape[1] - condition_signal.shape[1])
|
33 |
+
elif self.frame_expansion == "zero":
|
34 |
+
fillup_latent = torch.zeros(
|
35 |
+
(x.shape[0], num_video_frames-num_video_frames_conditional, *x.shape[2:]), device=x.device, dtype=x.dtype)
|
36 |
+
|
37 |
+
if self.frame_expansion != "none":
|
38 |
+
condition_signal = torch.cat(
|
39 |
+
[condition_signal, fillup_latent], dim=1)
|
40 |
+
|
41 |
+
if self.merge_mode == "addition":
|
42 |
+
out = x + condition_signal
|
43 |
+
else:
|
44 |
+
raise NotImplementedError(
|
45 |
+
f"Merging mode {self.merge_mode} not implemented.")
|
46 |
+
|
47 |
+
out = rearrange(out, "B F C H W -> (B F) C H W")
|
48 |
+
return out
|
49 |
+
|
50 |
+
|
51 |
+
class ControlNetConditioningEmbedding(nn.Module):
|
52 |
+
"""
|
53 |
+
Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
|
54 |
+
[11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
|
55 |
+
training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
|
56 |
+
convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
|
57 |
+
(activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
|
58 |
+
model) to encode image-space conditions ... into feature maps ..."
|
59 |
+
"""
|
60 |
+
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
conditioning_embedding_channels: int,
|
64 |
+
conditioning_channels: int = 3,
|
65 |
+
block_out_channels: Tuple[int] = (16, 32, 96, 256),
|
66 |
+
downsample: bool = True,
|
67 |
+
final_3d_conv: bool = False,
|
68 |
+
zero_init: bool = True,
|
69 |
+
use_controlnet_mask: bool = False,
|
70 |
+
use_normalization: bool = False,
|
71 |
+
):
|
72 |
+
super().__init__()
|
73 |
+
|
74 |
+
self.final_3d_conv = final_3d_conv
|
75 |
+
self.conv_in = nn.Conv2d(
|
76 |
+
conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
|
77 |
+
if final_3d_conv:
|
78 |
+
print("USING 3D CONV in ControlNET")
|
79 |
+
|
80 |
+
self.blocks = nn.ModuleList([])
|
81 |
+
if use_normalization:
|
82 |
+
self.norms = nn.ModuleList([])
|
83 |
+
self.use_normalization = use_normalization
|
84 |
+
|
85 |
+
stride = 2 if downsample else 1
|
86 |
+
|
87 |
+
for i in range(len(block_out_channels) - 1):
|
88 |
+
channel_in = block_out_channels[i]
|
89 |
+
channel_out = block_out_channels[i + 1]
|
90 |
+
self.blocks.append(
|
91 |
+
nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
|
92 |
+
if use_normalization:
|
93 |
+
self.norms.append(nn.LayerNorm((channel_in)))
|
94 |
+
self.blocks.append(
|
95 |
+
nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=stride))
|
96 |
+
if use_normalization:
|
97 |
+
self.norms.append(nn.LayerNorm((channel_out)))
|
98 |
+
|
99 |
+
self.conv_out = zero_module(
|
100 |
+
nn.Conv2d(
|
101 |
+
block_out_channels[-1]+int(use_controlnet_mask), conditioning_embedding_channels, kernel_size=3, padding=1), reset=zero_init
|
102 |
+
)
|
103 |
+
|
104 |
+
def forward(self, conditioning):
|
105 |
+
embedding = self.conv_in(conditioning)
|
106 |
+
embedding = F.silu(embedding)
|
107 |
+
|
108 |
+
if self.use_normalization:
|
109 |
+
for block, norm in zip(self.blocks, self.norms):
|
110 |
+
embedding = block(embedding)
|
111 |
+
embedding = rearrange(embedding, " ... C W H -> ... W H C")
|
112 |
+
embedding = norm(embedding)
|
113 |
+
embedding = rearrange(embedding, "... W H C -> ... C W H")
|
114 |
+
embedding = F.silu(embedding)
|
115 |
+
else:
|
116 |
+
for block in self.blocks:
|
117 |
+
embedding = block(embedding)
|
118 |
+
embedding = F.silu(embedding)
|
119 |
+
|
120 |
+
embedding = self.conv_out(embedding)
|
121 |
+
return embedding
|
122 |
+
|
123 |
+
|
124 |
+
class ControlNet(nn.Module):
|
125 |
+
|
126 |
+
def __init__(
|
127 |
+
self,
|
128 |
+
in_channels: int,
|
129 |
+
model_channels: int,
|
130 |
+
out_channels: int,
|
131 |
+
num_res_blocks: int,
|
132 |
+
attention_resolutions: Union[List[int], int],
|
133 |
+
dropout: float = 0.0,
|
134 |
+
channel_mult: List[int] = (1, 2, 4, 8),
|
135 |
+
conv_resample: bool = True,
|
136 |
+
dims: int = 2,
|
137 |
+
num_classes: Optional[Union[int, str]] = None,
|
138 |
+
use_checkpoint: bool = False,
|
139 |
+
num_heads: int = -1,
|
140 |
+
num_head_channels: int = -1,
|
141 |
+
num_heads_upsample: int = -1,
|
142 |
+
use_scale_shift_norm: bool = False,
|
143 |
+
resblock_updown: bool = False,
|
144 |
+
transformer_depth: Union[List[int], int] = 1,
|
145 |
+
transformer_depth_middle: Optional[int] = None,
|
146 |
+
context_dim: Optional[int] = None,
|
147 |
+
time_downup: bool = False,
|
148 |
+
time_context_dim: Optional[int] = None,
|
149 |
+
extra_ff_mix_layer: bool = False,
|
150 |
+
use_spatial_context: bool = False,
|
151 |
+
merge_strategy: str = "fixed",
|
152 |
+
merge_factor: float = 0.5,
|
153 |
+
spatial_transformer_attn_type: str = "softmax",
|
154 |
+
video_kernel_size: Union[int, List[int]] = 3,
|
155 |
+
use_linear_in_transformer: bool = False,
|
156 |
+
adm_in_channels: Optional[int] = None,
|
157 |
+
disable_temporal_crossattention: bool = False,
|
158 |
+
max_ddpm_temb_period: int = 10000,
|
159 |
+
conditioning_embedding_out_channels: Optional[Tuple[int]] = (
|
160 |
+
16, 32, 96, 256),
|
161 |
+
condition_encoder: str = "",
|
162 |
+
use_controlnet_mask: bool = False,
|
163 |
+
downsample_controlnet_cond: bool = True,
|
164 |
+
use_image_encoder_normalization: bool = False,
|
165 |
+
zero_conv_mode: str = "Identity",
|
166 |
+
frame_expansion: str = "none",
|
167 |
+
merging_mode: str = "addition",
|
168 |
+
):
|
169 |
+
super().__init__()
|
170 |
+
assert zero_conv_mode == "Identity", "Zero convolution not implemented"
|
171 |
+
|
172 |
+
assert context_dim is not None
|
173 |
+
|
174 |
+
if num_heads_upsample == -1:
|
175 |
+
num_heads_upsample = num_heads
|
176 |
+
|
177 |
+
if num_heads == -1:
|
178 |
+
assert num_head_channels != -1
|
179 |
+
|
180 |
+
if num_head_channels == -1:
|
181 |
+
assert num_heads != -1
|
182 |
+
|
183 |
+
self.in_channels = in_channels
|
184 |
+
self.model_channels = model_channels
|
185 |
+
self.out_channels = out_channels
|
186 |
+
if isinstance(transformer_depth, int):
|
187 |
+
transformer_depth = len(channel_mult) * [transformer_depth]
|
188 |
+
transformer_depth_middle = default(
|
189 |
+
transformer_depth_middle, transformer_depth[-1]
|
190 |
+
)
|
191 |
+
|
192 |
+
self.num_res_blocks = num_res_blocks
|
193 |
+
self.attention_resolutions = attention_resolutions
|
194 |
+
self.dropout = dropout
|
195 |
+
self.channel_mult = channel_mult
|
196 |
+
self.conv_resample = conv_resample
|
197 |
+
self.num_classes = num_classes
|
198 |
+
self.use_checkpoint = use_checkpoint
|
199 |
+
self.num_heads = num_heads
|
200 |
+
self.num_head_channels = num_head_channels
|
201 |
+
self.num_heads_upsample = num_heads_upsample
|
202 |
+
self.dims = dims
|
203 |
+
self.use_scale_shift_norm = use_scale_shift_norm
|
204 |
+
self.resblock_updown = resblock_updown
|
205 |
+
self.transformer_depth = transformer_depth
|
206 |
+
self.transformer_depth_middle = transformer_depth_middle
|
207 |
+
self.context_dim = context_dim
|
208 |
+
self.time_downup = time_downup
|
209 |
+
self.time_context_dim = time_context_dim
|
210 |
+
self.extra_ff_mix_layer = extra_ff_mix_layer
|
211 |
+
self.use_spatial_context = use_spatial_context
|
212 |
+
self.merge_strategy = merge_strategy
|
213 |
+
self.merge_factor = merge_factor
|
214 |
+
self.spatial_transformer_attn_type = spatial_transformer_attn_type
|
215 |
+
self.video_kernel_size = video_kernel_size
|
216 |
+
self.use_linear_in_transformer = use_linear_in_transformer
|
217 |
+
self.adm_in_channels = adm_in_channels
|
218 |
+
self.disable_temporal_crossattention = disable_temporal_crossattention
|
219 |
+
self.max_ddpm_temb_period = max_ddpm_temb_period
|
220 |
+
|
221 |
+
time_embed_dim = model_channels * 4
|
222 |
+
self.time_embed = nn.Sequential(
|
223 |
+
linear(model_channels, time_embed_dim),
|
224 |
+
nn.SiLU(),
|
225 |
+
linear(time_embed_dim, time_embed_dim),
|
226 |
+
)
|
227 |
+
|
228 |
+
if self.num_classes is not None:
|
229 |
+
if isinstance(self.num_classes, int):
|
230 |
+
self.label_emb = nn.Embedding(num_classes, time_embed_dim)
|
231 |
+
elif self.num_classes == "continuous":
|
232 |
+
print("setting up linear c_adm embedding layer")
|
233 |
+
self.label_emb = nn.Linear(1, time_embed_dim)
|
234 |
+
elif self.num_classes == "timestep":
|
235 |
+
self.label_emb = nn.Sequential(
|
236 |
+
Timestep(model_channels),
|
237 |
+
nn.Sequential(
|
238 |
+
linear(model_channels, time_embed_dim),
|
239 |
+
nn.SiLU(),
|
240 |
+
linear(time_embed_dim, time_embed_dim),
|
241 |
+
),
|
242 |
+
)
|
243 |
+
|
244 |
+
elif self.num_classes == "sequential":
|
245 |
+
assert adm_in_channels is not None
|
246 |
+
self.label_emb = nn.Sequential(
|
247 |
+
nn.Sequential(
|
248 |
+
linear(adm_in_channels, time_embed_dim),
|
249 |
+
nn.SiLU(),
|
250 |
+
linear(time_embed_dim, time_embed_dim),
|
251 |
+
)
|
252 |
+
)
|
253 |
+
else:
|
254 |
+
raise ValueError()
|
255 |
+
|
256 |
+
self.input_blocks = nn.ModuleList(
|
257 |
+
[
|
258 |
+
TimestepEmbedSequential(
|
259 |
+
conv_nd(dims, in_channels, model_channels, 3, padding=1)
|
260 |
+
)
|
261 |
+
]
|
262 |
+
)
|
263 |
+
self._feature_size = model_channels
|
264 |
+
input_block_chans = [model_channels]
|
265 |
+
ch = model_channels
|
266 |
+
ds = 1
|
267 |
+
|
268 |
+
def get_attention_layer(
|
269 |
+
ch,
|
270 |
+
num_heads,
|
271 |
+
dim_head,
|
272 |
+
depth=1,
|
273 |
+
context_dim=None,
|
274 |
+
use_checkpoint=False,
|
275 |
+
disabled_sa=False,
|
276 |
+
):
|
277 |
+
return SpatialVideoTransformer(
|
278 |
+
ch,
|
279 |
+
num_heads,
|
280 |
+
dim_head,
|
281 |
+
depth=depth,
|
282 |
+
context_dim=context_dim,
|
283 |
+
time_context_dim=time_context_dim,
|
284 |
+
dropout=dropout,
|
285 |
+
ff_in=extra_ff_mix_layer,
|
286 |
+
use_spatial_context=use_spatial_context,
|
287 |
+
merge_strategy=merge_strategy,
|
288 |
+
merge_factor=merge_factor,
|
289 |
+
checkpoint=use_checkpoint,
|
290 |
+
use_linear=use_linear_in_transformer,
|
291 |
+
attn_mode=spatial_transformer_attn_type,
|
292 |
+
disable_self_attn=disabled_sa,
|
293 |
+
disable_temporal_crossattention=disable_temporal_crossattention,
|
294 |
+
max_time_embed_period=max_ddpm_temb_period,
|
295 |
+
)
|
296 |
+
|
297 |
+
def get_resblock(
|
298 |
+
merge_factor,
|
299 |
+
merge_strategy,
|
300 |
+
video_kernel_size,
|
301 |
+
ch,
|
302 |
+
time_embed_dim,
|
303 |
+
dropout,
|
304 |
+
out_ch,
|
305 |
+
dims,
|
306 |
+
use_checkpoint,
|
307 |
+
use_scale_shift_norm,
|
308 |
+
down=False,
|
309 |
+
up=False,
|
310 |
+
):
|
311 |
+
return VideoResBlock(
|
312 |
+
merge_factor=merge_factor,
|
313 |
+
merge_strategy=merge_strategy,
|
314 |
+
video_kernel_size=video_kernel_size,
|
315 |
+
channels=ch,
|
316 |
+
emb_channels=time_embed_dim,
|
317 |
+
dropout=dropout,
|
318 |
+
out_channels=out_ch,
|
319 |
+
dims=dims,
|
320 |
+
use_checkpoint=use_checkpoint,
|
321 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
322 |
+
down=down,
|
323 |
+
up=up,
|
324 |
+
)
|
325 |
+
|
326 |
+
for level, mult in enumerate(channel_mult):
|
327 |
+
for _ in range(num_res_blocks):
|
328 |
+
layers = [
|
329 |
+
get_resblock(
|
330 |
+
merge_factor=merge_factor,
|
331 |
+
merge_strategy=merge_strategy,
|
332 |
+
video_kernel_size=video_kernel_size,
|
333 |
+
ch=ch,
|
334 |
+
time_embed_dim=time_embed_dim,
|
335 |
+
dropout=dropout,
|
336 |
+
out_ch=mult * model_channels,
|
337 |
+
dims=dims,
|
338 |
+
use_checkpoint=use_checkpoint,
|
339 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
340 |
+
)
|
341 |
+
]
|
342 |
+
ch = mult * model_channels
|
343 |
+
if ds in attention_resolutions:
|
344 |
+
if num_head_channels == -1:
|
345 |
+
dim_head = ch // num_heads
|
346 |
+
else:
|
347 |
+
num_heads = ch // num_head_channels
|
348 |
+
dim_head = num_head_channels
|
349 |
+
|
350 |
+
layers.append(
|
351 |
+
get_attention_layer(
|
352 |
+
ch,
|
353 |
+
num_heads,
|
354 |
+
dim_head,
|
355 |
+
depth=transformer_depth[level],
|
356 |
+
context_dim=context_dim,
|
357 |
+
use_checkpoint=use_checkpoint,
|
358 |
+
disabled_sa=False,
|
359 |
+
)
|
360 |
+
)
|
361 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
362 |
+
self._feature_size += ch
|
363 |
+
input_block_chans.append(ch)
|
364 |
+
if level != len(channel_mult) - 1:
|
365 |
+
ds *= 2
|
366 |
+
out_ch = ch
|
367 |
+
self.input_blocks.append(
|
368 |
+
TimestepEmbedSequential(
|
369 |
+
get_resblock(
|
370 |
+
merge_factor=merge_factor,
|
371 |
+
merge_strategy=merge_strategy,
|
372 |
+
video_kernel_size=video_kernel_size,
|
373 |
+
ch=ch,
|
374 |
+
time_embed_dim=time_embed_dim,
|
375 |
+
dropout=dropout,
|
376 |
+
out_ch=out_ch,
|
377 |
+
dims=dims,
|
378 |
+
use_checkpoint=use_checkpoint,
|
379 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
380 |
+
down=True,
|
381 |
+
)
|
382 |
+
if resblock_updown
|
383 |
+
else Downsample(
|
384 |
+
ch,
|
385 |
+
conv_resample,
|
386 |
+
dims=dims,
|
387 |
+
out_channels=out_ch,
|
388 |
+
third_down=time_downup,
|
389 |
+
)
|
390 |
+
)
|
391 |
+
)
|
392 |
+
ch = out_ch
|
393 |
+
input_block_chans.append(ch)
|
394 |
+
|
395 |
+
self._feature_size += ch
|
396 |
+
|
397 |
+
if num_head_channels == -1:
|
398 |
+
dim_head = ch // num_heads
|
399 |
+
else:
|
400 |
+
num_heads = ch // num_head_channels
|
401 |
+
dim_head = num_head_channels
|
402 |
+
|
403 |
+
self.middle_block = TimestepEmbedSequential(
|
404 |
+
get_resblock(
|
405 |
+
merge_factor=merge_factor,
|
406 |
+
merge_strategy=merge_strategy,
|
407 |
+
video_kernel_size=video_kernel_size,
|
408 |
+
ch=ch,
|
409 |
+
time_embed_dim=time_embed_dim,
|
410 |
+
out_ch=None,
|
411 |
+
dropout=dropout,
|
412 |
+
dims=dims,
|
413 |
+
use_checkpoint=use_checkpoint,
|
414 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
415 |
+
),
|
416 |
+
get_attention_layer(
|
417 |
+
ch,
|
418 |
+
num_heads,
|
419 |
+
dim_head,
|
420 |
+
depth=transformer_depth_middle,
|
421 |
+
context_dim=context_dim,
|
422 |
+
use_checkpoint=use_checkpoint,
|
423 |
+
),
|
424 |
+
get_resblock(
|
425 |
+
merge_factor=merge_factor,
|
426 |
+
merge_strategy=merge_strategy,
|
427 |
+
video_kernel_size=video_kernel_size,
|
428 |
+
ch=ch,
|
429 |
+
out_ch=None,
|
430 |
+
time_embed_dim=time_embed_dim,
|
431 |
+
dropout=dropout,
|
432 |
+
dims=dims,
|
433 |
+
use_checkpoint=use_checkpoint,
|
434 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
435 |
+
),
|
436 |
+
)
|
437 |
+
self._feature_size += ch
|
438 |
+
|
439 |
+
self.merger = Merger(
|
440 |
+
merge_mode=merging_mode, input_channels=model_channels, frame_expansion=frame_expansion)
|
441 |
+
|
442 |
+
conditioning_channels = 3 if downsample_controlnet_cond else 4
|
443 |
+
block_out_channels = (320, 640, 1280, 1280)
|
444 |
+
|
445 |
+
self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
|
446 |
+
conditioning_embedding_channels=block_out_channels[0],
|
447 |
+
conditioning_channels=conditioning_channels,
|
448 |
+
block_out_channels=conditioning_embedding_out_channels,
|
449 |
+
downsample=downsample_controlnet_cond,
|
450 |
+
final_3d_conv=condition_encoder.endswith("3DConv"),
|
451 |
+
use_controlnet_mask=use_controlnet_mask,
|
452 |
+
use_normalization=use_image_encoder_normalization,
|
453 |
+
)
|
454 |
+
|
455 |
+
def forward(
|
456 |
+
self,
|
457 |
+
x: th.Tensor,
|
458 |
+
timesteps: th.Tensor,
|
459 |
+
controlnet_cond: th.Tensor,
|
460 |
+
context: Optional[th.Tensor] = None,
|
461 |
+
y: Optional[th.Tensor] = None,
|
462 |
+
time_context: Optional[th.Tensor] = None,
|
463 |
+
num_video_frames: Optional[int] = None,
|
464 |
+
num_video_frames_conditional: Optional[int] = None,
|
465 |
+
image_only_indicator: Optional[th.Tensor] = None,
|
466 |
+
):
|
467 |
+
assert (y is not None) == (
|
468 |
+
self.num_classes is not None
|
469 |
+
), "must specify y if and only if the model is class-conditional -> no, relax this TODO"
|
470 |
+
hs = []
|
471 |
+
t_emb = timestep_embedding(
|
472 |
+
timesteps, self.model_channels, repeat_only=False).to(x.dtype)
|
473 |
+
|
474 |
+
emb = self.time_embed(t_emb)
|
475 |
+
|
476 |
+
# TODO restrict y to [:self.num_frames] (conditional frames)
|
477 |
+
|
478 |
+
if self.num_classes is not None:
|
479 |
+
assert y.shape[0] == x.shape[0]
|
480 |
+
emb = emb + self.label_emb(y)
|
481 |
+
|
482 |
+
controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
|
483 |
+
|
484 |
+
h = x
|
485 |
+
for idx, module in enumerate(self.input_blocks):
|
486 |
+
h = module(
|
487 |
+
h,
|
488 |
+
emb,
|
489 |
+
context=context,
|
490 |
+
image_only_indicator=image_only_indicator,
|
491 |
+
time_context=time_context,
|
492 |
+
num_video_frames=num_video_frames,
|
493 |
+
)
|
494 |
+
if idx == 0:
|
495 |
+
h = self.merger(h, controlnet_cond, num_video_frames=num_video_frames,
|
496 |
+
num_video_frames_conditional=num_video_frames_conditional)
|
497 |
+
|
498 |
+
hs.append(h)
|
499 |
+
h = self.middle_block(
|
500 |
+
h,
|
501 |
+
emb,
|
502 |
+
context=context,
|
503 |
+
image_only_indicator=image_only_indicator,
|
504 |
+
time_context=time_context,
|
505 |
+
num_video_frames=num_video_frames,
|
506 |
+
)
|
507 |
+
|
508 |
+
# 5. Control net blocks
|
509 |
+
|
510 |
+
down_block_res_samples = hs
|
511 |
+
|
512 |
+
mid_block_res_sample = h
|
513 |
+
|
514 |
+
return (down_block_res_samples, mid_block_res_sample)
|
515 |
+
|
516 |
+
@classmethod
|
517 |
+
def from_unet(cls,
|
518 |
+
model: OpenAIWrapper,
|
519 |
+
merging_mode: str = "addition",
|
520 |
+
zero_conv_mode: str = "Identity",
|
521 |
+
frame_expansion: str = "none",
|
522 |
+
downsample_controlnet_cond: bool = True,
|
523 |
+
use_image_encoder_normalization: bool = False,
|
524 |
+
use_controlnet_mask: bool = False,
|
525 |
+
condition_encoder: str = "",
|
526 |
+
conditioning_embedding_out_channels: List[int] = None,
|
527 |
+
|
528 |
+
):
|
529 |
+
|
530 |
+
unet: VideoUNet = model.diffusion_model
|
531 |
+
|
532 |
+
controlnet = cls(in_channels=unet.in_channels,
|
533 |
+
model_channels=unet.model_channels,
|
534 |
+
out_channels=unet.out_channels,
|
535 |
+
num_res_blocks=unet.num_res_blocks,
|
536 |
+
attention_resolutions=unet.attention_resolutions,
|
537 |
+
dropout=unet.dropout,
|
538 |
+
channel_mult=unet.channel_mult,
|
539 |
+
conv_resample=unet.conv_resample,
|
540 |
+
dims=unet.dims,
|
541 |
+
num_classes=unet.num_classes,
|
542 |
+
use_checkpoint=unet.use_checkpoint,
|
543 |
+
num_heads=unet.num_heads,
|
544 |
+
num_head_channels=unet.num_head_channels,
|
545 |
+
num_heads_upsample=unet.num_heads_upsample,
|
546 |
+
use_scale_shift_norm=unet.use_scale_shift_norm,
|
547 |
+
resblock_updown=unet.resblock_updown,
|
548 |
+
transformer_depth=unet.transformer_depth,
|
549 |
+
transformer_depth_middle=unet.transformer_depth_middle,
|
550 |
+
context_dim=unet.context_dim,
|
551 |
+
time_downup=unet.time_downup,
|
552 |
+
time_context_dim=unet.time_context_dim,
|
553 |
+
extra_ff_mix_layer=unet.extra_ff_mix_layer,
|
554 |
+
use_spatial_context=unet.use_spatial_context,
|
555 |
+
merge_strategy=unet.merge_strategy,
|
556 |
+
merge_factor=unet.merge_factor,
|
557 |
+
spatial_transformer_attn_type=unet.spatial_transformer_attn_type,
|
558 |
+
video_kernel_size=unet.video_kernel_size,
|
559 |
+
use_linear_in_transformer=unet.use_linear_in_transformer,
|
560 |
+
adm_in_channels=unet.adm_in_channels,
|
561 |
+
disable_temporal_crossattention=unet.disable_temporal_crossattention,
|
562 |
+
max_ddpm_temb_period=unet.max_ddpm_temb_period, # up to here unet params
|
563 |
+
merging_mode=merging_mode,
|
564 |
+
zero_conv_mode=zero_conv_mode,
|
565 |
+
frame_expansion=frame_expansion,
|
566 |
+
downsample_controlnet_cond=downsample_controlnet_cond,
|
567 |
+
use_image_encoder_normalization=use_image_encoder_normalization,
|
568 |
+
use_controlnet_mask=use_controlnet_mask,
|
569 |
+
condition_encoder=condition_encoder,
|
570 |
+
conditioning_embedding_out_channels=conditioning_embedding_out_channels,
|
571 |
+
)
|
572 |
+
controlnet: ControlNet
|
573 |
+
|
574 |
+
return controlnet
|
575 |
+
|
576 |
+
|
577 |
+
def zero_module(module, reset=True):
|
578 |
+
if reset:
|
579 |
+
for p in module.parameters():
|
580 |
+
nn.init.zeros_(p)
|
581 |
+
return module
|
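Note (not part of the commit): the `zero_module` helper above is what lets a freshly attached control branch start out as a no-op, since its zero-initialised projections contribute nothing until training moves them. A minimal sanity check, using only torch and the same helper logic (the toy Conv2d layer and shapes are illustrative):

import torch
import torch.nn as nn

def zero_module(module, reset=True):
    # identical logic to controlnet.py: zero every parameter in place
    if reset:
        for p in module.parameters():
            nn.init.zeros_(p)
    return module

proj = zero_module(nn.Conv2d(320, 320, kernel_size=1))
x = torch.randn(1, 320, 8, 8)
print(proj(x).abs().max())  # tensor(0.) -> the control branch injects nothing at initialization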
models/diffusion/discretizer.py
ADDED
@@ -0,0 +1,33 @@
+import numpy as np
+import torch
+
+from models.svd.sgm.modules.diffusionmodules.discretizer import Discretization
+
+
+# Implementation of https://arxiv.org/abs/2404.14507
+class AlignYourSteps(Discretization):
+
+    def __init__(self, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+        self.sigma_min = sigma_min
+        self.sigma_max = sigma_max
+        self.rho = rho
+
+    def loglinear_interp(self, t_steps, num_steps):
+        """
+        Performs log-linear interpolation of a given array of decreasing numbers.
+        """
+        xs = np.linspace(0, 1, len(t_steps))
+        ys = np.log(t_steps[::-1])
+
+        new_xs = np.linspace(0, 1, num_steps)
+        new_ys = np.interp(new_xs, xs, ys)
+
+        interped_ys = np.exp(new_ys)[::-1].copy()
+        return interped_ys
+
+    def get_sigmas(self, n, device="cpu"):
+        sampling_schedule = [700.00, 54.5, 15.886, 7.977,
+                             4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.002]
+        sigmas = torch.from_numpy(self.loglinear_interp(
+            sampling_schedule, n)).to(device)
+        return sigmas
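Note (not part of the commit): `AlignYourSteps.get_sigmas` simply resamples the hard-coded 11-entry schedule above to the requested number of steps by interpolating in log-space. A self-contained sketch of that interpolation, with the same schedule and a hypothetical 30-step target:

import numpy as np

def loglinear_interp(t_steps, num_steps):
    # same computation as AlignYourSteps.loglinear_interp, without the class around it
    xs = np.linspace(0, 1, len(t_steps))
    ys = np.log(np.asarray(t_steps)[::-1])
    new_ys = np.interp(np.linspace(0, 1, num_steps), xs, ys)
    return np.exp(new_ys)[::-1].copy()

schedule = [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.002]
sigmas = loglinear_interp(schedule, 30)
print(len(sigmas), sigmas[0], sigmas[-1])  # 30 values; endpoints stay at ~700.0 and ~0.002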
models/diffusion/video_model.py
ADDED
@@ -0,0 +1,574 @@
+# Adapted from https://github.com/Stability-AI/generative-models/blob/main/sgm/modules/diffusionmodules/video_model.py
+from functools import partial
+from typing import List, Optional, Union
+
+from einops import rearrange
+
+from models.svd.sgm.modules.diffusionmodules.openaimodel import *
+from models.svd.sgm.modules.video_attention import SpatialVideoTransformer
+from models.svd.sgm.util import default
+from models.svd.sgm.modules.diffusionmodules.util import AlphaBlender
+from functools import partial
+from models.cam.conditioning import ConditionalModel
+
+
+class VideoResBlock(ResBlock):
+    def __init__(
+        self,
+        channels: int,
+        emb_channels: int,
+        dropout: float,
+        video_kernel_size: Union[int, List[int]] = 3,
+        merge_strategy: str = "fixed",
+        merge_factor: float = 0.5,
+        out_channels: Optional[int] = None,
+        use_conv: bool = False,
+        use_scale_shift_norm: bool = False,
+        dims: int = 2,
+        use_checkpoint: bool = False,
+        up: bool = False,
+        down: bool = False,
+    ):
+        super().__init__(
+            channels,
+            emb_channels,
+            dropout,
+            out_channels=out_channels,
+            use_conv=use_conv,
+            use_scale_shift_norm=use_scale_shift_norm,
+            dims=dims,
+            use_checkpoint=use_checkpoint,
+            up=up,
+            down=down,
+        )
+
+        self.time_stack = ResBlock(
+            default(out_channels, channels),
+            emb_channels,
+            dropout=dropout,
+            dims=3,
+            out_channels=default(out_channels, channels),
+            use_scale_shift_norm=False,
+            use_conv=False,
+            up=False,
+            down=False,
+            kernel_size=video_kernel_size,
+            use_checkpoint=use_checkpoint,
+            exchange_temb_dims=True,
+        )
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor,
+            merge_strategy=merge_strategy,
+            rearrange_pattern="b t -> b 1 t 1 1",
+        )
+
+    def forward(
+        self,
+        x: th.Tensor,
+        emb: th.Tensor,
+        num_video_frames: int,
+        image_only_indicator: Optional[th.Tensor] = None,
+    ) -> th.Tensor:
+        x = super().forward(x, emb)
+
+        x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=num_video_frames)
+        x = rearrange(x, "(b t) c h w -> b c t h w", t=num_video_frames)
+
+        x = self.time_stack(
+            x, rearrange(emb, "(b t) ... -> b t ...", t=num_video_frames)
+        )
+        x = self.time_mixer(
+            x_spatial=x_mix, x_temporal=x, image_only_indicator=image_only_indicator
+        )
+        x = rearrange(x, "b c t h w -> (b t) c h w")
+        return x
+
+
+class VideoUNet(nn.Module):
+    '''
+    Adapted from the vanilla SVD model. We add "cross_attention_merger_input_blocks" and "cross_attention_merger_mid_block" to incorporate the CAM control features.
+
+    '''
+
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        out_channels: int,
+        num_res_blocks: int,
+        num_conditional_frames: int,
+        attention_resolutions: Union[List[int], int],
+        dropout: float = 0.0,
+        channel_mult: List[int] = (1, 2, 4, 8),
+        conv_resample: bool = True,
+        dims: int = 2,
+        num_classes: Optional[Union[int, str]] = None,
+        use_checkpoint: bool = False,
+        num_heads: int = -1,
+        num_head_channels: int = -1,
+        num_heads_upsample: int = -1,
+        use_scale_shift_norm: bool = False,
+        resblock_updown: bool = False,
+        transformer_depth: Union[List[int], int] = 1,
+        transformer_depth_middle: Optional[int] = None,
+        context_dim: Optional[int] = None,
+        time_downup: bool = False,
+        time_context_dim: Optional[int] = None,
+        extra_ff_mix_layer: bool = False,
+        use_spatial_context: bool = False,
+        merge_strategy: str = "fixed",
+        merge_factor: float = 0.5,
+        spatial_transformer_attn_type: str = "softmax",
+        video_kernel_size: Union[int, List[int]] = 3,
+        use_linear_in_transformer: bool = False,
+        adm_in_channels: Optional[int] = None,
+        disable_temporal_crossattention: bool = False,
+        max_ddpm_temb_period: int = 10000,
+        merging_mode: str = "addition",
+        controlnet_mode: bool = False,
+        use_apm: bool = False,
+    ):
+        super().__init__()
+        assert context_dim is not None
+        self.controlnet_mode = controlnet_mode
+        if controlnet_mode:
+            assert merging_mode.startswith(
+                "attention"), "other merging modes not implemented"
+            AttentionCondModel = partial(
+                ConditionalModel, conditional_model=merging_mode.split("attention_")[1])
+            self.cross_attention_merger_input_blocks = nn.ModuleList([])
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+
+        if num_heads == -1:
+            assert num_head_channels != -1
+
+        if num_head_channels == -1:
+            assert num_heads != -1
+
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        if isinstance(transformer_depth, int):
+            transformer_depth = len(channel_mult) * [transformer_depth]
+        transformer_depth_middle = default(
+            transformer_depth_middle, transformer_depth[-1]
+        )
+
+        self.num_res_blocks = num_res_blocks
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.num_heads_upsample = num_heads_upsample
+        self.dims = dims
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.resblock_updown = resblock_updown
+        self.transformer_depth = transformer_depth
+        self.transformer_depth_middle = transformer_depth_middle
+        self.context_dim = context_dim
+        self.time_downup = time_downup
+        self.time_context_dim = time_context_dim
+        self.extra_ff_mix_layer = extra_ff_mix_layer
+        self.use_spatial_context = use_spatial_context
+        self.merge_strategy = merge_strategy
+        self.merge_factor = merge_factor
+        self.spatial_transformer_attn_type = spatial_transformer_attn_type
+        self.video_kernel_size = video_kernel_size
+        self.use_linear_in_transformer = use_linear_in_transformer
+        self.adm_in_channels = adm_in_channels
+        self.disable_temporal_crossattention = disable_temporal_crossattention
+        self.max_ddpm_temb_period = max_ddpm_temb_period
+
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+
+        if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "timestep":
+                self.label_emb = nn.Sequential(
+                    Timestep(model_channels),
+                    nn.Sequential(
+                        linear(model_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    ),
+                )
+
+            elif self.num_classes == "sequential":
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    )
+                )
+            else:
+                raise ValueError()
+
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
+                )
+            ]
+        )
+        self._feature_size = model_channels
+        input_block_chans = [model_channels]
+        ch = model_channels
+        ds = 1
+        if controlnet_mode and merging_mode.startswith("attention"):
+            self.cross_attention_merger_input_blocks.append(
+                AttentionCondModel(input_channels=ch))
+
+        def get_attention_layer(
+            ch,
+            num_heads,
+            dim_head,
+            depth=1,
+            context_dim=None,
+            use_checkpoint=False,
+            disabled_sa=False,
+            use_apm: bool = False,
+        ):
+            return SpatialVideoTransformer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=depth,
+                context_dim=context_dim,
+                time_context_dim=time_context_dim,
+                dropout=dropout,
+                ff_in=extra_ff_mix_layer,
+                use_spatial_context=use_spatial_context,
+                merge_strategy=merge_strategy,
+                merge_factor=merge_factor,
+                checkpoint=use_checkpoint,
+                use_linear=use_linear_in_transformer,
+                attn_mode=spatial_transformer_attn_type,
+                disable_self_attn=disabled_sa,
+                disable_temporal_crossattention=disable_temporal_crossattention,
+                max_time_embed_period=max_ddpm_temb_period,
+                use_apm=use_apm,
+            )
+
+        def get_resblock(
+            merge_factor,
+            merge_strategy,
+            video_kernel_size,
+            ch,
+            time_embed_dim,
+            dropout,
+            out_ch,
+            dims,
+            use_checkpoint,
+            use_scale_shift_norm,
+            down=False,
+            up=False,
+        ):
+            return VideoResBlock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                channels=ch,
+                emb_channels=time_embed_dim,
+                dropout=dropout,
+                out_channels=out_ch,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+                down=down,
+                up=up,
+            )
+
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    get_resblock(
+                        merge_factor=merge_factor,
+                        merge_strategy=merge_strategy,
+                        video_kernel_size=video_kernel_size,
+                        ch=ch,
+                        time_embed_dim=time_embed_dim,
+                        dropout=dropout,
+                        out_ch=mult * model_channels,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    layers.append(
+                        get_attention_layer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth[level],
+                            context_dim=context_dim,
+                            use_checkpoint=use_checkpoint,
+                            disabled_sa=False,
+                            use_apm=use_apm,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                if controlnet_mode and merging_mode.startswith("attention"):
+                    self.cross_attention_merger_input_blocks.append(
+                        AttentionCondModel(input_channels=ch))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                ds *= 2
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        get_resblock(
+                            merge_factor=merge_factor,
+                            merge_strategy=merge_strategy,
+                            video_kernel_size=video_kernel_size,
+                            ch=ch,
+                            time_embed_dim=time_embed_dim,
+                            dropout=dropout,
+                            out_ch=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(
+                            ch,
+                            conv_resample,
+                            dims=dims,
+                            out_channels=out_ch,
+                            third_down=time_downup,
+                        )
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+
+                if controlnet_mode and merging_mode.startswith("attention"):
+                    self.cross_attention_merger_input_blocks.append(
+                        AttentionCondModel(input_channels=ch))
+                self._feature_size += ch
+
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+
+        self.middle_block = TimestepEmbedSequential(
+            get_resblock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                ch=ch,
+                time_embed_dim=time_embed_dim,
+                out_ch=None,
+                dropout=dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            get_attention_layer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=transformer_depth_middle,
+                context_dim=context_dim,
+                use_checkpoint=use_checkpoint,
+                use_apm=use_apm,
+            ),
+            get_resblock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                ch=ch,
+                out_ch=None,
+                time_embed_dim=time_embed_dim,
+                dropout=dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+        if controlnet_mode and merging_mode.startswith("attention"):
+            self.cross_attention_merger_mid_block = AttentionCondModel(
+                input_channels=ch)
+
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(num_res_blocks + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    get_resblock(
+                        merge_factor=merge_factor,
+                        merge_strategy=merge_strategy,
+                        video_kernel_size=video_kernel_size,
+                        ch=ch + ich,
+                        time_embed_dim=time_embed_dim,
+                        dropout=dropout,
+                        out_ch=model_channels * mult,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = model_channels * mult
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    layers.append(
+                        get_attention_layer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth[level],
+                            context_dim=context_dim,
+                            use_checkpoint=use_checkpoint,
+                            disabled_sa=False,
+                            use_apm=use_apm,
+                        )
+                    )
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    ds //= 2
+                    layers.append(
+                        get_resblock(
+                            merge_factor=merge_factor,
+                            merge_strategy=merge_strategy,
+                            video_kernel_size=video_kernel_size,
+                            ch=ch,
+                            time_embed_dim=time_embed_dim,
+                            dropout=dropout,
+                            out_ch=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True,
+                        )
+                        if resblock_updown
+                        else Upsample(
+                            ch,
+                            conv_resample,
+                            dims=dims,
+                            out_channels=out_ch,
+                            third_up=time_downup,
+                        )
+                    )
+
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, model_channels,
+                        out_channels, 3, padding=1)),
+        )
+
+    def forward(
+        self,
+        # [28,8,72,128], i.e. (B F) (2 C) H W = concat([z_t,<cond_frames>])
+        x: th.Tensor,
+        timesteps: th.Tensor,  # [28], i.e. (B F)
+        # [28, 1, 1024], i.e. (B F) 1 T, for cross attention from clip image encoder, <cond_frames_without_noise>
+        context: Optional[th.Tensor] = None,
+        # [28, 768], i.e. (B F) T ? concat([<fps_id>,<motion_bucket_id>,<cond_aug>]
+        y: Optional[th.Tensor] = None,
+        time_context: Optional[th.Tensor] = None,  # NONE
+        num_video_frames: Optional[int] = None,  # 14
+        num_conditional_frames: Optional[int] = None,  # 8
+        # zeros, [2,14], i.e. [B, F]
+        image_only_indicator: Optional[th.Tensor] = None,
+        hs_control_input: Optional[th.Tensor] = None,  # cam features
+        hs_control_mid: Optional[th.Tensor] = None,  # cam features
+    ):
+        assert (y is not None) == (
+            self.num_classes is not None
+        ), "must specify y if and only if the model is class-conditional -> no, relax this TODO"
+        hs = []
+        t_emb = timestep_embedding(
+            timesteps, self.model_channels, repeat_only=False).to(x.dtype)
+        emb = self.time_embed(t_emb)
+
+        if self.num_classes is not None:
+            assert y.shape[0] == x.shape[0]
+            emb = emb + self.label_emb(y)
+
+        h = x
+        for module in self.input_blocks:
+            h = module(
+                h,
+                emb,
+                context=context,
+                image_only_indicator=image_only_indicator,
+                time_context=time_context,
+                num_video_frames=num_video_frames,
+            )
+            hs.append(h)
+
+        # fusion of cam features with base features
+        if hs_control_input is not None:
+            new_hs = []
+
+            assert len(hs) == len(hs_control_input) and len(
+                hs) == len(self.cross_attention_merger_input_blocks)
+            for h_no_ctrl, h_ctrl, merger in zip(hs, hs_control_input, self.cross_attention_merger_input_blocks):
+                merged_h = merger(h_no_ctrl, h_ctrl, num_frames=num_video_frames,
+                                  num_conditional_frames=num_conditional_frames)
+                new_hs.append(merged_h)
+            hs = new_hs
+
+        h = self.middle_block(
+            h,
+            emb,
+            context=context,
+            image_only_indicator=image_only_indicator,
+            time_context=time_context,
+            num_video_frames=num_video_frames,
+        )
+
+        # fusion of cam features with base features
+        if hs_control_mid is not None:
+            h = self.cross_attention_merger_mid_block(
+                h, hs_control_mid, num_frames=num_video_frames, num_conditional_frames=num_conditional_frames)
+
+        for module in self.output_blocks:
+            h = th.cat([h, hs.pop()], dim=1)
+            h = module(
+                h,
+                emb,
+                context=context,
+                image_only_indicator=image_only_indicator,
+                time_context=time_context,
+                num_video_frames=num_video_frames,
+            )
+        h = h.type(x.dtype)
+        return self.out(h)
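Note (not part of the commit): `VideoResBlock.forward` relies on the convention that spatial layers see frames folded into the batch axis ("(b t) c h w"), while the temporal ResBlock sees an explicit time axis ("b c t h w"). A tiny einops round-trip with assumed toy shapes (2 videos of 14 frames) illustrates the reshaping:

import torch
from einops import rearrange

B, T, C, H, W = 2, 14, 4, 8, 8                        # assumed toy sizes
x = torch.randn(B * T, C, H, W)                       # layout used by the spatial (2D) layers
x_3d = rearrange(x, "(b t) c h w -> b c t h w", t=T)  # layout used by self.time_stack
x_back = rearrange(x_3d, "b c t h w -> (b t) c h w")
print(x_3d.shape, torch.equal(x, x_back))             # torch.Size([2, 4, 14, 8, 8]) True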
models/diffusion/wrappers.py
ADDED
@@ -0,0 +1,78 @@
+
+import torch
+from models.svd.sgm.modules.diffusionmodules.wrappers import OpenAIWrapper
+from einops import rearrange, repeat
+
+
+class StreamingWrapper(OpenAIWrapper):
+    """
+    Model wrapper for StreamingSVD, which holds the CAM model and the base model.
+
+    """
+
+    def __init__(self, diffusion_model, controlnet, num_frame_conditioning: int, compile_model: bool = False, pipeline_offloading: bool = False):
+        super().__init__(diffusion_model=diffusion_model,
+                         compile_model=compile_model)
+        self.controlnet = controlnet
+        self.num_frame_conditioning = num_frame_conditioning
+        self.pipeline_offloading = pipeline_offloading
+        if pipeline_offloading:
+            raise NotImplementedError(
+                "Pipeline offloading for StreamingI2V not implemented yet.")
+
+    def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, **kwargs):
+
+        batch_size = kwargs.pop("batch_size")
+
+        # We apply the controlnet model only to the control frames.
+        def reduce_to_cond_frames(input):
+            input = rearrange(input, "(B F) ... -> B F ...", B=batch_size)
+            input = input[:, :self.num_frame_conditioning]
+            return rearrange(input, "B F ... -> (B F) ...")
+
+        x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1)
+        x_ctrl = reduce_to_cond_frames(x)
+        t_ctrl = reduce_to_cond_frames(t)
+
+        context = c.get("crossattn", None)
+        # controlnet is not using APM so we remove potentially additional tokens
+        context_ctrl = context[:, :1]
+        context_ctrl = reduce_to_cond_frames(context_ctrl)
+        y = c.get("vector", None)
+        y_ctrl = reduce_to_cond_frames(y)
+        num_video_frames = kwargs.pop("num_video_frames")
+        image_only_indicator = kwargs.pop("image_only_indicator")
+        ctrl_img_enc_frames = repeat(
+            kwargs['ctrl_frames'], "B ... -> (2 B) ... ")
+        controlnet_cond = rearrange(
+            ctrl_img_enc_frames, "B F ... -> (B F) ...")
+
+        if self.diffusion_model.controlnet_mode:
+            hs_control_input, hs_control_mid = self.controlnet(x=x_ctrl,  # video latent
+                                                               timesteps=t_ctrl,  # timestep
+                                                               context=context_ctrl,  # clip image conditioning
+                                                               y=y_ctrl,  # conditionings, e.g. fps
+                                                               controlnet_cond=controlnet_cond,  # control frames
+                                                               num_video_frames=self.num_frame_conditioning,
+                                                               num_video_frames_conditional=self.num_frame_conditioning,
+                                                               image_only_indicator=image_only_indicator[:,
+                                                                                                         :self.num_frame_conditioning]
+                                                               )
+        else:
+            hs_control_input = None
+            hs_control_mid = None
+        kwargs["hs_control_input"] = hs_control_input
+        kwargs["hs_control_mid"] = hs_control_mid
+
+        out = self.diffusion_model(
+            x=x,
+            timesteps=t,
+            context=context,  # must be (B F) T C
+            y=y,  # must be (B F) 768
+            num_video_frames=num_video_frames,
+            num_conditional_frames=self.num_frame_conditioning,
+            image_only_indicator=image_only_indicator,
+            hs_control_input=hs_control_input,
+            hs_control_mid=hs_control_mid,
+        )
+        return out
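Note (not part of the commit): the `reduce_to_cond_frames` closure in `StreamingWrapper.forward` keeps only the first `num_frame_conditioning` frames of every batch element before they are handed to the control branch. A standalone sketch with assumed sizes (batch 2, 14 frames, 8 conditional frames):

import torch
from einops import rearrange

batch_size, num_frames, num_cond = 2, 14, 8            # assumed values
x = torch.randn(batch_size * num_frames, 4, 72, 128)   # (B F) C H W, as in the wrapper

def reduce_to_cond_frames(inp):
    inp = rearrange(inp, "(B F) ... -> B F ...", B=batch_size)
    return rearrange(inp[:, :num_cond], "B F ... -> (B F) ...")

print(reduce_to_cond_frames(x).shape)  # torch.Size([16, 4, 72, 128])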
models/svd/sgm/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from models.svd.sgm.models import AutoencodingEngine, DiffusionEngine
+from models.svd.sgm.util import get_configs_path, instantiate_from_config
+
+__version__ = "0.1.0"
models/svd/sgm/data/__init__.py
ADDED
@@ -0,0 +1 @@
+from .dataset import StableDataModuleFromConfig
models/svd/sgm/data/cifar10.py
ADDED
@@ -0,0 +1,67 @@
+import pytorch_lightning as pl
+import torchvision
+from torch.utils.data import DataLoader, Dataset
+from torchvision import transforms
+
+
+class CIFAR10DataDictWrapper(Dataset):
+    def __init__(self, dset):
+        super().__init__()
+        self.dset = dset
+
+    def __getitem__(self, i):
+        x, y = self.dset[i]
+        return {"jpg": x, "cls": y}
+
+    def __len__(self):
+        return len(self.dset)
+
+
+class CIFAR10Loader(pl.LightningDataModule):
+    def __init__(self, batch_size, num_workers=0, shuffle=True):
+        super().__init__()
+
+        transform = transforms.Compose(
+            [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
+        )
+
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.shuffle = shuffle
+        self.train_dataset = CIFAR10DataDictWrapper(
+            torchvision.datasets.CIFAR10(
+                root=".data/", train=True, download=True, transform=transform
+            )
+        )
+        self.test_dataset = CIFAR10DataDictWrapper(
+            torchvision.datasets.CIFAR10(
+                root=".data/", train=False, download=True, transform=transform
+            )
+        )
+
+    def prepare_data(self):
+        pass
+
+    def train_dataloader(self):
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+        )
+
+    def test_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+        )
+
+    def val_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+        )
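Note (not part of the commit): the dict wrapper exists because the training code expects dict-style batches keyed by strings such as "jpg" and "cls" rather than (image, label) tuples. A hypothetical quick check, assuming the module above is importable (CIFAR-10 is downloaded into .data/ on first run):

module = CIFAR10Loader(batch_size=4, num_workers=0)
batch = next(iter(module.train_dataloader()))
print(batch["jpg"].shape, batch["cls"].shape)  # torch.Size([4, 3, 32, 32]) torch.Size([4])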
models/svd/sgm/data/dataset.py
ADDED
@@ -0,0 +1,80 @@
+from typing import Optional
+
+import torchdata.datapipes.iter
+import webdataset as wds
+from omegaconf import DictConfig
+from pytorch_lightning import LightningDataModule
+
+try:
+    from sdata import create_dataset, create_dummy_dataset, create_loader
+except ImportError as e:
+    print("#" * 100)
+    print("Datasets not yet available")
+    print("to enable, we need to add stable-datasets as a submodule")
+    print("please use ``git submodule update --init --recursive``")
+    print("and do ``pip install -e stable-datasets/`` from the root of this repo")
+    print("#" * 100)
+    exit(1)
+
+
+class StableDataModuleFromConfig(LightningDataModule):
+    def __init__(
+        self,
+        train: DictConfig,
+        validation: Optional[DictConfig] = None,
+        test: Optional[DictConfig] = None,
+        skip_val_loader: bool = False,
+        dummy: bool = False,
+    ):
+        super().__init__()
+        self.train_config = train
+        assert (
+            "datapipeline" in self.train_config and "loader" in self.train_config
+        ), "train config requires the fields `datapipeline` and `loader`"
+
+        self.val_config = validation
+        if not skip_val_loader:
+            if self.val_config is not None:
+                assert (
+                    "datapipeline" in self.val_config and "loader" in self.val_config
+                ), "validation config requires the fields `datapipeline` and `loader`"
+            else:
+                print(
+                    "Warning: No Validation datapipeline defined, using that one from training"
+                )
+                self.val_config = train
+
+        self.test_config = test
+        if self.test_config is not None:
+            assert (
+                "datapipeline" in self.test_config and "loader" in self.test_config
+            ), "test config requires the fields `datapipeline` and `loader`"
+
+        self.dummy = dummy
+        if self.dummy:
+            print("#" * 100)
+            print("USING DUMMY DATASET: HOPE YOU'RE DEBUGGING ;)")
+            print("#" * 100)
+
+    def setup(self, stage: str) -> None:
+        print("Preparing datasets")
+        if self.dummy:
+            data_fn = create_dummy_dataset
+        else:
+            data_fn = create_dataset
+
+        self.train_datapipeline = data_fn(**self.train_config.datapipeline)
+        if self.val_config:
+            self.val_datapipeline = data_fn(**self.val_config.datapipeline)
+        if self.test_config:
+            self.test_datapipeline = data_fn(**self.test_config.datapipeline)
+
+    def train_dataloader(self) -> torchdata.datapipes.iter.IterDataPipe:
+        loader = create_loader(self.train_datapipeline, **self.train_config.loader)
+        return loader
+
+    def val_dataloader(self) -> wds.DataPipeline:
+        return create_loader(self.val_datapipeline, **self.val_config.loader)
+
+    def test_dataloader(self) -> wds.DataPipeline:
+        return create_loader(self.test_datapipeline, **self.test_config.loader)
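Note (not part of the commit): `StableDataModuleFromConfig` only checks that each split config carries a `datapipeline` and a `loader` section and then forwards their contents to `create_dataset`/`create_loader` from the stable-datasets submodule. A hypothetical minimal config sketch; the keys inside the two sections are illustrative and depend on that submodule, and it assumes the submodule is installed (the import above exits otherwise):

from omegaconf import OmegaConf

train_cfg = OmegaConf.create(
    {
        "datapipeline": {"urls": ["data/shards/{00000..00009}.tar"]},  # illustrative keys only
        "loader": {"batch_size": 4, "num_workers": 2},                 # illustrative keys only
    }
)
data_module = StableDataModuleFromConfig(train=train_cfg, dummy=True)  # dummy=True selects create_dummy_dataset in setup()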
models/svd/sgm/data/mnist.py
ADDED
@@ -0,0 +1,85 @@
+import pytorch_lightning as pl
+import torchvision
+from torch.utils.data import DataLoader, Dataset
+from torchvision import transforms
+
+
+class MNISTDataDictWrapper(Dataset):
+    def __init__(self, dset):
+        super().__init__()
+        self.dset = dset
+
+    def __getitem__(self, i):
+        x, y = self.dset[i]
+        return {"jpg": x, "cls": y}
+
+    def __len__(self):
+        return len(self.dset)
+
+
+class MNISTLoader(pl.LightningDataModule):
+    def __init__(self, batch_size, num_workers=0, prefetch_factor=2, shuffle=True):
+        super().__init__()
+
+        transform = transforms.Compose(
+            [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
+        )
+
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.prefetch_factor = prefetch_factor if num_workers > 0 else 0
+        self.shuffle = shuffle
+        self.train_dataset = MNISTDataDictWrapper(
+            torchvision.datasets.MNIST(
+                root=".data/", train=True, download=True, transform=transform
+            )
+        )
+        self.test_dataset = MNISTDataDictWrapper(
+            torchvision.datasets.MNIST(
+                root=".data/", train=False, download=True, transform=transform
+            )
+        )
+
+    def prepare_data(self):
+        pass
+
+    def train_dataloader(self):
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+            prefetch_factor=self.prefetch_factor,
+        )
+
+    def test_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+            prefetch_factor=self.prefetch_factor,
+        )
+
+    def val_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+            prefetch_factor=self.prefetch_factor,
+        )
+
+
+if __name__ == "__main__":
+    dset = MNISTDataDictWrapper(
+        torchvision.datasets.MNIST(
+            root=".data/",
+            train=False,
+            download=True,
+            transform=transforms.Compose(
+                [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
+            ),
+        )
+    )
+    ex = dset[0]
models/svd/sgm/inference/api.py
ADDED
@@ -0,0 +1,385 @@
+import pathlib
+from dataclasses import asdict, dataclass
+from enum import Enum
+from typing import Optional
+
+from omegaconf import OmegaConf
+
+from sgm.inference.helpers import (Img2ImgDiscretizationWrapper, do_img2img,
+                                   do_sample)
+from sgm.modules.diffusionmodules.sampling import (DPMPP2MSampler,
+                                                   DPMPP2SAncestralSampler,
+                                                   EulerAncestralSampler,
+                                                   EulerEDMSampler,
+                                                   HeunEDMSampler,
+                                                   LinearMultistepSampler)
+from sgm.util import load_model_from_config
+
+
+class ModelArchitecture(str, Enum):
+    SD_2_1 = "stable-diffusion-v2-1"
+    SD_2_1_768 = "stable-diffusion-v2-1-768"
+    SDXL_V0_9_BASE = "stable-diffusion-xl-v0-9-base"
+    SDXL_V0_9_REFINER = "stable-diffusion-xl-v0-9-refiner"
+    SDXL_V1_BASE = "stable-diffusion-xl-v1-base"
+    SDXL_V1_REFINER = "stable-diffusion-xl-v1-refiner"
+
+
+class Sampler(str, Enum):
+    EULER_EDM = "EulerEDMSampler"
+    HEUN_EDM = "HeunEDMSampler"
+    EULER_ANCESTRAL = "EulerAncestralSampler"
+    DPMPP2S_ANCESTRAL = "DPMPP2SAncestralSampler"
+    DPMPP2M = "DPMPP2MSampler"
+    LINEAR_MULTISTEP = "LinearMultistepSampler"
+
+
+class Discretization(str, Enum):
+    LEGACY_DDPM = "LegacyDDPMDiscretization"
+    EDM = "EDMDiscretization"
+
+
+class Guider(str, Enum):
+    VANILLA = "VanillaCFG"
+    IDENTITY = "IdentityGuider"
+
+
+class Thresholder(str, Enum):
+    NONE = "None"
+
+
+@dataclass
+class SamplingParams:
+    width: int = 1024
+    height: int = 1024
+    steps: int = 50
+    sampler: Sampler = Sampler.DPMPP2M
+    discretization: Discretization = Discretization.LEGACY_DDPM
+    guider: Guider = Guider.VANILLA
+    thresholder: Thresholder = Thresholder.NONE
+    scale: float = 6.0
+    aesthetic_score: float = 5.0
+    negative_aesthetic_score: float = 5.0
+    img2img_strength: float = 1.0
+    orig_width: int = 1024
+    orig_height: int = 1024
+    crop_coords_top: int = 0
+    crop_coords_left: int = 0
+    sigma_min: float = 0.0292
+    sigma_max: float = 14.6146
+    rho: float = 3.0
+    s_churn: float = 0.0
+    s_tmin: float = 0.0
+    s_tmax: float = 999.0
+    s_noise: float = 1.0
+    eta: float = 1.0
+    order: int = 4
+
+
+@dataclass
+class SamplingSpec:
+    width: int
+    height: int
+    channels: int
+    factor: int
+    is_legacy: bool
+    config: str
+    ckpt: str
+    is_guided: bool
+
+
+model_specs = {
+    ModelArchitecture.SD_2_1: SamplingSpec(
+        height=512,
+        width=512,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_2_1.yaml",
+        ckpt="v2-1_512-ema-pruned.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SD_2_1_768: SamplingSpec(
+        height=768,
+        width=768,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_2_1_768.yaml",
+        ckpt="v2-1_768-ema-pruned.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V0_9_BASE: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=False,
+        config="sd_xl_base.yaml",
+        ckpt="sd_xl_base_0.9.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V0_9_REFINER: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_xl_refiner.yaml",
+        ckpt="sd_xl_refiner_0.9.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V1_BASE: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=False,
+        config="sd_xl_base.yaml",
+        ckpt="sd_xl_base_1.0.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V1_REFINER: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_xl_refiner.yaml",
+        ckpt="sd_xl_refiner_1.0.safetensors",
+        is_guided=True,
+    ),
+}
+
+
+class SamplingPipeline:
+    def __init__(
+        self,
+        model_id: ModelArchitecture,
+        model_path="checkpoints",
+        config_path="configs/inference",
+        device="cuda",
+        use_fp16=True,
+    ) -> None:
+        if model_id not in model_specs:
+            raise ValueError(f"Model {model_id} not supported")
+        self.model_id = model_id
+        self.specs = model_specs[self.model_id]
+        self.config = str(pathlib.Path(config_path, self.specs.config))
+        self.ckpt = str(pathlib.Path(model_path, self.specs.ckpt))
+        self.device = device
+        self.model = self._load_model(device=device, use_fp16=use_fp16)
+
+    def _load_model(self, device="cuda", use_fp16=True):
+        config = OmegaConf.load(self.config)
+        model = load_model_from_config(config, self.ckpt)
+        if model is None:
+            raise ValueError(f"Model {self.model_id} could not be loaded")
+        model.to(device)
+        if use_fp16:
+            model.conditioner.half()
+            model.model.half()
+        return model
+
+    def text_to_image(
+        self,
+        params: SamplingParams,
+        prompt: str,
+        negative_prompt: str = "",
+        samples: int = 1,
+        return_latents: bool = False,
+    ):
+        sampler = get_sampler_config(params)
+        value_dict = asdict(params)
+        value_dict["prompt"] = prompt
+        value_dict["negative_prompt"] = negative_prompt
+        value_dict["target_width"] = params.width
+        value_dict["target_height"] = params.height
+        return do_sample(
+            self.model,
+            sampler,
+            value_dict,
+            samples,
+            params.height,
+            params.width,
+            self.specs.channels,
+            self.specs.factor,
+            force_uc_zero_embeddings=["txt"] if not self.specs.is_legacy else [],
+            return_latents=return_latents,
+            filter=None,
+        )
+
+    def image_to_image(
+        self,
+        params: SamplingParams,
+        image,
+        prompt: str,
+        negative_prompt: str = "",
+        samples: int = 1,
+        return_latents: bool = False,
+    ):
+        sampler = get_sampler_config(params)
+
+        if params.img2img_strength < 1.0:
+            sampler.discretization = Img2ImgDiscretizationWrapper(
+                sampler.discretization,
+                strength=params.img2img_strength,
+            )
+        height, width = image.shape[2], image.shape[3]
+        value_dict = asdict(params)
+        value_dict["prompt"] = prompt
+        value_dict["negative_prompt"] = negative_prompt
+        value_dict["target_width"] = width
+        value_dict["target_height"] = height
+        return do_img2img(
+            image,
+            self.model,
+            sampler,
+            value_dict,
+            samples,
+            force_uc_zero_embeddings=["txt"] if not self.specs.is_legacy else [],
+            return_latents=return_latents,
+            filter=None,
+        )
+
+    def refiner(
+        self,
+        params: SamplingParams,
+        image,
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        samples: int = 1,
+        return_latents: bool = False,
+    ):
+        sampler = get_sampler_config(params)
+        value_dict = {
+            "orig_width": image.shape[3] * 8,
+            "orig_height": image.shape[2] * 8,
+            "target_width": image.shape[3] * 8,
+            "target_height": image.shape[2] * 8,
+            "prompt": prompt,
+            "negative_prompt": negative_prompt,
+            "crop_coords_top": 0,
+            "crop_coords_left": 0,
+            "aesthetic_score": 6.0,
+            "negative_aesthetic_score": 2.5,
+        }
+
+        return do_img2img(
+            image,
+            self.model,
+            sampler,
+            value_dict,
+            samples,
+            skip_encode=True,
+            return_latents=return_latents,
+            filter=None,
+        )
+
+
+def get_guider_config(params: SamplingParams):
+    if params.guider == Guider.IDENTITY:
+        guider_config = {
+            "target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"
+        }
+    elif params.guider == Guider.VANILLA:
+        scale = params.scale
+
+        thresholder = params.thresholder
+
+        if thresholder == Thresholder.NONE:
+            dyn_thresh_config = {
+                "target": "sgm.modules.diffusionmodules.sampling_utils.NoDynamicThresholding"
+            }
+        else:
+            raise NotImplementedError
+
+        guider_config = {
+            "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
+            "params": {"scale": scale, "dyn_thresh_config": dyn_thresh_config},
+        }
+    else:
+        raise NotImplementedError
+    return guider_config
+
+
+def get_discretization_config(params: SamplingParams):
+    if params.discretization == Discretization.LEGACY_DDPM:
+        discretization_config = {
+            "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
+        }
+    elif params.discretization == Discretization.EDM:
+        discretization_config = {
+            "target": "sgm.modules.diffusionmodules.discretizer.EDMDiscretization",
+            "params": {
+                "sigma_min": params.sigma_min,
+                "sigma_max": params.sigma_max,
+                "rho": params.rho,
+            },
+        }
+    else:
+        raise ValueError(f"unknown discretization {params.discretization}")
+    return discretization_config
+
+
+def get_sampler_config(params: SamplingParams):
+    discretization_config = get_discretization_config(params)
+    guider_config = get_guider_config(params)
+    sampler = None
+    if params.sampler == Sampler.EULER_EDM:
+        return EulerEDMSampler(
+            num_steps=params.steps,
+            discretization_config=discretization_config,
+            guider_config=guider_config,
+            s_churn=params.s_churn,
+            s_tmin=params.s_tmin,
+            s_tmax=params.s_tmax,
+            s_noise=params.s_noise,
+            verbose=True,
+        )
+    if params.sampler == Sampler.HEUN_EDM:
+        return HeunEDMSampler(
+            num_steps=params.steps,
+            discretization_config=discretization_config,
+            guider_config=guider_config,
+            s_churn=params.s_churn,
+            s_tmin=params.s_tmin,
+            s_tmax=params.s_tmax,
+            s_noise=params.s_noise,
+            verbose=True,
+        )
+    if params.sampler == Sampler.EULER_ANCESTRAL:
+        return EulerAncestralSampler(
+            num_steps=params.steps,
+            discretization_config=discretization_config,
+            guider_config=guider_config,
+            eta=params.eta,
+            s_noise=params.s_noise,
+            verbose=True,
+        )
+    if params.sampler == Sampler.DPMPP2S_ANCESTRAL:
+        return DPMPP2SAncestralSampler(
+            num_steps=params.steps,
+            discretization_config=discretization_config,
+            guider_config=guider_config,
+            eta=params.eta,
+            s_noise=params.s_noise,
+            verbose=True,
+        )
+    if params.sampler == Sampler.DPMPP2M:
+        return DPMPP2MSampler(
+            num_steps=params.steps,
+            discretization_config=discretization_config,
+            guider_config=guider_config,
+            verbose=True,
+        )
+    if params.sampler == Sampler.LINEAR_MULTISTEP:
+        return LinearMultistepSampler(
+            num_steps=params.steps,
+            discretization_config=discretization_config,
+            guider_config=guider_config,
+            order=params.order,
+            verbose=True,
+        )
+
+    raise ValueError(f"unknown sampler {params.sampler}!")
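Note (not part of the commit): a hypothetical usage sketch of the inference API above. It assumes the referenced config and checkpoint files exist locally under the default `configs/inference` and `checkpoints` paths and that the upstream `sgm` package is importable; the prompt string is arbitrary.

pipeline = SamplingPipeline(
    ModelArchitecture.SDXL_V1_BASE,
    model_path="checkpoints",
    config_path="configs/inference",
)
params = SamplingParams(steps=30, sampler=Sampler.EULER_EDM, scale=6.0)
images = pipeline.text_to_image(params, prompt="a red vintage car on a coastal road", samples=1)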
models/svd/sgm/inference/helpers.py
ADDED
@@ -0,0 +1,305 @@
1 |
+
import math
|
2 |
+
import os
|
3 |
+
from typing import List, Optional, Union
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from einops import rearrange
|
8 |
+
from imwatermark import WatermarkEncoder
|
9 |
+
from omegaconf import ListConfig
|
10 |
+
from PIL import Image
|
11 |
+
from torch import autocast
|
12 |
+
|
13 |
+
from sgm.util import append_dims
|
14 |
+
|
15 |
+
|
16 |
+
class WatermarkEmbedder:
|
17 |
+
def __init__(self, watermark):
|
18 |
+
self.watermark = watermark
|
19 |
+
self.num_bits = len(WATERMARK_BITS)
|
20 |
+
self.encoder = WatermarkEncoder()
|
21 |
+
self.encoder.set_watermark("bits", self.watermark)
|
22 |
+
|
23 |
+
def __call__(self, image: torch.Tensor) -> torch.Tensor:
|
24 |
+
"""
|
25 |
+
Adds a predefined watermark to the input image
|
26 |
+
|
27 |
+
Args:
|
28 |
+
image: ([N,] B, RGB, H, W) in range [0, 1]
|
29 |
+
|
30 |
+
Returns:
|
31 |
+
same as input but watermarked
|
32 |
+
"""
|
33 |
+
squeeze = len(image.shape) == 4
|
34 |
+
if squeeze:
|
35 |
+
image = image[None, ...]
|
36 |
+
n = image.shape[0]
|
37 |
+
image_np = rearrange(
|
38 |
+
(255 * image).detach().cpu(), "n b c h w -> (n b) h w c"
|
39 |
+
).numpy()[:, :, :, ::-1]
|
40 |
+
# torch (b, c, h, w) in [0, 1] -> numpy (b, h, w, c) [0, 255]
|
41 |
+
# watermarking libary expects input as cv2 BGR format
|
42 |
+
for k in range(image_np.shape[0]):
|
43 |
+
image_np[k] = self.encoder.encode(image_np[k], "dwtDct")
|
44 |
+
image = torch.from_numpy(
|
45 |
+
rearrange(image_np[:, :, :, ::-1], "(n b) h w c -> n b c h w", n=n)
|
46 |
+
).to(image.device)
|
47 |
+
image = torch.clamp(image / 255, min=0.0, max=1.0)
|
48 |
+
if squeeze:
|
49 |
+
image = image[0]
|
50 |
+
return image
|
51 |
+
|
52 |
+
|
53 |
+
# A fixed 48-bit message that was choosen at random
|
54 |
+
# WATERMARK_MESSAGE = 0xB3EC907BB19E
|
55 |
+
WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
|
56 |
+
# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
|
57 |
+
WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
|
58 |
+
embed_watermark = WatermarkEmbedder(WATERMARK_BITS)
|
59 |
+
|
60 |
+
|
61 |
+
def get_unique_embedder_keys_from_conditioner(conditioner):
|
62 |
+
return list({x.input_key for x in conditioner.embedders})
|
63 |
+
|
64 |
+
|
65 |
+
def perform_save_locally(save_path, samples):
|
66 |
+
os.makedirs(os.path.join(save_path), exist_ok=True)
|
67 |
+
base_count = len(os.listdir(os.path.join(save_path)))
|
68 |
+
samples = embed_watermark(samples)
|
69 |
+
for sample in samples:
|
70 |
+
sample = 255.0 * rearrange(sample.cpu().numpy(), "c h w -> h w c")
|
71 |
+
Image.fromarray(sample.astype(np.uint8)).save(
|
72 |
+
os.path.join(save_path, f"{base_count:09}.png")
|
73 |
+
)
|
74 |
+
base_count += 1
|
75 |
+
|
76 |
+
|
77 |
+
class Img2ImgDiscretizationWrapper:
|
78 |
+
"""
|
79 |
+
wraps a discretizer, and prunes the sigmas
|
80 |
+
params:
|
81 |
+
strength: float between 0.0 and 1.0. 1.0 means full sampling (all sigmas are returned)
|
82 |
+
"""
|
83 |
+
|
84 |
+
def __init__(self, discretization, strength: float = 1.0):
|
85 |
+
self.discretization = discretization
|
86 |
+
self.strength = strength
|
87 |
+
assert 0.0 <= self.strength <= 1.0
|
88 |
+
|
89 |
+
def __call__(self, *args, **kwargs):
|
90 |
+
# sigmas start large and then decrease
|
91 |
+
sigmas = self.discretization(*args, **kwargs)
|
92 |
+
print(f"sigmas after discretization, before pruning img2img: ", sigmas)
|
93 |
+
sigmas = torch.flip(sigmas, (0,))
|
94 |
+
sigmas = sigmas[: max(int(self.strength * len(sigmas)), 1)]
|
95 |
+
print("prune index:", max(int(self.strength * len(sigmas)), 1))
|
96 |
+
sigmas = torch.flip(sigmas, (0,))
|
97 |
+
print(f"sigmas after pruning: ", sigmas)
|
98 |
+
return sigmas
|
99 |
+
|
100 |
+
|
101 |
+
def do_sample(
|
102 |
+
model,
|
103 |
+
sampler,
|
104 |
+
value_dict,
|
105 |
+
num_samples,
|
106 |
+
H,
|
107 |
+
W,
|
108 |
+
C,
|
109 |
+
F,
|
110 |
+
force_uc_zero_embeddings: Optional[List] = None,
|
111 |
+
batch2model_input: Optional[List] = None,
|
112 |
+
return_latents=False,
|
113 |
+
filter=None,
|
114 |
+
device="cuda",
|
115 |
+
):
|
116 |
+
if force_uc_zero_embeddings is None:
|
117 |
+
force_uc_zero_embeddings = []
|
118 |
+
if batch2model_input is None:
|
119 |
+
batch2model_input = []
|
120 |
+
|
121 |
+
with torch.no_grad():
|
122 |
+
with autocast(device) as precision_scope:
|
123 |
+
with model.ema_scope():
|
124 |
+
num_samples = [num_samples]
|
125 |
+
batch, batch_uc = get_batch(
|
126 |
+
get_unique_embedder_keys_from_conditioner(model.conditioner),
|
127 |
+
value_dict,
|
128 |
+
num_samples,
|
129 |
+
)
|
130 |
+
for key in batch:
|
131 |
+
if isinstance(batch[key], torch.Tensor):
|
132 |
+
print(key, batch[key].shape)
|
133 |
+
elif isinstance(batch[key], list):
|
134 |
+
print(key, [len(l) for l in batch[key]])
|
135 |
+
else:
|
136 |
+
print(key, batch[key])
|
137 |
+
c, uc = model.conditioner.get_unconditional_conditioning(
|
138 |
+
batch,
|
139 |
+
batch_uc=batch_uc,
|
140 |
+
force_uc_zero_embeddings=force_uc_zero_embeddings,
|
141 |
+
)
|
142 |
+
|
143 |
+
for k in c:
|
144 |
+
if not k == "crossattn":
|
145 |
+
c[k], uc[k] = map(
|
146 |
+
lambda y: y[k][: math.prod(num_samples)].to(device), (c, uc)
|
147 |
+
)
|
148 |
+
|
149 |
+
additional_model_inputs = {}
|
150 |
+
for k in batch2model_input:
|
151 |
+
additional_model_inputs[k] = batch[k]
|
152 |
+
|
153 |
+
shape = (math.prod(num_samples), C, H // F, W // F)
|
154 |
+
randn = torch.randn(shape).to(device)
|
155 |
+
|
156 |
+
def denoiser(input, sigma, c):
|
157 |
+
return model.denoiser(
|
158 |
+
model.model, input, sigma, c, **additional_model_inputs
|
159 |
+
)
|
160 |
+
|
161 |
+
samples_z = sampler(denoiser, randn, cond=c, uc=uc)
|
162 |
+
samples_x = model.decode_first_stage(samples_z)
|
163 |
+
samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
|
164 |
+
|
165 |
+
if filter is not None:
|
166 |
+
samples = filter(samples)
|
167 |
+
|
168 |
+
if return_latents:
|
169 |
+
return samples, samples_z
|
170 |
+
return samples
|
171 |
+
|
172 |
+
|
173 |
+
def get_batch(keys, value_dict, N: Union[List, ListConfig], device="cuda"):
|
174 |
+
# Hardcoded demo setups; might undergo some changes in the future
|
175 |
+
|
176 |
+
batch = {}
|
177 |
+
batch_uc = {}
|
178 |
+
|
179 |
+
for key in keys:
|
180 |
+
if key == "txt":
|
181 |
+
batch["txt"] = (
|
182 |
+
np.repeat([value_dict["prompt"]], repeats=math.prod(N))
|
183 |
+
.reshape(N)
|
184 |
+
.tolist()
|
185 |
+
)
|
186 |
+
batch_uc["txt"] = (
|
187 |
+
np.repeat([value_dict["negative_prompt"]], repeats=math.prod(N))
|
188 |
+
.reshape(N)
|
189 |
+
.tolist()
|
190 |
+
)
|
191 |
+
elif key == "original_size_as_tuple":
|
192 |
+
batch["original_size_as_tuple"] = (
|
193 |
+
torch.tensor([value_dict["orig_height"], value_dict["orig_width"]])
|
194 |
+
.to(device)
|
195 |
+
.repeat(*N, 1)
|
196 |
+
)
|
197 |
+
elif key == "crop_coords_top_left":
|
198 |
+
batch["crop_coords_top_left"] = (
|
199 |
+
torch.tensor(
|
200 |
+
[value_dict["crop_coords_top"], value_dict["crop_coords_left"]]
|
201 |
+
)
|
202 |
+
.to(device)
|
203 |
+
.repeat(*N, 1)
|
204 |
+
)
|
205 |
+
elif key == "aesthetic_score":
|
206 |
+
batch["aesthetic_score"] = (
|
207 |
+
torch.tensor([value_dict["aesthetic_score"]]).to(device).repeat(*N, 1)
|
208 |
+
)
|
209 |
+
batch_uc["aesthetic_score"] = (
|
210 |
+
torch.tensor([value_dict["negative_aesthetic_score"]])
|
211 |
+
.to(device)
|
212 |
+
.repeat(*N, 1)
|
213 |
+
)
|
214 |
+
|
215 |
+
elif key == "target_size_as_tuple":
|
216 |
+
batch["target_size_as_tuple"] = (
|
217 |
+
torch.tensor([value_dict["target_height"], value_dict["target_width"]])
|
218 |
+
.to(device)
|
219 |
+
.repeat(*N, 1)
|
220 |
+
)
|
221 |
+
else:
|
222 |
+
batch[key] = value_dict[key]
|
223 |
+
|
224 |
+
for key in batch.keys():
|
225 |
+
if key not in batch_uc and isinstance(batch[key], torch.Tensor):
|
226 |
+
batch_uc[key] = torch.clone(batch[key])
|
227 |
+
return batch, batch_uc
|
228 |
+
|
229 |
+
|
230 |
+
def get_input_image_tensor(image: Image.Image, device="cuda"):
|
231 |
+
w, h = image.size
|
232 |
+
print(f"loaded input image of size ({w}, {h})")
|
233 |
+
width, height = map(
|
234 |
+
lambda x: x - x % 64, (w, h)
|
235 |
+
) # resize to integer multiple of 64
|
236 |
+
image = image.resize((width, height))
|
237 |
+
image_array = np.array(image.convert("RGB"))
|
238 |
+
image_array = image_array[None].transpose(0, 3, 1, 2)
|
239 |
+
image_tensor = torch.from_numpy(image_array).to(dtype=torch.float32) / 127.5 - 1.0
|
240 |
+
return image_tensor.to(device)
|
241 |
+
|
242 |
+
|
243 |
+
def do_img2img(
|
244 |
+
img,
|
245 |
+
model,
|
246 |
+
sampler,
|
247 |
+
value_dict,
|
248 |
+
num_samples,
|
249 |
+
force_uc_zero_embeddings=[],
|
250 |
+
additional_kwargs={},
|
251 |
+
offset_noise_level: float = 0.0,
|
252 |
+
return_latents=False,
|
253 |
+
skip_encode=False,
|
254 |
+
filter=None,
|
255 |
+
device="cuda",
|
256 |
+
):
|
257 |
+
with torch.no_grad():
|
258 |
+
with autocast(device) as precision_scope:
|
259 |
+
with model.ema_scope():
|
260 |
+
batch, batch_uc = get_batch(
|
261 |
+
get_unique_embedder_keys_from_conditioner(model.conditioner),
|
262 |
+
value_dict,
|
263 |
+
[num_samples],
|
264 |
+
)
|
265 |
+
c, uc = model.conditioner.get_unconditional_conditioning(
|
266 |
+
batch,
|
267 |
+
batch_uc=batch_uc,
|
268 |
+
force_uc_zero_embeddings=force_uc_zero_embeddings,
|
269 |
+
)
|
270 |
+
|
271 |
+
for k in c:
|
272 |
+
c[k], uc[k] = map(lambda y: y[k][:num_samples].to(device), (c, uc))
|
273 |
+
|
274 |
+
for k in additional_kwargs:
|
275 |
+
c[k] = uc[k] = additional_kwargs[k]
|
276 |
+
if skip_encode:
|
277 |
+
z = img
|
278 |
+
else:
|
279 |
+
z = model.encode_first_stage(img)
|
280 |
+
noise = torch.randn_like(z)
|
281 |
+
sigmas = sampler.discretization(sampler.num_steps)
|
282 |
+
sigma = sigmas[0].to(z.device)
|
283 |
+
|
284 |
+
if offset_noise_level > 0.0:
|
285 |
+
noise = noise + offset_noise_level * append_dims(
|
286 |
+
torch.randn(z.shape[0], device=z.device), z.ndim
|
287 |
+
)
|
288 |
+
noised_z = z + noise * append_dims(sigma, z.ndim)
|
289 |
+
noised_z = noised_z / torch.sqrt(
|
290 |
+
1.0 + sigmas[0] ** 2.0
|
291 |
+
) # Note: hardcoded to DDPM-like scaling. need to generalize later.
|
292 |
+
|
293 |
+
def denoiser(x, sigma, c):
|
294 |
+
return model.denoiser(model.model, x, sigma, c)
|
295 |
+
|
296 |
+
samples_z = sampler(denoiser, noised_z, cond=c, uc=uc)
|
297 |
+
samples_x = model.decode_first_stage(samples_z)
|
298 |
+
samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
|
299 |
+
|
300 |
+
if filter is not None:
|
301 |
+
samples = filter(samples)
|
302 |
+
|
303 |
+
if return_latents:
|
304 |
+
return samples, samples_z
|
305 |
+
return samples
|
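Before the next file, a minimal usage sketch for the `embed_watermark` helper defined above. The import path is an assumption (adjust it to wherever this helpers module lives in the repo), and the image sizes are illustrative; shapes follow the docstring `(B, RGB, H, W)` in `[0, 1]`.

import torch
# assumed import path for the helpers module shown above -- adjust if needed
from models.svd.sgm.inference.helpers import embed_watermark

# a fake batch of 2 RGB images in [0, 1], per the docstring (B, RGB, H, W)
images = torch.rand(2, 3, 256, 256)
watermarked = embed_watermark(images)

# the watermark is meant to be imperceptible: same shape, still clamped to [0, 1]
assert watermarked.shape == images.shape
assert 0.0 <= float(watermarked.min()) and float(watermarked.max()) <= 1.0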
models/svd/sgm/lr_scheduler.py
ADDED
@@ -0,0 +1,135 @@
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
|
4 |
+
class LambdaWarmUpCosineScheduler:
|
5 |
+
"""
|
6 |
+
note: use with a base_lr of 1.0
|
7 |
+
"""
|
8 |
+
|
9 |
+
def __init__(
|
10 |
+
self,
|
11 |
+
warm_up_steps,
|
12 |
+
lr_min,
|
13 |
+
lr_max,
|
14 |
+
lr_start,
|
15 |
+
max_decay_steps,
|
16 |
+
verbosity_interval=0,
|
17 |
+
):
|
18 |
+
self.lr_warm_up_steps = warm_up_steps
|
19 |
+
self.lr_start = lr_start
|
20 |
+
self.lr_min = lr_min
|
21 |
+
self.lr_max = lr_max
|
22 |
+
self.lr_max_decay_steps = max_decay_steps
|
23 |
+
self.last_lr = 0.0
|
24 |
+
self.verbosity_interval = verbosity_interval
|
25 |
+
|
26 |
+
def schedule(self, n, **kwargs):
|
27 |
+
if self.verbosity_interval > 0:
|
28 |
+
if n % self.verbosity_interval == 0:
|
29 |
+
print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
|
30 |
+
if n < self.lr_warm_up_steps:
|
31 |
+
lr = (
|
32 |
+
self.lr_max - self.lr_start
|
33 |
+
) / self.lr_warm_up_steps * n + self.lr_start
|
34 |
+
self.last_lr = lr
|
35 |
+
return lr
|
36 |
+
else:
|
37 |
+
t = (n - self.lr_warm_up_steps) / (
|
38 |
+
self.lr_max_decay_steps - self.lr_warm_up_steps
|
39 |
+
)
|
40 |
+
t = min(t, 1.0)
|
41 |
+
lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
|
42 |
+
1 + np.cos(t * np.pi)
|
43 |
+
)
|
44 |
+
self.last_lr = lr
|
45 |
+
return lr
|
46 |
+
|
47 |
+
def __call__(self, n, **kwargs):
|
48 |
+
return self.schedule(n, **kwargs)
|
49 |
+
|
50 |
+
|
51 |
+
class LambdaWarmUpCosineScheduler2:
|
52 |
+
"""
|
53 |
+
supports repeated iterations, configurable via lists
|
54 |
+
note: use with a base_lr of 1.0.
|
55 |
+
"""
|
56 |
+
|
57 |
+
def __init__(
|
58 |
+
self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0
|
59 |
+
):
|
60 |
+
assert (
|
61 |
+
len(warm_up_steps)
|
62 |
+
== len(f_min)
|
63 |
+
== len(f_max)
|
64 |
+
== len(f_start)
|
65 |
+
== len(cycle_lengths)
|
66 |
+
)
|
67 |
+
self.lr_warm_up_steps = warm_up_steps
|
68 |
+
self.f_start = f_start
|
69 |
+
self.f_min = f_min
|
70 |
+
self.f_max = f_max
|
71 |
+
self.cycle_lengths = cycle_lengths
|
72 |
+
self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
|
73 |
+
self.last_f = 0.0
|
74 |
+
self.verbosity_interval = verbosity_interval
|
75 |
+
|
76 |
+
def find_in_interval(self, n):
|
77 |
+
interval = 0
|
78 |
+
for cl in self.cum_cycles[1:]:
|
79 |
+
if n <= cl:
|
80 |
+
return interval
|
81 |
+
interval += 1
|
82 |
+
|
83 |
+
def schedule(self, n, **kwargs):
|
84 |
+
cycle = self.find_in_interval(n)
|
85 |
+
n = n - self.cum_cycles[cycle]
|
86 |
+
if self.verbosity_interval > 0:
|
87 |
+
if n % self.verbosity_interval == 0:
|
88 |
+
print(
|
89 |
+
f"current step: {n}, recent lr-multiplier: {self.last_f}, "
|
90 |
+
f"current cycle {cycle}"
|
91 |
+
)
|
92 |
+
if n < self.lr_warm_up_steps[cycle]:
|
93 |
+
f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[
|
94 |
+
cycle
|
95 |
+
] * n + self.f_start[cycle]
|
96 |
+
self.last_f = f
|
97 |
+
return f
|
98 |
+
else:
|
99 |
+
t = (n - self.lr_warm_up_steps[cycle]) / (
|
100 |
+
self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]
|
101 |
+
)
|
102 |
+
t = min(t, 1.0)
|
103 |
+
f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
|
104 |
+
1 + np.cos(t * np.pi)
|
105 |
+
)
|
106 |
+
self.last_f = f
|
107 |
+
return f
|
108 |
+
|
109 |
+
def __call__(self, n, **kwargs):
|
110 |
+
return self.schedule(n, **kwargs)
|
111 |
+
|
112 |
+
|
113 |
+
class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
|
114 |
+
def schedule(self, n, **kwargs):
|
115 |
+
cycle = self.find_in_interval(n)
|
116 |
+
n = n - self.cum_cycles[cycle]
|
117 |
+
if self.verbosity_interval > 0:
|
118 |
+
if n % self.verbosity_interval == 0:
|
119 |
+
print(
|
120 |
+
f"current step: {n}, recent lr-multiplier: {self.last_f}, "
|
121 |
+
f"current cycle {cycle}"
|
122 |
+
)
|
123 |
+
|
124 |
+
if n < self.lr_warm_up_steps[cycle]:
|
125 |
+
f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[
|
126 |
+
cycle
|
127 |
+
] * n + self.f_start[cycle]
|
128 |
+
self.last_f = f
|
129 |
+
return f
|
130 |
+
else:
|
131 |
+
f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (
|
132 |
+
self.cycle_lengths[cycle] - n
|
133 |
+
) / (self.cycle_lengths[cycle])
|
134 |
+
self.last_f = f
|
135 |
+
return f
|
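A sketch of how these schedulers are meant to be wired up, following the "use with a base_lr of 1.0" note: the schedule itself returns the learning rate, and `torch.optim.lr_scheduler.LambdaLR` multiplies it by the base lr of 1.0 (the same pattern `DiffusionEngine.configure_optimizers` uses further down). The import path and all numbers are illustrative assumptions.

import torch
from models.svd.sgm.lr_scheduler import LambdaWarmUpCosineScheduler2  # assumed import path

model = torch.nn.Linear(4, 4)
# per the docstring, keep the optimizer's base lr at 1.0 and let the schedule
# return the actual learning rate
opt = torch.optim.AdamW(model.parameters(), lr=1.0)

# one cycle: 1_000 warm-up steps inside a 10_000-step cosine cycle,
# ramping from 1e-6 up to 1e-4 and decaying back down to 1e-6
sched_fn = LambdaWarmUpCosineScheduler2(
    warm_up_steps=[1_000],
    f_min=[1.0e-6],
    f_max=[1.0e-4],
    f_start=[1.0e-6],
    cycle_lengths=[10_000],
)
scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=sched_fn.schedule)

for _ in range(10):
    opt.step()
    scheduler.step()
print(opt.param_groups[0]["lr"])  # roughly 1e-6 plus ten warm-up increments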
models/svd/sgm/models/__init__.py
ADDED
@@ -0,0 +1,2 @@
1 |
+
from models.svd.sgm.models.autoencoder import AutoencodingEngine
|
2 |
+
from models.svd.sgm.models.diffusion import DiffusionEngine
|
models/svd/sgm/models/autoencoder.py
ADDED
@@ -0,0 +1,615 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
import re
|
4 |
+
from abc import abstractmethod
|
5 |
+
from contextlib import contextmanager
|
6 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
7 |
+
|
8 |
+
import pytorch_lightning as pl
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
from einops import rearrange
|
12 |
+
from packaging import version
|
13 |
+
|
14 |
+
from models.svd.sgm.modules.autoencoding.regularizers import AbstractRegularizer
|
15 |
+
from models.svd.sgm.modules.ema import LitEma
|
16 |
+
from models.svd.sgm.util import (default, get_nested_attribute, get_obj_from_str,
|
17 |
+
instantiate_from_config)
|
18 |
+
|
19 |
+
logpy = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class AbstractAutoencoder(pl.LightningModule):
|
23 |
+
"""
|
24 |
+
This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
|
25 |
+
unCLIP models, etc. Hence, it is fairly general, and specific features
|
26 |
+
(e.g. discriminator training, encoding, decoding) must be implemented in subclasses.
|
27 |
+
"""
|
28 |
+
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
ema_decay: Union[None, float] = None,
|
32 |
+
monitor: Union[None, str] = None,
|
33 |
+
input_key: str = "jpg",
|
34 |
+
):
|
35 |
+
super().__init__()
|
36 |
+
|
37 |
+
self.input_key = input_key
|
38 |
+
self.use_ema = ema_decay is not None
|
39 |
+
if monitor is not None:
|
40 |
+
self.monitor = monitor
|
41 |
+
|
42 |
+
if self.use_ema:
|
43 |
+
self.model_ema = LitEma(self, decay=ema_decay)
|
44 |
+
logpy.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
|
45 |
+
|
46 |
+
if version.parse(torch.__version__) >= version.parse("2.0.0"):
|
47 |
+
self.automatic_optimization = False
|
48 |
+
|
49 |
+
def apply_ckpt(self, ckpt: Union[None, str, dict]):
|
50 |
+
if ckpt is None:
|
51 |
+
return
|
52 |
+
if isinstance(ckpt, str):
|
53 |
+
ckpt = {
|
54 |
+
"target": "sgm.modules.checkpoint.CheckpointEngine",
|
55 |
+
"params": {"ckpt_path": ckpt},
|
56 |
+
}
|
57 |
+
engine = instantiate_from_config(ckpt)
|
58 |
+
engine(self)
|
59 |
+
|
60 |
+
@abstractmethod
|
61 |
+
def get_input(self, batch) -> Any:
|
62 |
+
raise NotImplementedError()
|
63 |
+
|
64 |
+
def on_train_batch_end(self, *args, **kwargs):
|
65 |
+
# for EMA computation
|
66 |
+
if self.use_ema:
|
67 |
+
self.model_ema(self)
|
68 |
+
|
69 |
+
@contextmanager
|
70 |
+
def ema_scope(self, context=None):
|
71 |
+
if self.use_ema:
|
72 |
+
self.model_ema.store(self.parameters())
|
73 |
+
self.model_ema.copy_to(self)
|
74 |
+
if context is not None:
|
75 |
+
logpy.info(f"{context}: Switched to EMA weights")
|
76 |
+
try:
|
77 |
+
yield None
|
78 |
+
finally:
|
79 |
+
if self.use_ema:
|
80 |
+
self.model_ema.restore(self.parameters())
|
81 |
+
if context is not None:
|
82 |
+
logpy.info(f"{context}: Restored training weights")
|
83 |
+
|
84 |
+
@abstractmethod
|
85 |
+
def encode(self, *args, **kwargs) -> torch.Tensor:
|
86 |
+
raise NotImplementedError("encode()-method of abstract base class called")
|
87 |
+
|
88 |
+
@abstractmethod
|
89 |
+
def decode(self, *args, **kwargs) -> torch.Tensor:
|
90 |
+
raise NotImplementedError("decode()-method of abstract base class called")
|
91 |
+
|
92 |
+
def instantiate_optimizer_from_config(self, params, lr, cfg):
|
93 |
+
logpy.info(f"loading >>> {cfg['target']} <<< optimizer from config")
|
94 |
+
return get_obj_from_str(cfg["target"])(
|
95 |
+
params, lr=lr, **cfg.get("params", dict())
|
96 |
+
)
|
97 |
+
|
98 |
+
def configure_optimizers(self) -> Any:
|
99 |
+
raise NotImplementedError()
|
100 |
+
|
101 |
+
|
102 |
+
class AutoencodingEngine(AbstractAutoencoder):
|
103 |
+
"""
|
104 |
+
Base class for all image autoencoders that we train, like VQGAN or AutoencoderKL
|
105 |
+
(we also restore them explicitly as special cases for legacy reasons).
|
106 |
+
Regularizations such as KL or VQ are moved to the regularizer class.
|
107 |
+
"""
|
108 |
+
|
109 |
+
def __init__(
|
110 |
+
self,
|
111 |
+
*args,
|
112 |
+
encoder_config: Dict,
|
113 |
+
decoder_config: Dict,
|
114 |
+
loss_config: Dict,
|
115 |
+
regularizer_config: Dict,
|
116 |
+
optimizer_config: Union[Dict, None] = None,
|
117 |
+
lr_g_factor: float = 1.0,
|
118 |
+
trainable_ae_params: Optional[List[List[str]]] = None,
|
119 |
+
ae_optimizer_args: Optional[List[dict]] = None,
|
120 |
+
trainable_disc_params: Optional[List[List[str]]] = None,
|
121 |
+
disc_optimizer_args: Optional[List[dict]] = None,
|
122 |
+
disc_start_iter: int = 0,
|
123 |
+
diff_boost_factor: float = 3.0,
|
124 |
+
ckpt_engine: Union[None, str, dict] = None,
|
125 |
+
ckpt_path: Optional[str] = None,
|
126 |
+
additional_decode_keys: Optional[List[str]] = None,
|
127 |
+
**kwargs,
|
128 |
+
):
|
129 |
+
super().__init__(*args, **kwargs)
|
130 |
+
self.automatic_optimization = False # pytorch lightning
|
131 |
+
|
132 |
+
self.encoder: torch.nn.Module = instantiate_from_config(encoder_config)
|
133 |
+
self.decoder: torch.nn.Module = instantiate_from_config(decoder_config)
|
134 |
+
self.loss: torch.nn.Module = instantiate_from_config(loss_config)
|
135 |
+
self.regularization: AbstractRegularizer = instantiate_from_config(
|
136 |
+
regularizer_config
|
137 |
+
)
|
138 |
+
self.optimizer_config = default(
|
139 |
+
optimizer_config, {"target": "torch.optim.Adam"}
|
140 |
+
)
|
141 |
+
self.diff_boost_factor = diff_boost_factor
|
142 |
+
self.disc_start_iter = disc_start_iter
|
143 |
+
self.lr_g_factor = lr_g_factor
|
144 |
+
self.trainable_ae_params = trainable_ae_params
|
145 |
+
if self.trainable_ae_params is not None:
|
146 |
+
self.ae_optimizer_args = default(
|
147 |
+
ae_optimizer_args,
|
148 |
+
[{} for _ in range(len(self.trainable_ae_params))],
|
149 |
+
)
|
150 |
+
assert len(self.ae_optimizer_args) == len(self.trainable_ae_params)
|
151 |
+
else:
|
152 |
+
self.ae_optimizer_args = [{}]  # makes type consistent
|
153 |
+
|
154 |
+
self.trainable_disc_params = trainable_disc_params
|
155 |
+
if self.trainable_disc_params is not None:
|
156 |
+
self.disc_optimizer_args = default(
|
157 |
+
disc_optimizer_args,
|
158 |
+
[{} for _ in range(len(self.trainable_disc_params))],
|
159 |
+
)
|
160 |
+
assert len(self.disc_optimizer_args) == len(self.trainable_disc_params)
|
161 |
+
else:
|
162 |
+
self.disc_optimizer_args = [{}]  # makes type consistent
|
163 |
+
|
164 |
+
if ckpt_path is not None:
|
165 |
+
assert ckpt_engine is None, "Can't set ckpt_engine and ckpt_path"
|
166 |
+
logpy.warn("Checkpoint path is deprecated, use `checkpoint_egnine` instead")
|
167 |
+
self.apply_ckpt(default(ckpt_path, ckpt_engine))
|
168 |
+
self.additional_decode_keys = set(default(additional_decode_keys, []))
|
169 |
+
|
170 |
+
def get_input(self, batch: Dict) -> torch.Tensor:
|
171 |
+
# assuming unified data format, dataloader returns a dict.
|
172 |
+
# image tensors should be scaled to -1 ... 1 and in channels-first
|
173 |
+
# format (e.g., bchw instead of bhwc)
|
174 |
+
return batch[self.input_key]
|
175 |
+
|
176 |
+
def get_autoencoder_params(self) -> list:
|
177 |
+
params = []
|
178 |
+
if hasattr(self.loss, "get_trainable_autoencoder_parameters"):
|
179 |
+
params += list(self.loss.get_trainable_autoencoder_parameters())
|
180 |
+
if hasattr(self.regularization, "get_trainable_parameters"):
|
181 |
+
params += list(self.regularization.get_trainable_parameters())
|
182 |
+
params = params + list(self.encoder.parameters())
|
183 |
+
params = params + list(self.decoder.parameters())
|
184 |
+
return params
|
185 |
+
|
186 |
+
def get_discriminator_params(self) -> list:
|
187 |
+
if hasattr(self.loss, "get_trainable_parameters"):
|
188 |
+
params = list(self.loss.get_trainable_parameters()) # e.g., discriminator
|
189 |
+
else:
|
190 |
+
params = []
|
191 |
+
return params
|
192 |
+
|
193 |
+
def get_last_layer(self):
|
194 |
+
return self.decoder.get_last_layer()
|
195 |
+
|
196 |
+
def encode(
|
197 |
+
self,
|
198 |
+
x: torch.Tensor,
|
199 |
+
return_reg_log: bool = False,
|
200 |
+
unregularized: bool = False,
|
201 |
+
) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
|
202 |
+
z = self.encoder(x)
|
203 |
+
if unregularized:
|
204 |
+
return z, dict()
|
205 |
+
z, reg_log = self.regularization(z)
|
206 |
+
if return_reg_log:
|
207 |
+
return z, reg_log
|
208 |
+
return z
|
209 |
+
|
210 |
+
def decode(self, z: torch.Tensor, **kwargs) -> torch.Tensor:
|
211 |
+
x = self.decoder(z, **kwargs)
|
212 |
+
return x
|
213 |
+
|
214 |
+
def forward(
|
215 |
+
self, x: torch.Tensor, **additional_decode_kwargs
|
216 |
+
) -> Tuple[torch.Tensor, torch.Tensor, dict]:
|
217 |
+
z, reg_log = self.encode(x, return_reg_log=True)
|
218 |
+
dec = self.decode(z, **additional_decode_kwargs)
|
219 |
+
return z, dec, reg_log
|
220 |
+
|
221 |
+
def inner_training_step(
|
222 |
+
self, batch: dict, batch_idx: int, optimizer_idx: int = 0
|
223 |
+
) -> torch.Tensor:
|
224 |
+
x = self.get_input(batch)
|
225 |
+
additional_decode_kwargs = {
|
226 |
+
key: batch[key] for key in self.additional_decode_keys.intersection(batch)
|
227 |
+
}
|
228 |
+
z, xrec, regularization_log = self(x, **additional_decode_kwargs)
|
229 |
+
if hasattr(self.loss, "forward_keys"):
|
230 |
+
extra_info = {
|
231 |
+
"z": z,
|
232 |
+
"optimizer_idx": optimizer_idx,
|
233 |
+
"global_step": self.global_step,
|
234 |
+
"last_layer": self.get_last_layer(),
|
235 |
+
"split": "train",
|
236 |
+
"regularization_log": regularization_log,
|
237 |
+
"autoencoder": self,
|
238 |
+
}
|
239 |
+
extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
|
240 |
+
else:
|
241 |
+
extra_info = dict()
|
242 |
+
|
243 |
+
if optimizer_idx == 0:
|
244 |
+
# autoencode
|
245 |
+
out_loss = self.loss(x, xrec, **extra_info)
|
246 |
+
if isinstance(out_loss, tuple):
|
247 |
+
aeloss, log_dict_ae = out_loss
|
248 |
+
else:
|
249 |
+
# simple loss function
|
250 |
+
aeloss = out_loss
|
251 |
+
log_dict_ae = {"train/loss/rec": aeloss.detach()}
|
252 |
+
|
253 |
+
self.log_dict(
|
254 |
+
log_dict_ae,
|
255 |
+
prog_bar=False,
|
256 |
+
logger=True,
|
257 |
+
on_step=True,
|
258 |
+
on_epoch=True,
|
259 |
+
sync_dist=False,
|
260 |
+
)
|
261 |
+
self.log(
|
262 |
+
"loss",
|
263 |
+
aeloss.mean().detach(),
|
264 |
+
prog_bar=True,
|
265 |
+
logger=False,
|
266 |
+
on_epoch=False,
|
267 |
+
on_step=True,
|
268 |
+
)
|
269 |
+
return aeloss
|
270 |
+
elif optimizer_idx == 1:
|
271 |
+
# discriminator
|
272 |
+
discloss, log_dict_disc = self.loss(x, xrec, **extra_info)
|
273 |
+
# -> discriminator always needs to return a tuple
|
274 |
+
self.log_dict(
|
275 |
+
log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True
|
276 |
+
)
|
277 |
+
return discloss
|
278 |
+
else:
|
279 |
+
raise NotImplementedError(f"Unknown optimizer {optimizer_idx}")
|
280 |
+
|
281 |
+
def training_step(self, batch: dict, batch_idx: int):
|
282 |
+
opts = self.optimizers()
|
283 |
+
if not isinstance(opts, list):
|
284 |
+
# Non-adversarial case
|
285 |
+
opts = [opts]
|
286 |
+
optimizer_idx = batch_idx % len(opts)
|
287 |
+
if self.global_step < self.disc_start_iter:
|
288 |
+
optimizer_idx = 0
|
289 |
+
opt = opts[optimizer_idx]
|
290 |
+
opt.zero_grad()
|
291 |
+
with opt.toggle_model():
|
292 |
+
loss = self.inner_training_step(
|
293 |
+
batch, batch_idx, optimizer_idx=optimizer_idx
|
294 |
+
)
|
295 |
+
self.manual_backward(loss)
|
296 |
+
opt.step()
|
297 |
+
|
298 |
+
def validation_step(self, batch: dict, batch_idx: int) -> Dict:
|
299 |
+
log_dict = self._validation_step(batch, batch_idx)
|
300 |
+
with self.ema_scope():
|
301 |
+
log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
|
302 |
+
log_dict.update(log_dict_ema)
|
303 |
+
return log_dict
|
304 |
+
|
305 |
+
def _validation_step(self, batch: dict, batch_idx: int, postfix: str = "") -> Dict:
|
306 |
+
x = self.get_input(batch)
|
307 |
+
|
308 |
+
z, xrec, regularization_log = self(x)
|
309 |
+
if hasattr(self.loss, "forward_keys"):
|
310 |
+
extra_info = {
|
311 |
+
"z": z,
|
312 |
+
"optimizer_idx": 0,
|
313 |
+
"global_step": self.global_step,
|
314 |
+
"last_layer": self.get_last_layer(),
|
315 |
+
"split": "val" + postfix,
|
316 |
+
"regularization_log": regularization_log,
|
317 |
+
"autoencoder": self,
|
318 |
+
}
|
319 |
+
extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
|
320 |
+
else:
|
321 |
+
extra_info = dict()
|
322 |
+
out_loss = self.loss(x, xrec, **extra_info)
|
323 |
+
if isinstance(out_loss, tuple):
|
324 |
+
aeloss, log_dict_ae = out_loss
|
325 |
+
else:
|
326 |
+
# simple loss function
|
327 |
+
aeloss = out_loss
|
328 |
+
log_dict_ae = {f"val{postfix}/loss/rec": aeloss.detach()}
|
329 |
+
full_log_dict = log_dict_ae
|
330 |
+
|
331 |
+
if "optimizer_idx" in extra_info:
|
332 |
+
extra_info["optimizer_idx"] = 1
|
333 |
+
discloss, log_dict_disc = self.loss(x, xrec, **extra_info)
|
334 |
+
full_log_dict.update(log_dict_disc)
|
335 |
+
self.log(
|
336 |
+
f"val{postfix}/loss/rec",
|
337 |
+
log_dict_ae[f"val{postfix}/loss/rec"],
|
338 |
+
sync_dist=True,
|
339 |
+
)
|
340 |
+
self.log_dict(full_log_dict, sync_dist=True)
|
341 |
+
return full_log_dict
|
342 |
+
|
343 |
+
def get_param_groups(
|
344 |
+
self, parameter_names: List[List[str]], optimizer_args: List[dict]
|
345 |
+
) -> Tuple[List[Dict[str, Any]], int]:
|
346 |
+
groups = []
|
347 |
+
num_params = 0
|
348 |
+
for names, args in zip(parameter_names, optimizer_args):
|
349 |
+
params = []
|
350 |
+
for pattern_ in names:
|
351 |
+
pattern_params = []
|
352 |
+
pattern = re.compile(pattern_)
|
353 |
+
for p_name, param in self.named_parameters():
|
354 |
+
if re.match(pattern, p_name):
|
355 |
+
pattern_params.append(param)
|
356 |
+
num_params += param.numel()
|
357 |
+
if len(pattern_params) == 0:
|
358 |
+
logpy.warn(f"Did not find parameters for pattern {pattern_}")
|
359 |
+
params.extend(pattern_params)
|
360 |
+
groups.append({"params": params, **args})
|
361 |
+
return groups, num_params
|
362 |
+
|
363 |
+
def configure_optimizers(self) -> List[torch.optim.Optimizer]:
|
364 |
+
if self.trainable_ae_params is None:
|
365 |
+
ae_params = self.get_autoencoder_params()
|
366 |
+
else:
|
367 |
+
ae_params, num_ae_params = self.get_param_groups(
|
368 |
+
self.trainable_ae_params, self.ae_optimizer_args
|
369 |
+
)
|
370 |
+
logpy.info(f"Number of trainable autoencoder parameters: {num_ae_params:,}")
|
371 |
+
if self.trainable_disc_params is None:
|
372 |
+
disc_params = self.get_discriminator_params()
|
373 |
+
else:
|
374 |
+
disc_params, num_disc_params = self.get_param_groups(
|
375 |
+
self.trainable_disc_params, self.disc_optimizer_args
|
376 |
+
)
|
377 |
+
logpy.info(
|
378 |
+
f"Number of trainable discriminator parameters: {num_disc_params:,}"
|
379 |
+
)
|
380 |
+
opt_ae = self.instantiate_optimizer_from_config(
|
381 |
+
ae_params,
|
382 |
+
default(self.lr_g_factor, 1.0) * self.learning_rate,
|
383 |
+
self.optimizer_config,
|
384 |
+
)
|
385 |
+
opts = [opt_ae]
|
386 |
+
if len(disc_params) > 0:
|
387 |
+
opt_disc = self.instantiate_optimizer_from_config(
|
388 |
+
disc_params, self.learning_rate, self.optimizer_config
|
389 |
+
)
|
390 |
+
opts.append(opt_disc)
|
391 |
+
|
392 |
+
return opts
|
393 |
+
|
394 |
+
@torch.no_grad()
|
395 |
+
def log_images(
|
396 |
+
self, batch: dict, additional_log_kwargs: Optional[Dict] = None, **kwargs
|
397 |
+
) -> dict:
|
398 |
+
log = dict()
|
399 |
+
additional_decode_kwargs = {}
|
400 |
+
x = self.get_input(batch)
|
401 |
+
additional_decode_kwargs.update(
|
402 |
+
{key: batch[key] for key in self.additional_decode_keys.intersection(batch)}
|
403 |
+
)
|
404 |
+
|
405 |
+
_, xrec, _ = self(x, **additional_decode_kwargs)
|
406 |
+
log["inputs"] = x
|
407 |
+
log["reconstructions"] = xrec
|
408 |
+
diff = 0.5 * torch.abs(torch.clamp(xrec, -1.0, 1.0) - x)
|
409 |
+
diff.clamp_(0, 1.0)
|
410 |
+
log["diff"] = 2.0 * diff - 1.0
|
411 |
+
# diff_boost shows location of small errors, by boosting their
|
412 |
+
# brightness.
|
413 |
+
log["diff_boost"] = (
|
414 |
+
2.0 * torch.clamp(self.diff_boost_factor * diff, 0.0, 1.0) - 1
|
415 |
+
)
|
416 |
+
if hasattr(self.loss, "log_images"):
|
417 |
+
log.update(self.loss.log_images(x, xrec))
|
418 |
+
with self.ema_scope():
|
419 |
+
_, xrec_ema, _ = self(x, **additional_decode_kwargs)
|
420 |
+
log["reconstructions_ema"] = xrec_ema
|
421 |
+
diff_ema = 0.5 * torch.abs(torch.clamp(xrec_ema, -1.0, 1.0) - x)
|
422 |
+
diff_ema.clamp_(0, 1.0)
|
423 |
+
log["diff_ema"] = 2.0 * diff_ema - 1.0
|
424 |
+
log["diff_boost_ema"] = (
|
425 |
+
2.0 * torch.clamp(self.diff_boost_factor * diff_ema, 0.0, 1.0) - 1
|
426 |
+
)
|
427 |
+
if additional_log_kwargs:
|
428 |
+
additional_decode_kwargs.update(additional_log_kwargs)
|
429 |
+
_, xrec_add, _ = self(x, **additional_decode_kwargs)
|
430 |
+
log_str = "reconstructions-" + "-".join(
|
431 |
+
[f"{key}={additional_log_kwargs[key]}" for key in additional_log_kwargs]
|
432 |
+
)
|
433 |
+
log[log_str] = xrec_add
|
434 |
+
return log
|
435 |
+
|
436 |
+
|
437 |
+
class AutoencodingEngineLegacy(AutoencodingEngine):
|
438 |
+
def __init__(self, embed_dim: int, **kwargs):
|
439 |
+
self.max_batch_size = kwargs.pop("max_batch_size", None)
|
440 |
+
ddconfig = kwargs.pop("ddconfig")
|
441 |
+
ckpt_path = kwargs.pop("ckpt_path", None)
|
442 |
+
ckpt_engine = kwargs.pop("ckpt_engine", None)
|
443 |
+
super().__init__(
|
444 |
+
encoder_config={
|
445 |
+
"target": "models.svd.sgm.modules.diffusionmodules.model.Encoder",
|
446 |
+
"params": ddconfig,
|
447 |
+
},
|
448 |
+
decoder_config={
|
449 |
+
"target": "models.svd.sgm.modules.diffusionmodules.model.Decoder",
|
450 |
+
"params": ddconfig,
|
451 |
+
},
|
452 |
+
**kwargs,
|
453 |
+
)
|
454 |
+
self.quant_conv = torch.nn.Conv2d(
|
455 |
+
(1 + ddconfig["double_z"]) * ddconfig["z_channels"],
|
456 |
+
(1 + ddconfig["double_z"]) * embed_dim,
|
457 |
+
1,
|
458 |
+
)
|
459 |
+
self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
|
460 |
+
self.embed_dim = embed_dim
|
461 |
+
|
462 |
+
self.apply_ckpt(default(ckpt_path, ckpt_engine))
|
463 |
+
|
464 |
+
def get_autoencoder_params(self) -> list:
|
465 |
+
params = super().get_autoencoder_params()
|
466 |
+
return params
|
467 |
+
|
468 |
+
def encode(
|
469 |
+
self, x: torch.Tensor, return_reg_log: bool = False
|
470 |
+
) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
|
471 |
+
if self.max_batch_size is None:
|
472 |
+
z = self.encoder(x)
|
473 |
+
z = self.quant_conv(z)
|
474 |
+
else:
|
475 |
+
N = x.shape[0]
|
476 |
+
bs = self.max_batch_size
|
477 |
+
n_batches = int(math.ceil(N / bs))
|
478 |
+
z = list()
|
479 |
+
for i_batch in range(n_batches):
|
480 |
+
z_batch = self.encoder(x[i_batch * bs : (i_batch + 1) * bs])
|
481 |
+
z_batch = self.quant_conv(z_batch)
|
482 |
+
z.append(z_batch)
|
483 |
+
z = torch.cat(z, 0)
|
484 |
+
|
485 |
+
z, reg_log = self.regularization(z)
|
486 |
+
if return_reg_log:
|
487 |
+
return z, reg_log
|
488 |
+
return z
|
489 |
+
|
490 |
+
def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:
|
491 |
+
if self.max_batch_size is None:
|
492 |
+
dec = self.post_quant_conv(z)
|
493 |
+
dec = self.decoder(dec, **decoder_kwargs)
|
494 |
+
else:
|
495 |
+
N = z.shape[0]
|
496 |
+
bs = self.max_batch_size
|
497 |
+
n_batches = int(math.ceil(N / bs))
|
498 |
+
dec = list()
|
499 |
+
for i_batch in range(n_batches):
|
500 |
+
dec_batch = self.post_quant_conv(z[i_batch * bs : (i_batch + 1) * bs])
|
501 |
+
dec_batch = self.decoder(dec_batch, **decoder_kwargs)
|
502 |
+
dec.append(dec_batch)
|
503 |
+
dec = torch.cat(dec, 0)
|
504 |
+
|
505 |
+
return dec
|
506 |
+
|
507 |
+
|
508 |
+
class AutoencoderKL(AutoencodingEngineLegacy):
|
509 |
+
def __init__(self, **kwargs):
|
510 |
+
if "lossconfig" in kwargs:
|
511 |
+
kwargs["loss_config"] = kwargs.pop("lossconfig")
|
512 |
+
super().__init__(
|
513 |
+
regularizer_config={
|
514 |
+
"target": (
|
515 |
+
"sgm.modules.autoencoding.regularizers"
|
516 |
+
".DiagonalGaussianRegularizer"
|
517 |
+
)
|
518 |
+
},
|
519 |
+
**kwargs,
|
520 |
+
)
|
521 |
+
|
522 |
+
|
523 |
+
class AutoencoderLegacyVQ(AutoencodingEngineLegacy):
|
524 |
+
def __init__(
|
525 |
+
self,
|
526 |
+
embed_dim: int,
|
527 |
+
n_embed: int,
|
528 |
+
sane_index_shape: bool = False,
|
529 |
+
**kwargs,
|
530 |
+
):
|
531 |
+
if "lossconfig" in kwargs:
|
532 |
+
logpy.warn(f"Parameter `lossconfig` is deprecated, use `loss_config`.")
|
533 |
+
kwargs["loss_config"] = kwargs.pop("lossconfig")
|
534 |
+
super().__init__(
|
535 |
+
regularizer_config={
|
536 |
+
"target": (
|
537 |
+
"sgm.modules.autoencoding.regularizers.quantize" ".VectorQuantizer"
|
538 |
+
),
|
539 |
+
"params": {
|
540 |
+
"n_e": n_embed,
|
541 |
+
"e_dim": embed_dim,
|
542 |
+
"sane_index_shape": sane_index_shape,
|
543 |
+
},
|
544 |
+
},
|
545 |
+
**kwargs,
|
546 |
+
)
|
547 |
+
|
548 |
+
|
549 |
+
class IdentityFirstStage(AbstractAutoencoder):
|
550 |
+
def __init__(self, *args, **kwargs):
|
551 |
+
super().__init__(*args, **kwargs)
|
552 |
+
|
553 |
+
def get_input(self, x: Any) -> Any:
|
554 |
+
return x
|
555 |
+
|
556 |
+
def encode(self, x: Any, *args, **kwargs) -> Any:
|
557 |
+
return x
|
558 |
+
|
559 |
+
def decode(self, x: Any, *args, **kwargs) -> Any:
|
560 |
+
return x
|
561 |
+
|
562 |
+
|
563 |
+
class AEIntegerWrapper(nn.Module):
|
564 |
+
def __init__(
|
565 |
+
self,
|
566 |
+
model: nn.Module,
|
567 |
+
shape: Union[None, Tuple[int, int], List[int]] = (16, 16),
|
568 |
+
regularization_key: str = "regularization",
|
569 |
+
encoder_kwargs: Optional[Dict[str, Any]] = None,
|
570 |
+
):
|
571 |
+
super().__init__()
|
572 |
+
self.model = model
|
573 |
+
assert hasattr(model, "encode") and hasattr(
|
574 |
+
model, "decode"
|
575 |
+
), "Need AE interface"
|
576 |
+
self.regularization = get_nested_attribute(model, regularization_key)
|
577 |
+
self.shape = shape
|
578 |
+
self.encoder_kwargs = default(encoder_kwargs, {"return_reg_log": True})
|
579 |
+
|
580 |
+
def encode(self, x) -> torch.Tensor:
|
581 |
+
assert (
|
582 |
+
not self.training
|
583 |
+
), f"{self.__class__.__name__} only supports inference currently"
|
584 |
+
_, log = self.model.encode(x, **self.encoder_kwargs)
|
585 |
+
assert isinstance(log, dict)
|
586 |
+
inds = log["min_encoding_indices"]
|
587 |
+
return rearrange(inds, "b ... -> b (...)")
|
588 |
+
|
589 |
+
def decode(
|
590 |
+
self, inds: torch.Tensor, shape: Union[None, tuple, list] = None
|
591 |
+
) -> torch.Tensor:
|
592 |
+
# expect inds shape (b, s) with s = h*w
|
593 |
+
shape = default(shape, self.shape) # Optional[(h, w)]
|
594 |
+
if shape is not None:
|
595 |
+
assert len(shape) == 2, f"Unhandled shape {shape}"
|
596 |
+
inds = rearrange(inds, "b (h w) -> b h w", h=shape[0], w=shape[1])
|
597 |
+
h = self.regularization.get_codebook_entry(inds) # (b, h, w, c)
|
598 |
+
h = rearrange(h, "b h w c -> b c h w")
|
599 |
+
return self.model.decode(h)
|
600 |
+
|
601 |
+
|
602 |
+
class AutoencoderKLModeOnly(AutoencodingEngineLegacy):
|
603 |
+
def __init__(self, **kwargs):
|
604 |
+
if "lossconfig" in kwargs:
|
605 |
+
kwargs["loss_config"] = kwargs.pop("lossconfig")
|
606 |
+
super().__init__(
|
607 |
+
regularizer_config={
|
608 |
+
"target": (
|
609 |
+
"models.svd.sgm.modules.autoencoding.regularizers"
|
610 |
+
".DiagonalGaussianRegularizer"
|
611 |
+
),
|
612 |
+
"params": {"sample": False},
|
613 |
+
},
|
614 |
+
**kwargs,
|
615 |
+
)
|
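`AutoencodingEngineLegacy.encode`/`decode` above bound peak memory by slicing the batch into `ceil(N / max_batch_size)` chunks and concatenating the results. Building the full engine needs encoder/decoder configs that are not part of this file, so the sketch below isolates just that chunking pattern with a stand-in function; nothing in it comes from the repo.

import math
from typing import Callable, Optional

import torch


def apply_in_chunks(
    fn: Callable[[torch.Tensor], torch.Tensor],
    x: torch.Tensor,
    max_batch_size: Optional[int],
) -> torch.Tensor:
    """Apply fn to x in one pass or in ceil(N / max_batch_size) slices."""
    if max_batch_size is None:
        return fn(x)
    n = x.shape[0]
    n_batches = int(math.ceil(n / max_batch_size))
    out = []
    for i in range(n_batches):
        out.append(fn(x[i * max_batch_size : (i + 1) * max_batch_size]))
    return torch.cat(out, 0)


# a 10-sample batch processed 4 samples at a time -> 3 passes,
# identical result to a single pass for any per-sample fn
dummy = torch.randn(10, 3, 64, 64)
assert torch.allclose(apply_in_chunks(lambda t: t * 2.0, dummy, 4), dummy * 2.0)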
models/svd/sgm/models/diffusion.py
ADDED
@@ -0,0 +1,341 @@
1 |
+
import math
|
2 |
+
from contextlib import contextmanager
|
3 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
4 |
+
|
5 |
+
import pytorch_lightning as pl
|
6 |
+
import torch
|
7 |
+
from omegaconf import ListConfig, OmegaConf
|
8 |
+
from safetensors.torch import load_file as load_safetensors
|
9 |
+
from torch.optim.lr_scheduler import LambdaLR
|
10 |
+
|
11 |
+
from models.svd.sgm.modules import UNCONDITIONAL_CONFIG
|
12 |
+
from models.svd.sgm.modules.autoencoding.temporal_ae import VideoDecoder
|
13 |
+
from models.svd.sgm.modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
|
14 |
+
from models.svd.sgm.modules.ema import LitEma
|
15 |
+
from models.svd.sgm.util import (default, disabled_train, get_obj_from_str,
|
16 |
+
instantiate_from_config, log_txt_as_img)
|
17 |
+
|
18 |
+
|
19 |
+
class DiffusionEngine(pl.LightningModule):
|
20 |
+
def __init__(
|
21 |
+
self,
|
22 |
+
network_config,
|
23 |
+
denoiser_config,
|
24 |
+
first_stage_config,
|
25 |
+
conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None,
|
26 |
+
sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
|
27 |
+
optimizer_config: Union[None, Dict, ListConfig, OmegaConf] = None,
|
28 |
+
scheduler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
|
29 |
+
loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None,
|
30 |
+
network_wrapper: Union[None, str] = None,
|
31 |
+
ckpt_path: Union[None, str] = None,
|
32 |
+
use_ema: bool = False,
|
33 |
+
ema_decay_rate: float = 0.9999,
|
34 |
+
scale_factor: float = 1.0,
|
35 |
+
disable_first_stage_autocast=False,
|
36 |
+
input_key: str = "jpg",
|
37 |
+
log_keys: Union[List, None] = None,
|
38 |
+
no_cond_log: bool = False,
|
39 |
+
compile_model: bool = False,
|
40 |
+
en_and_decode_n_samples_a_time: Optional[int] = None,
|
41 |
+
):
|
42 |
+
super().__init__()
|
43 |
+
self.log_keys = log_keys
|
44 |
+
self.input_key = input_key
|
45 |
+
self.optimizer_config = default(
|
46 |
+
optimizer_config, {"target": "torch.optim.AdamW"}
|
47 |
+
)
|
48 |
+
model = instantiate_from_config(network_config)
|
49 |
+
self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))(
|
50 |
+
model, compile_model=compile_model
|
51 |
+
)
|
52 |
+
|
53 |
+
self.denoiser = instantiate_from_config(denoiser_config)
|
54 |
+
self.sampler = (
|
55 |
+
instantiate_from_config(sampler_config)
|
56 |
+
if sampler_config is not None
|
57 |
+
else None
|
58 |
+
)
|
59 |
+
self.conditioner = instantiate_from_config(
|
60 |
+
default(conditioner_config, UNCONDITIONAL_CONFIG)
|
61 |
+
)
|
62 |
+
self.scheduler_config = scheduler_config
|
63 |
+
self._init_first_stage(first_stage_config)
|
64 |
+
|
65 |
+
self.loss_fn = (
|
66 |
+
instantiate_from_config(loss_fn_config)
|
67 |
+
if loss_fn_config is not None
|
68 |
+
else None
|
69 |
+
)
|
70 |
+
|
71 |
+
self.use_ema = use_ema
|
72 |
+
if self.use_ema:
|
73 |
+
self.model_ema = LitEma(self.model, decay=ema_decay_rate)
|
74 |
+
print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
|
75 |
+
|
76 |
+
self.scale_factor = scale_factor
|
77 |
+
self.disable_first_stage_autocast = disable_first_stage_autocast
|
78 |
+
self.no_cond_log = no_cond_log
|
79 |
+
|
80 |
+
if ckpt_path is not None:
|
81 |
+
self.init_from_ckpt(ckpt_path)
|
82 |
+
|
83 |
+
self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
|
84 |
+
|
85 |
+
def init_from_ckpt(
|
86 |
+
self,
|
87 |
+
path: str,
|
88 |
+
) -> None:
|
89 |
+
if path.endswith("ckpt"):
|
90 |
+
sd = torch.load(path, map_location="cpu")["state_dict"]
|
91 |
+
elif path.endswith("safetensors"):
|
92 |
+
sd = load_safetensors(path)
|
93 |
+
else:
|
94 |
+
raise NotImplementedError
|
95 |
+
|
96 |
+
missing, unexpected = self.load_state_dict(sd, strict=False)
|
97 |
+
print(
|
98 |
+
f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
|
99 |
+
)
|
100 |
+
if len(missing) > 0:
|
101 |
+
print(f"Missing Keys: {missing}")
|
102 |
+
if len(unexpected) > 0:
|
103 |
+
print(f"Unexpected Keys: {unexpected}")
|
104 |
+
|
105 |
+
def _init_first_stage(self, config):
|
106 |
+
model = instantiate_from_config(config).eval()
|
107 |
+
model.train = disabled_train
|
108 |
+
for param in model.parameters():
|
109 |
+
param.requires_grad = False
|
110 |
+
self.first_stage_model = model
|
111 |
+
|
112 |
+
def get_input(self, batch):
|
113 |
+
# assuming unified data format, dataloader returns a dict.
|
114 |
+
# image tensors should be scaled to -1 ... 1 and in bchw format
|
115 |
+
return batch[self.input_key]
|
116 |
+
|
117 |
+
@torch.no_grad()
|
118 |
+
def decode_first_stage(self, z):
|
119 |
+
z = 1.0 / self.scale_factor * z
|
120 |
+
n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])
|
121 |
+
|
122 |
+
n_rounds = math.ceil(z.shape[0] / n_samples)
|
123 |
+
all_out = []
|
124 |
+
with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
|
125 |
+
for n in range(n_rounds):
|
126 |
+
if isinstance(self.first_stage_model.decoder, VideoDecoder):
|
127 |
+
kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])}
|
128 |
+
else:
|
129 |
+
kwargs = {}
|
130 |
+
out = self.first_stage_model.decode(
|
131 |
+
z[n * n_samples : (n + 1) * n_samples], **kwargs
|
132 |
+
)
|
133 |
+
all_out.append(out)
|
134 |
+
out = torch.cat(all_out, dim=0)
|
135 |
+
return out
|
136 |
+
|
137 |
+
@torch.no_grad()
|
138 |
+
def encode_first_stage(self, x):
|
139 |
+
n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0])
|
140 |
+
n_rounds = math.ceil(x.shape[0] / n_samples)
|
141 |
+
all_out = []
|
142 |
+
with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
|
143 |
+
for n in range(n_rounds):
|
144 |
+
out = self.first_stage_model.encode(
|
145 |
+
x[n * n_samples : (n + 1) * n_samples]
|
146 |
+
)
|
147 |
+
all_out.append(out)
|
148 |
+
z = torch.cat(all_out, dim=0)
|
149 |
+
z = self.scale_factor * z
|
150 |
+
return z
|
151 |
+
|
152 |
+
def forward(self, x, batch):
|
153 |
+
loss = self.loss_fn(self.model, self.denoiser, self.conditioner, x, batch)
|
154 |
+
loss_mean = loss.mean()
|
155 |
+
loss_dict = {"loss": loss_mean}
|
156 |
+
return loss_mean, loss_dict
|
157 |
+
|
158 |
+
def shared_step(self, batch: Dict) -> Any:
|
159 |
+
x = self.get_input(batch)
|
160 |
+
x = self.encode_first_stage(x)
|
161 |
+
batch["global_step"] = self.global_step
|
162 |
+
loss, loss_dict = self(x, batch)
|
163 |
+
return loss, loss_dict
|
164 |
+
|
165 |
+
def training_step(self, batch, batch_idx):
|
166 |
+
loss, loss_dict = self.shared_step(batch)
|
167 |
+
|
168 |
+
self.log_dict(
|
169 |
+
loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False
|
170 |
+
)
|
171 |
+
|
172 |
+
self.log(
|
173 |
+
"global_step",
|
174 |
+
self.global_step,
|
175 |
+
prog_bar=True,
|
176 |
+
logger=True,
|
177 |
+
on_step=True,
|
178 |
+
on_epoch=False,
|
179 |
+
)
|
180 |
+
|
181 |
+
if self.scheduler_config is not None:
|
182 |
+
lr = self.optimizers().param_groups[0]["lr"]
|
183 |
+
self.log(
|
184 |
+
"lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False
|
185 |
+
)
|
186 |
+
|
187 |
+
return loss
|
188 |
+
|
189 |
+
def on_train_start(self, *args, **kwargs):
|
190 |
+
if self.sampler is None or self.loss_fn is None:
|
191 |
+
raise ValueError("Sampler and loss function need to be set for training.")
|
192 |
+
|
193 |
+
def on_train_batch_end(self, *args, **kwargs):
|
194 |
+
if self.use_ema:
|
195 |
+
self.model_ema(self.model)
|
196 |
+
|
197 |
+
@contextmanager
|
198 |
+
def ema_scope(self, context=None):
|
199 |
+
if self.use_ema:
|
200 |
+
self.model_ema.store(self.model.parameters())
|
201 |
+
self.model_ema.copy_to(self.model)
|
202 |
+
if context is not None:
|
203 |
+
print(f"{context}: Switched to EMA weights")
|
204 |
+
try:
|
205 |
+
yield None
|
206 |
+
finally:
|
207 |
+
if self.use_ema:
|
208 |
+
self.model_ema.restore(self.model.parameters())
|
209 |
+
if context is not None:
|
210 |
+
print(f"{context}: Restored training weights")
|
211 |
+
|
212 |
+
def instantiate_optimizer_from_config(self, params, lr, cfg):
|
213 |
+
return get_obj_from_str(cfg["target"])(
|
214 |
+
params, lr=lr, **cfg.get("params", dict())
|
215 |
+
)
|
216 |
+
|
217 |
+
def configure_optimizers(self):
|
218 |
+
lr = self.learning_rate
|
219 |
+
params = list(self.model.parameters())
|
220 |
+
for embedder in self.conditioner.embedders:
|
221 |
+
if embedder.is_trainable:
|
222 |
+
params = params + list(embedder.parameters())
|
223 |
+
opt = self.instantiate_optimizer_from_config(params, lr, self.optimizer_config)
|
224 |
+
if self.scheduler_config is not None:
|
225 |
+
scheduler = instantiate_from_config(self.scheduler_config)
|
226 |
+
print("Setting up LambdaLR scheduler...")
|
227 |
+
scheduler = [
|
228 |
+
{
|
229 |
+
"scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule),
|
230 |
+
"interval": "step",
|
231 |
+
"frequency": 1,
|
232 |
+
}
|
233 |
+
]
|
234 |
+
return [opt], scheduler
|
235 |
+
return opt
|
236 |
+
|
237 |
+
@torch.no_grad()
|
238 |
+
def sample(
|
239 |
+
self,
|
240 |
+
cond: Dict,
|
241 |
+
uc: Union[Dict, None] = None,
|
242 |
+
batch_size: int = 16,
|
243 |
+
shape: Union[None, Tuple, List] = None,
|
244 |
+
**kwargs,
|
245 |
+
):
|
246 |
+
randn = torch.randn(batch_size, *shape).to(self.device)
|
247 |
+
|
248 |
+
denoiser = lambda input, sigma, c: self.denoiser(
|
249 |
+
self.model, input, sigma, c, **kwargs
|
250 |
+
)
|
251 |
+
samples = self.sampler(denoiser, randn, cond, uc=uc)
|
252 |
+
return samples
|
253 |
+
|
254 |
+
@torch.no_grad()
|
255 |
+
def log_conditionings(self, batch: Dict, n: int) -> Dict:
|
256 |
+
"""
|
257 |
+
Defines heuristics to log different conditionings.
|
258 |
+
These can be lists of strings (text-to-image), tensors, ints, ...
|
259 |
+
"""
|
260 |
+
image_h, image_w = batch[self.input_key].shape[2:]
|
261 |
+
log = dict()
|
262 |
+
|
263 |
+
for embedder in self.conditioner.embedders:
|
264 |
+
if (
|
265 |
+
(self.log_keys is None) or (embedder.input_key in self.log_keys)
|
266 |
+
) and not self.no_cond_log:
|
267 |
+
x = batch[embedder.input_key][:n]
|
268 |
+
if isinstance(x, torch.Tensor):
|
269 |
+
if x.dim() == 1:
|
270 |
+
# class-conditional, convert integer to string
|
271 |
+
x = [str(x[i].item()) for i in range(x.shape[0])]
|
272 |
+
xc = log_txt_as_img((image_h, image_w), x, size=image_h // 4)
|
273 |
+
elif x.dim() == 2:
|
274 |
+
# size and crop cond and the like
|
275 |
+
x = [
|
276 |
+
"x".join([str(xx) for xx in x[i].tolist()])
|
277 |
+
for i in range(x.shape[0])
|
278 |
+
]
|
279 |
+
xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
|
280 |
+
else:
|
281 |
+
raise NotImplementedError()
|
282 |
+
elif isinstance(x, (List, ListConfig)):
|
283 |
+
if isinstance(x[0], str):
|
284 |
+
# strings
|
285 |
+
xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
|
286 |
+
else:
|
287 |
+
raise NotImplementedError()
|
288 |
+
else:
|
289 |
+
raise NotImplementedError()
|
290 |
+
log[embedder.input_key] = xc
|
291 |
+
return log
|
292 |
+
|
293 |
+
@torch.no_grad()
|
294 |
+
def log_images(
|
295 |
+
self,
|
296 |
+
batch: Dict,
|
297 |
+
N: int = 8,
|
298 |
+
sample: bool = True,
|
299 |
+
ucg_keys: List[str] = None,
|
300 |
+
**kwargs,
|
301 |
+
) -> Dict:
|
302 |
+
conditioner_input_keys = [e.input_key for e in self.conditioner.embedders]
|
303 |
+
if ucg_keys:
|
304 |
+
assert all(map(lambda x: x in conditioner_input_keys, ucg_keys)), (
|
305 |
+
"Each defined ucg key for sampling must be in the provided conditioner input keys,"
|
306 |
+
f"but we have {ucg_keys} vs. {conditioner_input_keys}"
|
307 |
+
)
|
308 |
+
else:
|
309 |
+
ucg_keys = conditioner_input_keys
|
310 |
+
log = dict()
|
311 |
+
|
312 |
+
x = self.get_input(batch)
|
313 |
+
|
314 |
+
c, uc = self.conditioner.get_unconditional_conditioning(
|
315 |
+
batch,
|
316 |
+
force_uc_zero_embeddings=ucg_keys
|
317 |
+
if len(self.conditioner.embedders) > 0
|
318 |
+
else [],
|
319 |
+
)
|
320 |
+
|
321 |
+
sampling_kwargs = {}
|
322 |
+
|
323 |
+
N = min(x.shape[0], N)
|
324 |
+
x = x.to(self.device)[:N]
|
325 |
+
log["inputs"] = x
|
326 |
+
z = self.encode_first_stage(x)
|
327 |
+
log["reconstructions"] = self.decode_first_stage(z)
|
328 |
+
log.update(self.log_conditionings(batch, N))
|
329 |
+
|
330 |
+
for k in c:
|
331 |
+
if isinstance(c[k], torch.Tensor):
|
332 |
+
c[k], uc[k] = map(lambda y: y[k][:N].to(self.device), (c, uc))
|
333 |
+
|
334 |
+
if sample:
|
335 |
+
with self.ema_scope("Plotting"):
|
336 |
+
samples = self.sample(
|
337 |
+
c, shape=z.shape[1:], uc=uc, batch_size=N, **sampling_kwargs
|
338 |
+
)
|
339 |
+
samples = self.decode_first_stage(samples)
|
340 |
+
log["samples"] = samples
|
341 |
+
return log
|
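One detail worth calling out in `DiffusionEngine`: latents are kept pre-multiplied by `scale_factor` (`encode_first_stage` scales up, `decode_first_stage` divides again before calling the VAE decoder), so the two methods are exact inverses of the scaling step. A tiny sketch of that convention with the VAE mocked out; the numeric value is illustrative, not read from this repo's config.

import torch

scale_factor = 0.18215               # illustrative value only
vae_latent = torch.randn(1, 4, 32, 32)

z = scale_factor * vae_latent        # what encode_first_stage returns
z_back = (1.0 / scale_factor) * z    # what decode_first_stage hands to the decoder

assert torch.allclose(z_back, vae_latent, atol=1e-6)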
models/svd/sgm/modules/__init__.py
ADDED
@@ -0,0 +1,6 @@
1 |
+
from models.svd.sgm.modules.encoders.modules import GeneralConditioner
|
2 |
+
|
3 |
+
UNCONDITIONAL_CONFIG = {
|
4 |
+
"target": "sgm.modules.GeneralConditioner",
|
5 |
+
"params": {"emb_models": []},
|
6 |
+
}
|
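`UNCONDITIONAL_CONFIG` follows the usual `target`/`params` convention consumed by `instantiate_from_config` (see `DiffusionEngine` above). Below is a simplified stand-in for that resolver, written only to show the convention; it is not the repo's implementation, and the demo target is a stdlib class so the sketch stays self-contained.

import importlib


def instantiate_from_config_sketch(config: dict):
    """Resolve a dotted 'target' string and call it with 'params' (illustrative)."""
    module_path, name = config["target"].rsplit(".", 1)
    obj = getattr(importlib.import_module(module_path), name)
    return obj(**config.get("params", {}))


example = {"target": "collections.OrderedDict", "params": {}}
print(type(instantiate_from_config_sketch(example)))  # <class 'collections.OrderedDict'>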
models/svd/sgm/modules/attention.py
ADDED
@@ -0,0 +1,809 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
from inspect import isfunction
|
4 |
+
from typing import Any, Optional
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import torch.nn.functional as F
|
8 |
+
from einops import rearrange, repeat
|
9 |
+
from packaging import version
|
10 |
+
from torch import nn
|
11 |
+
from torch.utils.checkpoint import checkpoint
|
12 |
+
|
13 |
+
logpy = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
if version.parse(torch.__version__) >= version.parse("2.0.0"):
|
16 |
+
SDP_IS_AVAILABLE = True
|
17 |
+
from torch.backends.cuda import SDPBackend, sdp_kernel
|
18 |
+
|
19 |
+
BACKEND_MAP = {
|
20 |
+
SDPBackend.MATH: {
|
21 |
+
"enable_math": True,
|
22 |
+
"enable_flash": False,
|
23 |
+
"enable_mem_efficient": False,
|
24 |
+
},
|
25 |
+
SDPBackend.FLASH_ATTENTION: {
|
26 |
+
"enable_math": False,
|
27 |
+
"enable_flash": True,
|
28 |
+
"enable_mem_efficient": False,
|
29 |
+
},
|
30 |
+
SDPBackend.EFFICIENT_ATTENTION: {
|
31 |
+
"enable_math": False,
|
32 |
+
"enable_flash": False,
|
33 |
+
"enable_mem_efficient": True,
|
34 |
+
},
|
35 |
+
None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
|
36 |
+
}
|
37 |
+
else:
|
38 |
+
from contextlib import nullcontext
|
39 |
+
|
40 |
+
SDP_IS_AVAILABLE = False
|
41 |
+
sdp_kernel = nullcontext
|
42 |
+
BACKEND_MAP = {}
|
43 |
+
logpy.warn(
|
44 |
+
f"No SDP backend available, likely because you are running in pytorch "
|
45 |
+
f"versions < 2.0. In fact, you are using PyTorch {torch.__version__}. "
|
46 |
+
f"You might want to consider upgrading."
|
47 |
+
)
|
48 |
+
|
49 |
+
try:
|
50 |
+
import xformers
|
51 |
+
import xformers.ops
|
52 |
+
|
53 |
+
XFORMERS_IS_AVAILABLE = True
|
54 |
+
except Exception:
|
55 |
+
XFORMERS_IS_AVAILABLE = False
|
56 |
+
logpy.warn("no module 'xformers'. Processing without...")
|
57 |
+
|
58 |
+
# from .diffusionmodules.util import mixed_checkpoint as checkpoint
|
59 |
+
|
60 |
+
|
61 |
+
def exists(val):
|
62 |
+
return val is not None
|
63 |
+
|
64 |
+
|
65 |
+
def uniq(arr):
|
66 |
+
return {el: True for el in arr}.keys()
|
67 |
+
|
68 |
+
|
69 |
+
def default(val, d):
|
70 |
+
if exists(val):
|
71 |
+
return val
|
72 |
+
return d() if isfunction(d) else d
|
73 |
+
|
74 |
+
|
75 |
+
def max_neg_value(t):
|
76 |
+
return -torch.finfo(t.dtype).max
|
77 |
+
|
78 |
+
|
79 |
+
def init_(tensor):
|
80 |
+
dim = tensor.shape[-1]
|
81 |
+
std = 1 / math.sqrt(dim)
|
82 |
+
tensor.uniform_(-std, std)
|
83 |
+
return tensor
|
84 |
+
|
85 |
+
|
86 |
+
# feedforward
|
87 |
+
class GEGLU(nn.Module):
|
88 |
+
def __init__(self, dim_in, dim_out):
|
89 |
+
super().__init__()
|
90 |
+
self.proj = nn.Linear(dim_in, dim_out * 2)
|
91 |
+
|
92 |
+
def forward(self, x):
|
93 |
+
x, gate = self.proj(x).chunk(2, dim=-1)
|
94 |
+
return x * F.gelu(gate)
|
95 |
+
|
96 |
+
|
97 |
+
class FeedForward(nn.Module):
|
98 |
+
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
|
99 |
+
super().__init__()
|
100 |
+
inner_dim = int(dim * mult)
|
101 |
+
dim_out = default(dim_out, dim)
|
102 |
+
project_in = (
|
103 |
+
nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
|
104 |
+
if not glu
|
105 |
+
else GEGLU(dim, inner_dim)
|
106 |
+
)
|
107 |
+
|
108 |
+
self.net = nn.Sequential(
|
109 |
+
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
|
110 |
+
)
|
111 |
+
|
112 |
+
def forward(self, x):
|
113 |
+
return self.net(x)
|
114 |
+
|
115 |
+
|
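As a quick shape check for the GEGLU / FeedForward blocks defined above, the following sketch (dimension values are arbitrary, and it assumes the classes above are importable) shows that the feed-forward preserves the token shape:

import torch

ff = FeedForward(dim=320, mult=4, glu=True)   # GEGLU-gated MLP, as defined above
tokens = torch.randn(2, 77, 320)              # (batch, sequence, dim)
out = ff(tokens)
assert out.shape == tokens.shape              # dim_out defaults to dim, so the shape is unchanged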
116 |
+
def zero_module(module):
|
117 |
+
"""
|
118 |
+
Zero out the parameters of a module and return it.
|
119 |
+
"""
|
120 |
+
for p in module.parameters():
|
121 |
+
p.detach().zero_()
|
122 |
+
return module
|
123 |
+
|
124 |
+
|
125 |
+
def Normalize(in_channels):
|
126 |
+
return torch.nn.GroupNorm(
|
127 |
+
num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
|
128 |
+
)
|
129 |
+
|
130 |
+
|
131 |
+
class LinearAttention(nn.Module):
|
132 |
+
def __init__(self, dim, heads=4, dim_head=32):
|
133 |
+
super().__init__()
|
134 |
+
self.heads = heads
|
135 |
+
hidden_dim = dim_head * heads
|
136 |
+
self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
|
137 |
+
self.to_out = nn.Conv2d(hidden_dim, dim, 1)
|
138 |
+
|
139 |
+
def forward(self, x):
|
140 |
+
b, c, h, w = x.shape
|
141 |
+
qkv = self.to_qkv(x)
|
142 |
+
q, k, v = rearrange(
|
143 |
+
qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
|
144 |
+
)
|
145 |
+
k = k.softmax(dim=-1)
|
146 |
+
context = torch.einsum("bhdn,bhen->bhde", k, v)
|
147 |
+
out = torch.einsum("bhde,bhdn->bhen", context, q)
|
148 |
+
out = rearrange(
|
149 |
+
out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
|
150 |
+
)
|
151 |
+
return self.to_out(out)
|
152 |
+
|
153 |
+
|
154 |
+
class SelfAttention(nn.Module):
|
155 |
+
ATTENTION_MODES = ("xformers", "torch", "math")
|
156 |
+
|
157 |
+
def __init__(
|
158 |
+
self,
|
159 |
+
dim: int,
|
160 |
+
num_heads: int = 8,
|
161 |
+
qkv_bias: bool = False,
|
162 |
+
qk_scale: Optional[float] = None,
|
163 |
+
attn_drop: float = 0.0,
|
164 |
+
proj_drop: float = 0.0,
|
165 |
+
attn_mode: str = "xformers",
|
166 |
+
):
|
167 |
+
super().__init__()
|
168 |
+
self.num_heads = num_heads
|
169 |
+
head_dim = dim // num_heads
|
170 |
+
self.scale = qk_scale or head_dim**-0.5
|
171 |
+
|
172 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
173 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
174 |
+
self.proj = nn.Linear(dim, dim)
|
175 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
176 |
+
assert attn_mode in self.ATTENTION_MODES
|
177 |
+
self.attn_mode = attn_mode
|
178 |
+
|
179 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
180 |
+
B, L, C = x.shape
|
181 |
+
|
182 |
+
qkv = self.qkv(x)
|
183 |
+
if self.attn_mode == "torch":
|
184 |
+
qkv = rearrange(
|
185 |
+
qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
|
186 |
+
).float()
|
187 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
|
188 |
+
x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
|
189 |
+
x = rearrange(x, "B H L D -> B L (H D)")
|
190 |
+
elif self.attn_mode == "xformers":
|
191 |
+
qkv = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
|
192 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # B L H D
|
193 |
+
x = xformers.ops.memory_efficient_attention(q, k, v)
|
194 |
+
x = rearrange(x, "B L H D -> B L (H D)", H=self.num_heads)
|
195 |
+
elif self.attn_mode == "math":
|
196 |
+
qkv = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
197 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
|
198 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
199 |
+
attn = attn.softmax(dim=-1)
|
200 |
+
attn = self.attn_drop(attn)
|
201 |
+
x = (attn @ v).transpose(1, 2).reshape(B, L, C)
|
202 |
+
else:
|
203 |
+
raise NotImplementedError(f"unknown attn_mode: {self.attn_mode}")
|
204 |
+
|
205 |
+
x = self.proj(x)
|
206 |
+
x = self.proj_drop(x)
|
207 |
+
return x
|
208 |
+
|
209 |
+
|
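A minimal usage sketch for the SelfAttention module above, using the pure-PyTorch "torch" mode so it runs without xformers; PyTorch >= 2.0 is assumed for scaled_dot_product_attention, and the sizes are illustrative:

import torch

attn = SelfAttention(dim=512, num_heads=8, attn_mode="torch")
x = torch.randn(4, 196, 512)    # (batch, tokens, channels)
y = attn(x)                     # attention plus output projection, same shape as the input
assert y.shape == x.shape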
210 |
+
class SpatialSelfAttention(nn.Module):
|
211 |
+
def __init__(self, in_channels):
|
212 |
+
super().__init__()
|
213 |
+
self.in_channels = in_channels
|
214 |
+
|
215 |
+
self.norm = Normalize(in_channels)
|
216 |
+
self.q = torch.nn.Conv2d(
|
217 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
218 |
+
)
|
219 |
+
self.k = torch.nn.Conv2d(
|
220 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
221 |
+
)
|
222 |
+
self.v = torch.nn.Conv2d(
|
223 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
224 |
+
)
|
225 |
+
self.proj_out = torch.nn.Conv2d(
|
226 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
227 |
+
)
|
228 |
+
|
229 |
+
def forward(self, x):
|
230 |
+
h_ = x
|
231 |
+
h_ = self.norm(h_)
|
232 |
+
q = self.q(h_)
|
233 |
+
k = self.k(h_)
|
234 |
+
v = self.v(h_)
|
235 |
+
|
236 |
+
# compute attention
|
237 |
+
b, c, h, w = q.shape
|
238 |
+
q = rearrange(q, "b c h w -> b (h w) c")
|
239 |
+
k = rearrange(k, "b c h w -> b c (h w)")
|
240 |
+
w_ = torch.einsum("bij,bjk->bik", q, k)
|
241 |
+
|
242 |
+
w_ = w_ * (int(c) ** (-0.5))
|
243 |
+
w_ = torch.nn.functional.softmax(w_, dim=2)
|
244 |
+
|
245 |
+
# attend to values
|
246 |
+
v = rearrange(v, "b c h w -> b c (h w)")
|
247 |
+
w_ = rearrange(w_, "b i j -> b j i")
|
248 |
+
h_ = torch.einsum("bij,bjk->bik", v, w_)
|
249 |
+
h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
|
250 |
+
h_ = self.proj_out(h_)
|
251 |
+
|
252 |
+
return x + h_
|
253 |
+
|
254 |
+
|
255 |
+
class CrossAttention(nn.Module):
|
256 |
+
def __init__(
|
257 |
+
self,
|
258 |
+
query_dim,
|
259 |
+
context_dim=None,
|
260 |
+
heads=8,
|
261 |
+
dim_head=64,
|
262 |
+
dropout=0.0,
|
263 |
+
backend=None,
|
264 |
+
):
|
265 |
+
super().__init__()
|
266 |
+
inner_dim = dim_head * heads
|
267 |
+
context_dim = default(context_dim, query_dim)
|
268 |
+
|
269 |
+
self.scale = dim_head**-0.5
|
270 |
+
self.heads = heads
|
271 |
+
|
272 |
+
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
273 |
+
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
274 |
+
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
275 |
+
|
276 |
+
self.to_out = nn.Sequential(
|
277 |
+
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
|
278 |
+
)
|
279 |
+
self.backend = backend
|
280 |
+
|
281 |
+
def forward(
|
282 |
+
self,
|
283 |
+
x,
|
284 |
+
context=None,
|
285 |
+
mask=None,
|
286 |
+
additional_tokens=None,
|
287 |
+
n_times_crossframe_attn_in_self=0,
|
288 |
+
):
|
289 |
+
h = self.heads
|
290 |
+
|
291 |
+
if additional_tokens is not None:
|
292 |
+
# get the number of masked tokens at the beginning of the output sequence
|
293 |
+
n_tokens_to_mask = additional_tokens.shape[1]
|
294 |
+
# add additional token
|
295 |
+
x = torch.cat([additional_tokens, x], dim=1)
|
296 |
+
|
297 |
+
q = self.to_q(x)
|
298 |
+
context = default(context, x)
|
299 |
+
k = self.to_k(context)
|
300 |
+
v = self.to_v(context)
|
301 |
+
|
302 |
+
if n_times_crossframe_attn_in_self:
|
303 |
+
# reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
|
304 |
+
assert x.shape[0] % n_times_crossframe_attn_in_self == 0
|
305 |
+
n_cp = x.shape[0] // n_times_crossframe_attn_in_self
|
306 |
+
k = repeat(
|
307 |
+
k[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
|
308 |
+
)
|
309 |
+
v = repeat(
|
310 |
+
v[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
|
311 |
+
)
|
312 |
+
|
313 |
+
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
|
314 |
+
|
315 |
+
## old
|
316 |
+
"""
|
317 |
+
sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
|
318 |
+
del q, k
|
319 |
+
|
320 |
+
if exists(mask):
|
321 |
+
mask = rearrange(mask, 'b ... -> b (...)')
|
322 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
323 |
+
mask = repeat(mask, 'b j -> (b h) () j', h=h)
|
324 |
+
sim.masked_fill_(~mask, max_neg_value)
|
325 |
+
|
326 |
+
# attention, what we cannot get enough of
|
327 |
+
sim = sim.softmax(dim=-1)
|
328 |
+
|
329 |
+
out = einsum('b i j, b j d -> b i d', sim, v)
|
330 |
+
"""
|
331 |
+
## new
|
332 |
+
with sdp_kernel(**BACKEND_MAP[self.backend]):
|
333 |
+
# print("dispatching into backend", self.backend, "q/k/v shape: ", q.shape, k.shape, v.shape)
|
334 |
+
out = F.scaled_dot_product_attention(
|
335 |
+
q, k, v, attn_mask=mask
|
336 |
+
) # scale is dim_head ** -0.5 per default
|
337 |
+
|
338 |
+
del q, k, v
|
339 |
+
out = rearrange(out, "b h n d -> b n (h d)", h=h)
|
340 |
+
|
341 |
+
if additional_tokens is not None:
|
342 |
+
# remove additional token
|
343 |
+
out = out[:, n_tokens_to_mask:]
|
344 |
+
return self.to_out(out)
|
345 |
+
|
346 |
+
|
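A minimal usage sketch for the CrossAttention module above: 4096 flattened spatial tokens attend to a 77-token, 1024-channel context (e.g. text embeddings). The shapes are illustrative and PyTorch >= 2.0 is assumed, since the forward pass dispatches to F.scaled_dot_product_attention:

import torch

xattn = CrossAttention(query_dim=320, context_dim=1024, heads=8, dim_head=64)
x = torch.randn(2, 4096, 320)         # queries: flattened feature-map tokens
context = torch.randn(2, 77, 1024)    # keys/values come from the conditioning
out = xattn(x, context=context)       # projected back to query_dim -> (2, 4096, 320)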
347 |
+
class MemoryEfficientCrossAttention(nn.Module):
|
348 |
+
# https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
|
349 |
+
def __init__(
|
350 |
+
self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs
|
351 |
+
):
|
352 |
+
super().__init__()
|
353 |
+
logpy.debug(
|
354 |
+
f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, "
|
355 |
+
f"context_dim is {context_dim} and using {heads} heads with a "
|
356 |
+
f"dimension of {dim_head}."
|
357 |
+
)
|
358 |
+
inner_dim = dim_head * heads
|
359 |
+
context_dim = default(context_dim, query_dim)
|
360 |
+
|
361 |
+
self.heads = heads
|
362 |
+
self.dim_head = dim_head
|
363 |
+
|
364 |
+
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
365 |
+
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
366 |
+
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
367 |
+
|
368 |
+
self.to_out = nn.Sequential(
|
369 |
+
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
|
370 |
+
)
|
371 |
+
self.attention_op: Optional[Any] = None
|
372 |
+
|
373 |
+
def forward(
|
374 |
+
self,
|
375 |
+
x,
|
376 |
+
context=None,
|
377 |
+
mask=None,
|
378 |
+
additional_tokens=None,
|
379 |
+
n_times_crossframe_attn_in_self=0,
|
380 |
+
):
|
381 |
+
if additional_tokens is not None:
|
382 |
+
# get the number of masked tokens at the beginning of the output sequence
|
383 |
+
n_tokens_to_mask = additional_tokens.shape[1]
|
384 |
+
# add additional token
|
385 |
+
x = torch.cat([additional_tokens, x], dim=1)
|
386 |
+
q = self.to_q(x)
|
387 |
+
context = default(context, x)
|
388 |
+
k = self.to_k(context)
|
389 |
+
v = self.to_v(context)
|
390 |
+
|
391 |
+
if n_times_crossframe_attn_in_self:
|
392 |
+
# reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
|
393 |
+
assert x.shape[0] % n_times_crossframe_attn_in_self == 0
|
394 |
+
# n_cp = x.shape[0]//n_times_crossframe_attn_in_self
|
395 |
+
k = repeat(
|
396 |
+
k[::n_times_crossframe_attn_in_self],
|
397 |
+
"b ... -> (b n) ...",
|
398 |
+
n=n_times_crossframe_attn_in_self,
|
399 |
+
)
|
400 |
+
v = repeat(
|
401 |
+
v[::n_times_crossframe_attn_in_self],
|
402 |
+
"b ... -> (b n) ...",
|
403 |
+
n=n_times_crossframe_attn_in_self,
|
404 |
+
)
|
405 |
+
|
406 |
+
b, _, _ = q.shape
|
407 |
+
q, k, v = map(
|
408 |
+
lambda t: t.unsqueeze(3)
|
409 |
+
.reshape(b, t.shape[1], self.heads, self.dim_head)
|
410 |
+
.permute(0, 2, 1, 3)
|
411 |
+
.reshape(b * self.heads, t.shape[1], self.dim_head)
|
412 |
+
.contiguous(),
|
413 |
+
(q, k, v),
|
414 |
+
)
|
415 |
+
|
416 |
+
# actually compute the attention, what we cannot get enough of
|
417 |
+
if version.parse(xformers.__version__) >= version.parse("0.0.21"):
|
418 |
+
# NOTE: workaround for
|
419 |
+
# https://github.com/facebookresearch/xformers/issues/845
|
420 |
+
max_bs = 32768
|
421 |
+
N = q.shape[0]
|
422 |
+
n_batches = math.ceil(N / max_bs)
|
423 |
+
out = list()
|
424 |
+
for i_batch in range(n_batches):
|
425 |
+
batch = slice(i_batch * max_bs, (i_batch + 1) * max_bs)
|
426 |
+
out.append(
|
427 |
+
xformers.ops.memory_efficient_attention(
|
428 |
+
q[batch],
|
429 |
+
k[batch],
|
430 |
+
v[batch],
|
431 |
+
attn_bias=None,
|
432 |
+
op=self.attention_op,
|
433 |
+
)
|
434 |
+
)
|
435 |
+
out = torch.cat(out, 0)
|
436 |
+
else:
|
437 |
+
out = xformers.ops.memory_efficient_attention(
|
438 |
+
q, k, v, attn_bias=None, op=self.attention_op
|
439 |
+
)
|
440 |
+
|
441 |
+
# TODO: Use this directly in the attention operation, as a bias
|
442 |
+
if exists(mask):
|
443 |
+
raise NotImplementedError
|
444 |
+
out = (
|
445 |
+
out.unsqueeze(0)
|
446 |
+
.reshape(b, self.heads, out.shape[1], self.dim_head)
|
447 |
+
.permute(0, 2, 1, 3)
|
448 |
+
.reshape(b, out.shape[1], self.heads * self.dim_head)
|
449 |
+
)
|
450 |
+
if additional_tokens is not None:
|
451 |
+
# remove additional token
|
452 |
+
out = out[:, n_tokens_to_mask:]
|
453 |
+
return self.to_out(out)
|
454 |
+
|
455 |
+
|
456 |
+
|
457 |
+
class BasicTransformerBlock(nn.Module):
|
458 |
+
ATTENTION_MODES = {
|
459 |
+
"softmax": CrossAttention, # vanilla attention
|
460 |
+
"softmax-xformers": MemoryEfficientCrossAttention, # ampere
|
461 |
+
}
|
462 |
+
|
463 |
+
def __init__(
|
464 |
+
self,
|
465 |
+
dim,
|
466 |
+
n_heads,
|
467 |
+
d_head,
|
468 |
+
dropout=0.0,
|
469 |
+
context_dim=None,
|
470 |
+
gated_ff=True,
|
471 |
+
checkpoint=True,
|
472 |
+
disable_self_attn=False,
|
473 |
+
attn_mode="softmax",
|
474 |
+
sdp_backend=None,
|
475 |
+
):
|
476 |
+
super().__init__()
|
477 |
+
assert attn_mode in self.ATTENTION_MODES
|
478 |
+
if attn_mode != "softmax" and not XFORMERS_IS_AVAILABLE:
|
479 |
+
logpy.warn(
|
480 |
+
f"Attention mode '{attn_mode}' is not available. Falling "
|
481 |
+
f"back to native attention. This is not a problem in "
|
482 |
+
f"Pytorch >= 2.0. FYI, you are running with PyTorch "
|
483 |
+
f"version {torch.__version__}."
|
484 |
+
)
|
485 |
+
attn_mode = "softmax"
|
486 |
+
elif attn_mode == "softmax" and not SDP_IS_AVAILABLE:
|
487 |
+
logpy.warn(
|
488 |
+
"We do not support vanilla attention anymore, as it is too "
|
489 |
+
"expensive. Sorry."
|
490 |
+
)
|
491 |
+
if not XFORMERS_IS_AVAILABLE:
|
492 |
+
assert (
|
493 |
+
False
|
494 |
+
), "Please install xformers via e.g. 'pip install xformers==0.0.16'"
|
495 |
+
else:
|
496 |
+
logpy.info("Falling back to xformers efficient attention.")
|
497 |
+
attn_mode = "softmax-xformers"
|
498 |
+
attn_cls = self.ATTENTION_MODES[attn_mode]
|
499 |
+
if version.parse(torch.__version__) >= version.parse("2.0.0"):
|
500 |
+
assert sdp_backend is None or isinstance(sdp_backend, SDPBackend)
|
501 |
+
else:
|
502 |
+
assert sdp_backend is None
|
503 |
+
self.disable_self_attn = disable_self_attn
|
504 |
+
self.attn1 = attn_cls(
|
505 |
+
query_dim=dim,
|
506 |
+
heads=n_heads,
|
507 |
+
dim_head=d_head,
|
508 |
+
dropout=dropout,
|
509 |
+
context_dim=context_dim if self.disable_self_attn else None,
|
510 |
+
backend=sdp_backend,
|
511 |
+
) # is a self-attention if not self.disable_self_attn
|
512 |
+
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
513 |
+
self.attn2 = attn_cls(
|
514 |
+
query_dim=dim,
|
515 |
+
context_dim=context_dim,
|
516 |
+
heads=n_heads,
|
517 |
+
dim_head=d_head,
|
518 |
+
dropout=dropout,
|
519 |
+
backend=sdp_backend,
|
520 |
+
) # is self-attn if context is none
|
521 |
+
self.norm1 = nn.LayerNorm(dim)
|
522 |
+
self.norm2 = nn.LayerNorm(dim)
|
523 |
+
self.norm3 = nn.LayerNorm(dim)
|
524 |
+
self.checkpoint = checkpoint
|
525 |
+
if self.checkpoint:
|
526 |
+
logpy.debug(f"{self.__class__.__name__} is using checkpointing")
|
527 |
+
|
528 |
+
|
529 |
+
def forward(
|
530 |
+
self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
|
531 |
+
):
|
532 |
+
kwargs = {"x": x}
|
533 |
+
|
534 |
+
if context is not None:
|
535 |
+
kwargs.update({"context": context})
|
536 |
+
|
537 |
+
if additional_tokens is not None:
|
538 |
+
kwargs.update({"additional_tokens": additional_tokens})
|
539 |
+
|
540 |
+
if n_times_crossframe_attn_in_self:
|
541 |
+
kwargs.update(
|
542 |
+
{"n_times_crossframe_attn_in_self": n_times_crossframe_attn_in_self}
|
543 |
+
)
|
544 |
+
|
545 |
+
# return mixed_checkpoint(self._forward, kwargs, self.parameters(), self.checkpoint)
|
546 |
+
if self.checkpoint:
|
547 |
+
# inputs = {"x": x, "context": context}
|
548 |
+
return checkpoint(self._forward, x, context)
|
549 |
+
# return checkpoint(self._forward, inputs, self.parameters(), self.checkpoint)
|
550 |
+
else:
|
551 |
+
return self._forward(**kwargs)
|
552 |
+
|
553 |
+
def _forward(
|
554 |
+
self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
|
555 |
+
):
|
556 |
+
x = (
|
557 |
+
self.attn1(
|
558 |
+
self.norm1(x),
|
559 |
+
context=context if self.disable_self_attn else None,
|
560 |
+
additional_tokens=additional_tokens,
|
561 |
+
n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self
|
562 |
+
if not self.disable_self_attn
|
563 |
+
else 0,
|
564 |
+
)
|
565 |
+
+ x
|
566 |
+
)
|
567 |
+
x = (
|
568 |
+
self.attn2(
|
569 |
+
self.norm2(x), context=context, additional_tokens=additional_tokens
|
570 |
+
)
|
571 |
+
+ x
|
572 |
+
)
|
573 |
+
x = self.ff(self.norm3(x)) + x
|
574 |
+
return x
|
575 |
+
|
576 |
+
|
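A minimal usage sketch for BasicTransformerBlock above: self-attention, cross-attention to a conditioning context, and the GEGLU feed-forward, all with residual connections. checkpoint=False is chosen only to keep gradient checkpointing out of the sketch; shapes are illustrative and PyTorch >= 2.0 is assumed:

import torch

block = BasicTransformerBlock(dim=320, n_heads=8, d_head=40,
                              context_dim=1024, checkpoint=False)
x = torch.randn(2, 4096, 320)         # (batch, tokens, channels)
context = torch.randn(2, 77, 1024)    # conditioning tokens for attn2
out = block(x, context=context)       # residuals keep the shape at (2, 4096, 320)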
577 |
+
class BasicTransformerBlockWithAPM(BasicTransformerBlock):
|
578 |
+
|
579 |
+
def __init__(self, dim, n_heads, d_head, dropout=0.0, context_dim=None, gated_ff=True, checkpoint=True, disable_self_attn=False, attn_mode="softmax", sdp_backend=None, use_apm=False):
|
580 |
+
super().__init__(dim, n_heads, d_head, dropout, context_dim, gated_ff, checkpoint, disable_self_attn, attn_mode, sdp_backend)
|
581 |
+
# APM Addition
|
582 |
+
assert disable_self_attn == False
|
583 |
+
self.use_apm = use_apm
|
584 |
+
if use_apm:
|
585 |
+
tokens_apm_clip = 16+1
|
586 |
+
self.apm_conv = torch.nn.Conv1d(
|
587 |
+
tokens_apm_clip, 1, kernel_size=3, padding="same")
|
588 |
+
channel_dim_context = 1024
|
589 |
+
self.apm_ln = nn.LayerNorm(channel_dim_context)
|
590 |
+
self.apm_alpha = nn.Parameter(torch.tensor(0.))
|
591 |
+
|
592 |
+
|
593 |
+
def forward(self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
|
594 |
+
):
|
595 |
+
if context is not None and context.shape[1]>1 and self.use_apm:
|
596 |
+
print("using APM CONTEXT !!!!")
|
597 |
+
context_svd = context[:,:1]
|
598 |
+
context_mixed = self.apm_conv(context)
|
599 |
+
context_mixed = self.apm_ln(context_mixed)
|
600 |
+
context = context_svd + context_mixed * F.silu(self.apm_alpha)
|
601 |
+
return super().forward(x=x,context=context,additional_tokens=additional_tokens,n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self)
|
602 |
+
|
603 |
+
|
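A minimal usage sketch for the APM variant above (BasicTransformerBlockWithAPM): when use_apm is enabled and the context carries more than one token, the first token is kept as the SVD context and the remaining 16 tokens are mixed in through the learned Conv1d, scaled by the silu-gated apm_alpha. The 16+1 token count and the 1024-channel context follow the hard-coded values in __init__; everything else is illustrative:

import torch

blk = BasicTransformerBlockWithAPM(dim=320, n_heads=8, d_head=40,
                                   context_dim=1024, checkpoint=False, use_apm=True)
x = torch.randn(2, 4096, 320)         # (batch, tokens, channels)
context = torch.randn(2, 17, 1024)    # 1 SVD token + 16 additional APM tokens
out = blk(x, context=context)         # mixed context is fed to the parent block -> (2, 4096, 320)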
604 |
+
class BasicTransformerSingleLayerBlock(nn.Module):
|
605 |
+
ATTENTION_MODES = {
|
606 |
+
"softmax": CrossAttention, # vanilla attention
|
607 |
+
"softmax-xformers": MemoryEfficientCrossAttention # on the A100s not quite as fast as the above version
|
608 |
+
# (todo might depend on head_dim, check, falls back to semi-optimized kernels for dim!=[16,32,64,128])
|
609 |
+
}
|
610 |
+
|
611 |
+
def __init__(
|
612 |
+
self,
|
613 |
+
dim,
|
614 |
+
n_heads,
|
615 |
+
d_head,
|
616 |
+
dropout=0.0,
|
617 |
+
context_dim=None,
|
618 |
+
gated_ff=True,
|
619 |
+
checkpoint=True,
|
620 |
+
attn_mode="softmax",
|
621 |
+
):
|
622 |
+
super().__init__()
|
623 |
+
assert attn_mode in self.ATTENTION_MODES
|
624 |
+
attn_cls = self.ATTENTION_MODES[attn_mode]
|
625 |
+
self.attn1 = attn_cls(
|
626 |
+
query_dim=dim,
|
627 |
+
heads=n_heads,
|
628 |
+
dim_head=d_head,
|
629 |
+
dropout=dropout,
|
630 |
+
context_dim=context_dim,
|
631 |
+
)
|
632 |
+
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
633 |
+
self.norm1 = nn.LayerNorm(dim)
|
634 |
+
self.norm2 = nn.LayerNorm(dim)
|
635 |
+
self.checkpoint = checkpoint
|
636 |
+
|
637 |
+
def forward(self, x, context=None):
|
638 |
+
# inputs = {"x": x, "context": context}
|
639 |
+
# return checkpoint(self._forward, inputs, self.parameters(), self.checkpoint)
|
640 |
+
return checkpoint(self._forward, x, context)
|
641 |
+
|
642 |
+
def _forward(self, x, context=None):
|
643 |
+
x = self.attn1(self.norm1(x), context=context) + x
|
644 |
+
x = self.ff(self.norm2(x)) + x
|
645 |
+
return x
|
646 |
+
|
647 |
+
|
648 |
+
class SpatialTransformer(nn.Module):
|
649 |
+
"""
|
650 |
+
Transformer block for image-like data.
|
651 |
+
First, project the input (aka embedding)
|
652 |
+
and reshape to b, t, d.
|
653 |
+
Then apply standard transformer action.
|
654 |
+
Finally, reshape to image
|
655 |
+
NEW: use_linear for more efficiency instead of the 1x1 convs
|
656 |
+
"""
|
657 |
+
|
658 |
+
def __init__(
|
659 |
+
self,
|
660 |
+
in_channels,
|
661 |
+
n_heads,
|
662 |
+
d_head,
|
663 |
+
depth=1,
|
664 |
+
dropout=0.0,
|
665 |
+
context_dim=None,
|
666 |
+
disable_self_attn=False,
|
667 |
+
use_linear=False,
|
668 |
+
attn_type="softmax",
|
669 |
+
use_checkpoint=True,
|
670 |
+
# sdp_backend=SDPBackend.FLASH_ATTENTION
|
671 |
+
sdp_backend=None,
|
672 |
+
use_apm: bool = False,
|
673 |
+
):
|
674 |
+
super().__init__()
|
675 |
+
logpy.debug(
|
676 |
+
f"constructing {self.__class__.__name__} of depth {depth} w/ "
|
677 |
+
f"{in_channels} channels and {n_heads} heads."
|
678 |
+
)
|
679 |
+
|
680 |
+
if exists(context_dim) and not isinstance(context_dim, list):
|
681 |
+
context_dim = [context_dim]
|
682 |
+
if exists(context_dim) and isinstance(context_dim, list):
|
683 |
+
if depth != len(context_dim):
|
684 |
+
logpy.warn(
|
685 |
+
f"{self.__class__.__name__}: Found context dims "
|
686 |
+
f"{context_dim} of depth {len(context_dim)}, which does not "
|
687 |
+
f"match the specified 'depth' of {depth}. Setting context_dim "
|
688 |
+
f"to {depth * [context_dim[0]]} now."
|
689 |
+
)
|
690 |
+
# depth does not match context dims.
|
691 |
+
assert all(
|
692 |
+
map(lambda x: x == context_dim[0], context_dim)
|
693 |
+
), "need homogenous context_dim to match depth automatically"
|
694 |
+
context_dim = depth * [context_dim[0]]
|
695 |
+
elif context_dim is None:
|
696 |
+
context_dim = [None] * depth
|
697 |
+
self.in_channels = in_channels
|
698 |
+
inner_dim = n_heads * d_head
|
699 |
+
self.norm = Normalize(in_channels)
|
700 |
+
if not use_linear:
|
701 |
+
self.proj_in = nn.Conv2d(
|
702 |
+
in_channels, inner_dim, kernel_size=1, stride=1, padding=0
|
703 |
+
)
|
704 |
+
else:
|
705 |
+
self.proj_in = nn.Linear(in_channels, inner_dim)
|
706 |
+
|
707 |
+
if use_apm:
|
708 |
+
print("APM TRANSFORMER BLOCK")
|
709 |
+
self.transformer_blocks = nn.ModuleList(
|
710 |
+
[
|
711 |
+
BasicTransformerBlockWithAPM(
|
712 |
+
inner_dim,
|
713 |
+
n_heads,
|
714 |
+
d_head,
|
715 |
+
dropout=dropout,
|
716 |
+
context_dim=context_dim[d],
|
717 |
+
disable_self_attn=disable_self_attn,
|
718 |
+
attn_mode=attn_type,
|
719 |
+
checkpoint=use_checkpoint,
|
720 |
+
sdp_backend=sdp_backend,
|
721 |
+
use_apm=use_apm,
|
722 |
+
)
|
723 |
+
for d in range(depth)
|
724 |
+
]
|
725 |
+
)
|
726 |
+
else:
|
727 |
+
self.transformer_blocks = nn.ModuleList(
|
728 |
+
[
|
729 |
+
BasicTransformerBlock(
|
730 |
+
inner_dim,
|
731 |
+
n_heads,
|
732 |
+
d_head,
|
733 |
+
dropout=dropout,
|
734 |
+
context_dim=context_dim[d],
|
735 |
+
disable_self_attn=disable_self_attn,
|
736 |
+
attn_mode=attn_type,
|
737 |
+
checkpoint=use_checkpoint,
|
738 |
+
sdp_backend=sdp_backend,
|
739 |
+
)
|
740 |
+
for d in range(depth)
|
741 |
+
]
|
742 |
+
)
|
743 |
+
if not use_linear:
|
744 |
+
self.proj_out = zero_module(
|
745 |
+
nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
|
746 |
+
)
|
747 |
+
else:
|
748 |
+
# self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
|
749 |
+
self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
|
750 |
+
self.use_linear = use_linear
|
751 |
+
|
752 |
+
def forward(self, x, context=None):
|
753 |
+
# note: if no context is given, cross-attention defaults to self-attention
|
754 |
+
if not isinstance(context, list):
|
755 |
+
context = [context]
|
756 |
+
b, c, h, w = x.shape
|
757 |
+
x_in = x
|
758 |
+
x = self.norm(x)
|
759 |
+
if not self.use_linear:
|
760 |
+
x = self.proj_in(x)
|
761 |
+
x = rearrange(x, "b c h w -> b (h w) c").contiguous()
|
762 |
+
if self.use_linear:
|
763 |
+
x = self.proj_in(x)
|
764 |
+
for i, block in enumerate(self.transformer_blocks):
|
765 |
+
if i > 0 and len(context) == 1:
|
766 |
+
i = 0 # use same context for each block
|
767 |
+
x = block(x, context=context[i])
|
768 |
+
if self.use_linear:
|
769 |
+
x = self.proj_out(x)
|
770 |
+
x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
|
771 |
+
if not self.use_linear:
|
772 |
+
x = self.proj_out(x)
|
773 |
+
return x + x_in
|
774 |
+
|
775 |
+
|
776 |
+
class SimpleTransformer(nn.Module):
|
777 |
+
def __init__(
|
778 |
+
self,
|
779 |
+
dim: int,
|
780 |
+
depth: int,
|
781 |
+
heads: int,
|
782 |
+
dim_head: int,
|
783 |
+
context_dim: Optional[int] = None,
|
784 |
+
dropout: float = 0.0,
|
785 |
+
checkpoint: bool = True,
|
786 |
+
):
|
787 |
+
super().__init__()
|
788 |
+
self.layers = nn.ModuleList([])
|
789 |
+
for _ in range(depth):
|
790 |
+
self.layers.append(
|
791 |
+
BasicTransformerBlock(
|
792 |
+
dim,
|
793 |
+
heads,
|
794 |
+
dim_head,
|
795 |
+
dropout=dropout,
|
796 |
+
context_dim=context_dim,
|
797 |
+
attn_mode="softmax-xformers",
|
798 |
+
checkpoint=checkpoint,
|
799 |
+
)
|
800 |
+
)
|
801 |
+
|
802 |
+
def forward(
|
803 |
+
self,
|
804 |
+
x: torch.Tensor,
|
805 |
+
context: Optional[torch.Tensor] = None,
|
806 |
+
) -> torch.Tensor:
|
807 |
+
for layer in self.layers:
|
808 |
+
x = layer(x, context)
|
809 |
+
return x
|
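Finally, a minimal usage sketch for SpatialTransformer from the file above, which wraps the transformer blocks for image-like (b, c, h, w) features: the input is normalised, flattened to tokens, run through the blocks with an optional conditioning context, projected back, and added residually. Shapes are illustrative and PyTorch >= 2.0 is assumed:

import torch

st = SpatialTransformer(in_channels=320, n_heads=8, d_head=40, depth=1,
                        context_dim=1024, use_linear=True, use_checkpoint=False)
feat = torch.randn(2, 320, 64, 64)    # (batch, channels, height, width), e.g. one UNet level
context = torch.randn(2, 77, 1024)    # conditioning tokens, reused by every block
out = st(feat, context=context)       # (2, 320, 64, 64), thanks to the residual connection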
models/svd/sgm/modules/autoencoding/__init__.py
ADDED
File without changes
|