ai-forever committed on
Commit
9d3c2b7
1 Parent(s): 3839d6c
app.py CHANGED
@@ -1,154 +1,128 @@
  import gradio as gr
- import numpy as np
- import random
-
- # import spaces #[uncomment to use ZeroGPU]
- from diffusers import DiffusionPipeline
  import torch

- device = "cuda" if torch.cuda.is_available() else "cpu"
- model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
-
- if torch.cuda.is_available():
-     torch_dtype = torch.float16
- else:
-     torch_dtype = torch.float32
-
- pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
- pipe = pipe.to(device)

- MAX_SEED = np.iinfo(np.int32).max
- MAX_IMAGE_SIZE = 1024
-
-
- # @spaces.GPU #[uncomment to use ZeroGPU]
- def infer(
-     prompt,
-     negative_prompt,
-     seed,
-     randomize_seed,
-     width,
-     height,
-     guidance_scale,
-     num_inference_steps,
-     progress=gr.Progress(track_tqdm=True),
  ):
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-
-     generator = torch.Generator().manual_seed(seed)
-
-     image = pipe(
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         guidance_scale=guidance_scale,
-         num_inference_steps=num_inference_steps,
-         width=width,
-         height=height,
-         generator=generator,
-     ).images[0]
-
-     return image, seed
-
-
- examples = [
-     "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-     "An astronaut riding a green horse",
-     "A delicious ceviche cheesecake slice",
- ]
-
- css = """
- #col-container {
-     margin: 0 auto;
-     max-width: 640px;
- }
- """
-
- with gr.Blocks(css=css) as demo:
-     with gr.Column(elem_id="col-container"):
-         gr.Markdown(" # Text-to-Image Gradio Template")
-
-         with gr.Row():
-             prompt = gr.Text(
-                 label="Prompt",
-                 show_label=False,
-                 max_lines=1,
-                 placeholder="Enter your prompt",
-                 container=False,
-             )
-
-             run_button = gr.Button("Run", scale=0, variant="primary")
-
-         result = gr.Image(label="Result", show_label=False)
-
-         with gr.Accordion("Advanced Settings", open=False):
-             negative_prompt = gr.Text(
-                 label="Negative prompt",
-                 max_lines=1,
-                 placeholder="Enter a negative prompt",
-                 visible=False,
-             )
-
-             seed = gr.Slider(
-                 label="Seed",
-                 minimum=0,
-                 maximum=MAX_SEED,
-                 step=1,
-                 value=0,
-             )
-
-             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-             with gr.Row():
-                 width = gr.Slider(
-                     label="Width",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=1024,  # Replace with defaults that work for your model
-                 )
-
-                 height = gr.Slider(
-                     label="Height",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=1024,  # Replace with defaults that work for your model
-                 )
-
-             with gr.Row():
-                 guidance_scale = gr.Slider(
-                     label="Guidance scale",
-                     minimum=0.0,
-                     maximum=10.0,
-                     step=0.1,
-                     value=0.0,  # Replace with defaults that work for your model
-                 )
-
-                 num_inference_steps = gr.Slider(
-                     label="Number of inference steps",
-                     minimum=1,
-                     maximum=50,
-                     step=1,
-                     value=2,  # Replace with defaults that work for your model
-                 )
-
-         gr.Examples(examples=examples, inputs=[prompt])
-     gr.on(
-         triggers=[run_button.click, prompt.submit],
-         fn=infer,
-         inputs=[
-             prompt,
-             negative_prompt,
-             seed,
-             randomize_seed,
-             width,
-             height,
-             guidance_scale,
-             num_inference_steps,
-         ],
-         outputs=[result, seed],
-     )

  if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
+ import spaces
+ #import gradio.helpers
  import torch
+ import os
+ from glob import glob
+ from pathlib import Path
+ from typing import Optional

+ from diffusers import StableVideoDiffusionPipeline
+ from diffusers.utils import load_image, export_to_video
+ from PIL import Image

+ import uuid
+ import random
+ from huggingface_hub import hf_hub_download
+
+ #gradio.helpers.CACHED_FOLDER = '/data/cache'
+
+ pipe = StableVideoDiffusionPipeline.from_pretrained(
+     "multimodalart/stable-video-diffusion", torch_dtype=torch.float16, variant="fp16"
+ )
+ pipe.to("cuda")
+ #pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+ #pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)
+
+ max_64_bit_int = 2**63 - 1
+
+ @spaces.GPU(duration=120)
+ def sample(
+     image: Image,
+     seed: Optional[int] = 42,
+     randomize_seed: bool = True,
+     motion_bucket_id: int = 127,
+     fps_id: int = 6,
+     version: str = "svd_xt",
+     cond_aug: float = 0.02,
+     decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
+     device: str = "cuda",
+     output_folder: str = "outputs",
+     progress=gr.Progress(track_tqdm=True)
  ):
+     if image.mode == "RGBA":
+         image = image.convert("RGB")
+
+     if randomize_seed:
+         seed = random.randint(0, max_64_bit_int)
+     generator = torch.manual_seed(seed)
+
+     os.makedirs(output_folder, exist_ok=True)
+     base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+     video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+
+     frames = pipe(image, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=0.1, num_frames=25).frames[0]
+     export_to_video(frames, video_path, fps=fps_id)
+     torch.manual_seed(seed)
+
+     return video_path, seed
+
+ def resize_image(image, output_size=(1024, 576)):
+     # Calculate aspect ratios
+     target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
+     image_aspect = image.width / image.height  # Aspect ratio of the original image
+
+     # Resize then crop if the original image is larger
+     if image_aspect > target_aspect:
+         # Resize the image to match the target height, maintaining aspect ratio
+         new_height = output_size[1]
+         new_width = int(new_height * image_aspect)
+         resized_image = image.resize((new_width, new_height), Image.LANCZOS)
+         # Calculate coordinates for cropping
+         left = (new_width - output_size[0]) / 2
+         top = 0
+         right = (new_width + output_size[0]) / 2
+         bottom = output_size[1]
+     else:
+         # Resize the image to match the target width, maintaining aspect ratio
+         new_width = output_size[0]
+         new_height = int(new_width / image_aspect)
+         resized_image = image.resize((new_width, new_height), Image.LANCZOS)
+         # Calculate coordinates for cropping
+         left = 0
+         top = (new_height - output_size[1]) / 2
+         right = output_size[0]
+         bottom = (new_height + output_size[1]) / 2
+
+     # Crop the image
+     cropped_image = resized_image.crop((left, top, right, bottom))
+     return cropped_image
+
+ with gr.Blocks() as demo:
+     gr.Markdown('''# Community demo for Stable Video Diffusion - Img2Vid - XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt), [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), [stability's ui waitlist](https://stability.ai/contact))
+ #### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate a `4s` video from a single image (`25 frames` at `6 fps`). This demo uses [🧨 diffusers for low VRAM and fast generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd).
+     ''')
+     with gr.Row():
+         with gr.Column():
+             image = gr.Image(label="Upload your image", type="pil")
+             generate_btn = gr.Button("Generate")
+         video = gr.Video()
+     with gr.Accordion("Advanced options", open=False):
+         seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
+         randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+         motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
+         fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=6, minimum=5, maximum=30)
+
+     image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)
+     generate_btn.click(fn=sample, inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id], outputs=[video, seed], api_name="video")
+     gr.Examples(
+         examples=[
+             "images/blink_meme.png",
+             "images/confused2_meme.png",
+             "images/disaster_meme.png",
+             "images/distracted_meme.png",
+             "images/hide_meme.png",
+             "images/nazare_meme.png",
+             "images/success_meme.png",
+             "images/willy_meme.png",
+             "images/wink_meme.png"
+         ],
+         inputs=image,
+         outputs=[video, seed],
+         fn=sample,
+         cache_examples="lazy",
+     )

  if __name__ == "__main__":
+     #demo.queue(max_size=20, api_open=False)
+     demo.launch(share=True, show_api=False)
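
Note (not part of the diff): a minimal sketch of exercising the new `sample` and `resize_image` functions directly from Python, bypassing the Gradio UI. It assumes a CUDA device with the SVD pipeline already loaded as above; `my_image.png` is a placeholder file name.

```python
from PIL import Image

# Hypothetical smoke test for the functions defined in the new app.py.
img = Image.open("my_image.png")   # placeholder input, not shipped with the Space
img = resize_image(img)            # resizes and center-crops to (1024, 576)
video_path, used_seed = sample(img, seed=42, randomize_seed=False,
                               motion_bucket_id=127, fps_id=6)
print(f"wrote {video_path} with seed {used_seed}")
```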
assets/LADD.png ADDED
assets/MMDiT1.png ADDED
assets/MMDiT_block1.png ADDED
assets/discriminator.png ADDED
assets/discriminator_head.png ADDED
assets/pipeline.png ADDED
kandinsky/.DS_Store ADDED
Binary file (6.15 kB)
kandinsky/__init__.py ADDED
@@ -0,0 +1,157 @@
+ import os
+ from typing import Optional, Union
+
+ import torch
+ from omegaconf import OmegaConf
+ from .model.dit import get_dit, parallelize
+ from .model.text_embedders import get_text_embedder
+ from diffusers import AutoencoderKLCogVideoX, CogVideoXDDIMScheduler
+ from omegaconf.dictconfig import DictConfig
+ from huggingface_hub import hf_hub_download, snapshot_download
+
+ from .t2v_pipeline import Kandinsky4T2VPipeline
+
+ from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+
+
+ def get_T2V_pipeline(
+     device_map: Union[str, torch.device, dict],
+     resolution: int = 512,
+     cache_dir: str = './weights/',
+     dit_path: str = None,
+     text_encoder_path: str = None,
+     tokenizer_path: str = None,
+     vae_path: str = None,
+     scheduler_path: str = None,
+     conf_path: str = None,
+ ) -> Kandinsky4T2VPipeline:
+
+     assert resolution in [512]
+
+     if not isinstance(device_map, dict):
+         device_map = {
+             'dit': device_map,
+             'vae': device_map,
+             'text_embedder': device_map
+         }
+
+     try:
+         local_rank, world_size = int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"])
+     except (KeyError, ValueError):
+         local_rank, world_size = 0, 1
+
+     if world_size > 1:
+         device_mesh = init_device_mesh("cuda", (world_size,), mesh_dim_names=("tensor_parallel",))
+         device_map["dit"] = torch.device(f'cuda:{local_rank}')
+
+     os.makedirs(cache_dir, exist_ok=True)
+
+     if dit_path is None:
+         dit_path = hf_hub_download(
+             repo_id="ai-forever/kandinsky4", filename=f"kandinsky4_distil_{resolution}.pt", local_dir=cache_dir
+         )
+
+     if vae_path is None:
+         vae_path = snapshot_download(
+             repo_id="THUDM/CogVideoX-5b", allow_patterns='vae/*', local_dir=cache_dir
+         )
+         vae_path = os.path.join(cache_dir, "vae/")
+
+     if scheduler_path is None:
+         scheduler_path = snapshot_download(
+             repo_id="THUDM/CogVideoX-5b", allow_patterns='scheduler/*', local_dir=cache_dir
+         )
+         scheduler_path = os.path.join(cache_dir, "scheduler/")
+
+     if text_encoder_path is None:
+         text_encoder_path = snapshot_download(
+             repo_id="THUDM/CogVideoX-5b", allow_patterns='text_encoder/*', local_dir=cache_dir
+         )
+         text_encoder_path = os.path.join(cache_dir, "text_encoder/")
+
+     if tokenizer_path is None:
+         tokenizer_path = snapshot_download(
+             repo_id="THUDM/CogVideoX-5b", allow_patterns='tokenizer/*', local_dir=cache_dir
+         )
+         tokenizer_path = os.path.join(cache_dir, "tokenizer/")
+
+     if conf_path is None:
+         conf = get_default_conf(vae_path, text_encoder_path, tokenizer_path, scheduler_path, dit_path)
+     else:
+         conf = OmegaConf.load(conf_path)
+
+     dit = get_dit(conf.dit)
+     dit = dit.to(dtype=torch.bfloat16, device=device_map["dit"])
+
+     noise_scheduler = CogVideoXDDIMScheduler.from_pretrained(conf.dit.scheduler)
+
+     if world_size > 1:
+         dit = parallelize(dit, device_mesh["tensor_parallel"])
+
+     text_embedder = get_text_embedder(conf)
+     text_embedder = text_embedder.freeze()
+     if local_rank == 0:
+         text_embedder = text_embedder.to(device=device_map["text_embedder"], dtype=torch.bfloat16)
+
+     vae = AutoencoderKLCogVideoX.from_pretrained(conf.vae.checkpoint_path)
+     vae = vae.eval()
+     if local_rank == 0:
+         vae = vae.to(device_map["vae"], dtype=torch.bfloat16)
+
+     return Kandinsky4T2VPipeline(
+         device_map=device_map,
+         dit=dit,
+         text_embedder=text_embedder,
+         vae=vae,
+         noise_scheduler=noise_scheduler,
+         resolution=resolution,
+         local_dit_rank=local_rank,
+         world_size=world_size,
+     )
+
+
+ def get_default_conf(
+     vae_path,
+     text_encoder_path,
+     tokenizer_path,
+     scheduler_path,
+     dit_path,
+ ) -> DictConfig:
+     dit_params = {
+         'in_visual_dim': 16,
+         'in_text_dim': 4096,
+         'out_visual_dim': 16,
+         'time_dim': 512,
+         'patch_size': [1, 2, 2],
+         'model_dim': 3072,
+         'ff_dim': 12288,
+         'num_blocks': 21,
+         'axes_dims': [16, 24, 24]
+     }
+
+     conf = {
+         'vae': {
+             'checkpoint_path': vae_path
+         },
+         'text_embedder': {
+             'emb_size': 4096,
+             'tokens_lenght': 224,
+             'params': {
+                 'checkpoint_path': text_encoder_path,
+                 'tokenizer_path': tokenizer_path
+             }
+         },
+         'dit': {
+             'scheduler': scheduler_path,
+             'checkpoint_path': dit_path,
+             'params': dit_params
+         },
+         'resolution': 512,
+     }
+
+     return DictConfig(conf)
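
For orientation (not part of the committed file): a minimal single-GPU sketch of how this factory could be used, based only on the signatures above; the prompt is a placeholder and checkpoints are downloaded into the default `./weights/` cache on first use.

```python
import torch
from kandinsky import get_T2V_pipeline

# Build the distilled T2V pipeline on one device.
pipe = get_T2V_pipeline(device_map=torch.device("cuda:0"), resolution=512)

# Only the 512 resolution bucket is supported; time_length=0 would return
# PIL images instead of writing a video file.
pipe(
    text="a snow leopard walking through a winter forest",  # placeholder prompt
    time_length=12,
    width=512,
    height=512,
    save_path="./test.mp4",
)
```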
kandinsky/model/__init__.py ADDED
File without changes
kandinsky/model/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (171 Bytes)
kandinsky/model/__pycache__/dit.cpython-311.pyc ADDED
Binary file (11.2 kB)
kandinsky/model/__pycache__/dit_i2v.cpython-311.pyc ADDED
Binary file (11.5 kB)
kandinsky/model/__pycache__/nn.cpython-311.pyc ADDED
Binary file (24.9 kB)
kandinsky/model/__pycache__/nn_i2v.cpython-311.pyc ADDED
Binary file (7.18 kB)
kandinsky/model/__pycache__/text_embedders.cpython-311.pyc ADDED
Binary file (4.14 kB)
kandinsky/model/__pycache__/utils.cpython-311.pyc ADDED
Binary file (8.02 kB)
kandinsky/model/dit.py ADDED
@@ -0,0 +1,201 @@
+ import math
+
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+ from diffusers import CogVideoXDDIMScheduler
+
+ from .nn import TimeEmbeddings, TextEmbeddings, VisualEmbeddings, RoPE3D, Modulation, MultiheadSelfAttention, MultiheadSelfAttentionTP, FeedForward, OutLayer
+ from .utils import exist
+
+
+ from torch.distributed.tensor.parallel import (
+     ColwiseParallel,
+     PrepareModuleInput,
+     PrepareModuleOutput,
+     RowwiseParallel,
+     SequenceParallel,
+     parallelize_module,
+ )
+
+ from torch.distributed._tensor import Replicate, Shard
+
+ def parallelize(model, tp_mesh):
+     if tp_mesh.size() > 1:
+
+         plan = {
+             "in_layer": ColwiseParallel(),
+             "out_layer": RowwiseParallel(
+                 output_layouts=Replicate(),
+             )
+         }
+         parallelize_module(model.time_embeddings, tp_mesh, plan)
+
+         plan = {
+             "in_layer": ColwiseParallel(output_layouts=Replicate(),)
+         }
+         parallelize_module(model.text_embeddings, tp_mesh, plan)
+         parallelize_module(model.visual_embeddings, tp_mesh, plan)
+
+         for i, doubled_transformer_block in enumerate(model.transformer_blocks):
+             for j, transformer_block in enumerate(doubled_transformer_block):
+                 transformer_block.self_attention = MultiheadSelfAttentionTP(transformer_block.self_attention)
+                 plan = {
+                     # text modulation
+                     "text_modulation": PrepareModuleInput(
+                         input_layouts=(None, None),
+                         desired_input_layouts=(Replicate(), None),
+                     ),
+                     "text_modulation.out_layer": ColwiseParallel(output_layouts=Replicate(),),
+                     # visual modulation
+                     "visual_modulation": PrepareModuleInput(
+                         input_layouts=(None, None),
+                         desired_input_layouts=(Replicate(), None),
+                     ),
+                     "visual_modulation.out_layer": ColwiseParallel(output_layouts=Replicate(), use_local_output=True),
+
+                     # self_attention_norm
+                     "self_attention_norm": SequenceParallel(sequence_dim=0, use_local_output=True),  # TODO is this needed at all? The input may mix several videos.
+
+                     # self_attention
+                     "self_attention.to_query": ColwiseParallel(
+                         input_layouts=Replicate(),
+                     ),
+                     "self_attention.to_key": ColwiseParallel(
+                         input_layouts=Replicate(),
+                     ),
+                     "self_attention.to_value": ColwiseParallel(
+                         input_layouts=Replicate(),
+                     ),
+
+                     "self_attention.query_norm": SequenceParallel(sequence_dim=0, use_local_output=True),
+                     "self_attention.key_norm": SequenceParallel(sequence_dim=0, use_local_output=True),
+
+                     "self_attention.output_layer": RowwiseParallel(
+                         # input_layouts=(Shard(0), ),
+                         output_layouts=Replicate(),
+                     ),
+
+                     # feed_forward_norm
+                     "feed_forward_norm": SequenceParallel(sequence_dim=0, use_local_output=True),
+
+                     # feed_forward
+                     "feed_forward.in_layer": ColwiseParallel(),
+                     "feed_forward.out_layer": RowwiseParallel(),
+                 }
+                 self_attn = transformer_block.self_attention
+                 self_attn.num_heads = self_attn.num_heads // tp_mesh.size()
+                 parallelize_module(transformer_block, tp_mesh, plan)
+
+         plan = {
+             "modulation_out": ColwiseParallel(output_layouts=Replicate(),),
+             "out_layer": ColwiseParallel(output_layouts=Replicate(),),
+         }
+         parallelize_module(model.out_layer, tp_mesh, plan)
+
+         plan = {
+             "time_embeddings": PrepareModuleInput(desired_input_layouts=Replicate(),),
+             "text_embeddings": PrepareModuleInput(desired_input_layouts=Replicate(),),
+             "visual_embeddings": PrepareModuleInput(desired_input_layouts=Replicate(),),
+             "out_layer": PrepareModuleInput(
+                 input_layouts=(None, None, None, None),
+                 desired_input_layouts=(Replicate(), Replicate(), Replicate(), None)),
+         }
+         parallelize_module(model, tp_mesh, {})
+     return model
+
+ class TransformerBlock(nn.Module):
+
+     def __init__(self, model_dim, time_dim, ff_dim, head_dim=64):
+         super().__init__()
+         self.visual_modulation = Modulation(time_dim, model_dim)
+         self.text_modulation = Modulation(time_dim, model_dim)
+
+         self.self_attention_norm = nn.LayerNorm(model_dim, elementwise_affine=True)
+         self.self_attention = MultiheadSelfAttention(model_dim, head_dim)
+
+         self.feed_forward_norm = nn.LayerNorm(model_dim, elementwise_affine=True)
+         self.feed_forward = FeedForward(model_dim, ff_dim)
+
+     def forward(self, visual_embed, text_embed, time_embed, rope, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type):
+         visual_shape = visual_embed.shape[:-1]
+         visual_self_attn_params, visual_ff_params = self.visual_modulation(time_embed, visual_cu_seqlens)
+         text_self_attn_params, text_ff_params = self.text_modulation(time_embed, text_cu_seqlens)
+
+         visual_shift, visual_scale, visual_gate = torch.chunk(visual_self_attn_params, 3, dim=-1)
+         text_shift, text_scale, text_gate = torch.chunk(text_self_attn_params, 3, dim=-1)
+         visual_out = self.self_attention_norm(visual_embed) * (visual_scale[:, None, None] + 1.) + visual_shift[:, None, None]
+         text_out = self.self_attention_norm(text_embed) * (text_scale + 1.) + text_shift
+         visual_out, text_out = self.self_attention(visual_out, text_out, rope, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type)
+
+         visual_embed = visual_embed + visual_gate[:, None, None] * visual_out
+         text_embed = text_embed + text_gate * text_out
+
+         visual_shift, visual_scale, visual_gate = torch.chunk(visual_ff_params, 3, dim=-1)
+         visual_out = self.feed_forward_norm(visual_embed) * (visual_scale[:, None, None] + 1.) + visual_shift[:, None, None]
+         visual_embed = visual_embed + visual_gate[:, None, None] * self.feed_forward(visual_out)
+
+         text_shift, text_scale, text_gate = torch.chunk(text_ff_params, 3, dim=-1)
+         text_out = self.feed_forward_norm(text_embed) * (text_scale + 1.) + text_shift
+         text_embed = text_embed + text_gate * self.feed_forward(text_out)
+         return visual_embed, text_embed
+
+
+ class DiffusionTransformer3D(nn.Module):
+
+     def __init__(
+         self,
+         in_visual_dim=4,
+         in_text_dim=2048,
+         time_dim=512,
+         out_visual_dim=4,
+         patch_size=(1, 2, 2),
+         model_dim=2048,
+         ff_dim=5120,
+         num_blocks=8,
+         axes_dims=(16, 24, 24),
+     ):
+         super().__init__()
+         head_dim = sum(axes_dims)
+         self.in_visual_dim = in_visual_dim
+         self.model_dim = model_dim
+         self.num_blocks = num_blocks
+
+         self.time_embeddings = TimeEmbeddings(model_dim, time_dim)
+         self.text_embeddings = TextEmbeddings(in_text_dim, model_dim)
+         self.visual_embeddings = VisualEmbeddings(in_visual_dim, model_dim, patch_size)
+         self.rope_embeddings = RoPE3D(axes_dims)
+
+         self.transformer_blocks = nn.ModuleList([
+             nn.ModuleList([
+                 TransformerBlock(model_dim, time_dim, ff_dim, head_dim),
+                 TransformerBlock(model_dim, time_dim, ff_dim, head_dim),
+             ]) for _ in range(num_blocks)
+         ])
+
+         self.out_layer = OutLayer(model_dim, time_dim, out_visual_dim, patch_size)
+
+     def forward(self, x, text_embed, time, visual_cu_seqlens, text_cu_seqlens, num_groups=(1, 1, 1), scale_factor=(1., 1., 1.)):
+         time_embed = self.time_embeddings(time)
+         text_embed = self.text_embeddings(text_embed)
+         visual_embed = self.visual_embeddings(x)
+         rope = self.rope_embeddings(visual_embed, visual_cu_seqlens, scale_factor)
+
+         for i, (local_attention, global_attention) in enumerate(self.transformer_blocks):
+             visual_embed, text_embed = local_attention(
+                 visual_embed, text_embed, time_embed, rope, visual_cu_seqlens, text_cu_seqlens, num_groups, 'local'
+             )
+             visual_embed, text_embed = global_attention(
+                 visual_embed, text_embed, time_embed, rope, visual_cu_seqlens, text_cu_seqlens, num_groups, 'global'
+             )
+
+         return self.out_layer(visual_embed, text_embed, time_embed, visual_cu_seqlens)
+
+
+ def get_dit(conf):
+     dit = DiffusionTransformer3D(**conf.params)
+     state_dict = torch.load(conf.checkpoint_path, weights_only=True, map_location=torch.device('cpu'))
+     dit.load_state_dict(state_dict, strict=False)
+     return dit
+
kandinsky/model/nn.py ADDED
@@ -0,0 +1,292 @@
+ import time
+ import math
+
+ import torch
+ from torch import nn
+ from flash_attn import flash_attn_varlen_qkvpacked_func
+
+ from .utils import exist, get_freqs, cat_interleave, split_interleave, to_1dimension, to_3dimension
+
+
+ def apply_rotary(x, rope):
+     x_ = x.reshape(*x.shape[:-1], -1, 1, 2).to(torch.float32)
+     x_out = rope[..., 0] * x_[..., 0] + rope[..., 1] * x_[..., 1]
+     return x_out.reshape(*x.shape)
+
+
+ class TimeEmbeddings(nn.Module):
+
+     def __init__(self, model_dim, time_dim, max_period=10000.):
+         super().__init__()
+         assert model_dim % 2 == 0
+         self.freqs = get_freqs(model_dim // 2, max_period)
+
+         self.in_layer = nn.Linear(model_dim, time_dim, bias=True)
+         self.activation = nn.SiLU()
+         self.out_layer = nn.Linear(time_dim, time_dim, bias=True)
+
+     def forward(self, time):
+         args = torch.outer(time, self.freqs.to(device=time.device))
+         time_embed = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         return self.out_layer(self.activation(self.in_layer(time_embed)))
+
+
+ class TextEmbeddings(nn.Module):
+
+     def __init__(self, text_dim, model_dim):
+         super().__init__()
+         self.in_layer = nn.Linear(text_dim, model_dim, bias=True)
+
+     def forward(self, text_embed):
+         return self.in_layer(text_embed)
+
+
+ class VisualEmbeddings(nn.Module):
+
+     def __init__(self, visual_dim, model_dim, patch_size):
+         super().__init__()
+         self.patch_size = patch_size
+         self.in_layer = nn.Linear(math.prod(patch_size) * visual_dim, model_dim)
+
+     def forward(self, x):
+         duration, height, width, dim = x.shape
+         x = x.view(
+             duration // self.patch_size[0], self.patch_size[0],
+             height // self.patch_size[1], self.patch_size[1],
+             width // self.patch_size[2], self.patch_size[2], dim
+         ).permute(0, 2, 4, 1, 3, 5, 6).flatten(3, 6)
+         return self.in_layer(x)
+
+
+ class RoPE3D(nn.Module):
+
+     def __init__(self, axes_dims, max_pos=(128, 128, 128), max_period=10000.):
+         super().__init__()
+         for i, (axes_dim, ax_max_pos) in enumerate(zip(axes_dims, max_pos)):
+             freq = get_freqs(axes_dim // 2, max_period)
+             pos = torch.arange(ax_max_pos, dtype=freq.dtype)
+             self.register_buffer(f'args_{i}', torch.outer(pos, freq))
+
+     def args(self, i, cu_seqlens):
+         args = self.__getattr__(f'args_{i}')
+         if torch.is_tensor(cu_seqlens):
+             args = torch.cat([args[:end] for end in torch.diff(cu_seqlens)])
+         else:
+             args = args[:cu_seqlens]
+         return args
+
+     def forward(self, x, cu_seqlens, scale_factor=(1., 1., 1.)):
+         duration, height, width = x.shape[:-1]
+         args = [
+             self.args(i, ax_cu_seqlens) / ax_scale_factor
+             for i, (ax_cu_seqlens, ax_scale_factor) in enumerate(zip([cu_seqlens, height, width], scale_factor))
+         ]
+         args = torch.cat([
+             args[0].view(duration, 1, 1, -1).repeat(1, height, width, 1),
+             args[1].view(1, height, 1, -1).repeat(duration, 1, width, 1),
+             args[2].view(1, 1, width, -1).repeat(duration, height, 1, 1)
+         ], dim=-1)
+         rope = torch.stack([torch.cos(args), -torch.sin(args), torch.sin(args), torch.cos(args)], dim=-1)
+         rope = rope.view(*rope.shape[:-1], 2, 2)
+         return rope.unsqueeze(-4)
+
+
+ class Modulation(nn.Module):
+
+     def __init__(self, time_dim, model_dim):
+         super().__init__()
+         self.activation = nn.SiLU()
+         self.out_layer = nn.Linear(time_dim, 6 * model_dim)
+         self.out_layer.weight.data.zero_()
+         self.out_layer.bias.data.zero_()
+
+     def forward(self, x, cu_seqlens):
+         modulation_params = self.out_layer(self.activation(x))
+         modulation_params = modulation_params.repeat_interleave(torch.diff(cu_seqlens), dim=0)
+         self_attn_params, ff_params = torch.chunk(modulation_params, 2, dim=-1)
+         return self_attn_params, ff_params
+
+ class MultiheadSelfAttention(nn.Module):
+
+     def __init__(self, num_channels, head_dim=64, attention_type='flash'):
+         super().__init__()
+         assert num_channels % head_dim == 0
+         self.attention_type = attention_type
+         self.num_heads = num_channels // head_dim
+
+         self.to_query_key_value = nn.Linear(num_channels, 3 * num_channels, bias=True)
+         self.query_norm = nn.LayerNorm(head_dim)
+         self.key_norm = nn.LayerNorm(head_dim)
+
+         self.output_layer = nn.Linear(num_channels, num_channels, bias=True)
+
+     def scaled_dot_product_attention(
+         self, visual_query_key_value, text_query_key_value, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type,
+         return_attn_probs=False
+     ):
+         if self.attention_type == 'flash':
+             visual_shape, text_len = visual_query_key_value.shape[:3], text_cu_seqlens[1]
+             visual_query_key_value, visual_cu_seqlens = to_1dimension(
+                 visual_query_key_value, visual_cu_seqlens, visual_shape, num_groups, attention_type
+             )
+             text_query_key_value = text_query_key_value.unsqueeze(0).expand(math.prod(num_groups), *text_query_key_value.size())
+             query_key_value = cat_interleave(visual_query_key_value, text_query_key_value, visual_cu_seqlens, text_cu_seqlens)
+             cu_seqlens = visual_cu_seqlens + text_cu_seqlens
+
+             max_seqlen = torch.diff(cu_seqlens).max()
+             query_key_value = query_key_value.flatten(0, 1)
+             large_cu_seqlens = torch.cat([cu_seqlens + i * cu_seqlens[-1] for i in range(math.prod(num_groups))])
+             out, softmax_lse, _ = flash_attn_varlen_qkvpacked_func(query_key_value, large_cu_seqlens, max_seqlen, return_attn_probs=True)
+             out = out.reshape(math.prod(num_groups), -1, *out.shape[1:]).flatten(-2, -1)
+
+             visual_out, text_out = split_interleave(out, cu_seqlens, text_len)
+             visual_out = to_3dimension(visual_out, visual_shape, num_groups, attention_type)
+             if return_attn_probs:
+                 return (visual_out, text_out), softmax_lse, None
+             return visual_out, text_out
+
+     def forward(self, visual_embed, text_embed, rope, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type):
+         visual_shape = visual_embed.shape[:-1]
+         visual_query_key_value = self.to_query_key_value(visual_embed)
+
+         visual_query, visual_key, visual_value = torch.chunk(visual_query_key_value, 3, dim=-1)
+         visual_query = self.query_norm(visual_query.reshape(*visual_shape, self.num_heads, -1)).type_as(visual_query)
+         visual_key = self.key_norm(visual_key.reshape(*visual_shape, self.num_heads, -1)).type_as(visual_key)
+         visual_value = visual_value.reshape(*visual_shape, self.num_heads, -1)
+         visual_query = apply_rotary(visual_query, rope).type_as(visual_query)
+         visual_key = apply_rotary(visual_key, rope).type_as(visual_key)
+         visual_query_key_value = torch.stack([visual_query, visual_key, visual_value], dim=3)
+
+         text_len = text_embed.shape[0]
+         text_query_key_value = self.to_query_key_value(text_embed)
+         text_query, text_key, text_value = torch.chunk(text_query_key_value, 3, dim=-1)
+         text_query = self.query_norm(text_query.reshape(text_len, self.num_heads, -1)).type_as(text_query)
+         text_key = self.key_norm(text_key.reshape(text_len, self.num_heads, -1)).type_as(text_key)
+         text_value = text_value.reshape(text_len, self.num_heads, -1)
+         text_query_key_value = torch.stack([text_query, text_key, text_value], dim=1)
+
+         visual_out, text_out = self.scaled_dot_product_attention(
+             visual_query_key_value, text_query_key_value, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type
+         )
+         visual_out = self.output_layer(visual_out)
+         text_out = self.output_layer(text_out)
+
+         return visual_out, text_out
+
+
+ class MultiheadSelfAttentionTP(nn.Module):
+
+     def __init__(self, initial_multihead_self_attention):
+         super().__init__()
+         num_channels = initial_multihead_self_attention.to_query_key_value.weight.shape[1]
+         self.num_heads = initial_multihead_self_attention.num_heads
+         head_dim = num_channels // self.num_heads
+         self.attention_type = initial_multihead_self_attention.attention_type
+
+         self.to_query = nn.Linear(num_channels, num_channels, bias=True)
+         self.to_key = nn.Linear(num_channels, num_channels, bias=True)
+         self.to_value = nn.Linear(num_channels, num_channels, bias=True)
+
+         weight = initial_multihead_self_attention.to_query_key_value.weight
+         bias = initial_multihead_self_attention.to_query_key_value.bias
+         self.to_query.weight = torch.nn.Parameter(weight[:num_channels])
+         self.to_key.weight = torch.nn.Parameter(weight[num_channels:2 * num_channels])
+         self.to_value.weight = torch.nn.Parameter(weight[2 * num_channels:])
+         self.to_query.bias = torch.nn.Parameter(bias[:num_channels])
+         self.to_key.bias = torch.nn.Parameter(bias[num_channels:2 * num_channels])
+         self.to_value.bias = torch.nn.Parameter(bias[2 * num_channels:])
+
+         self.query_norm = initial_multihead_self_attention.query_norm
+         self.key_norm = initial_multihead_self_attention.key_norm
+         self.output_layer = initial_multihead_self_attention.output_layer
+
+     def scaled_dot_product_attention(
+         self, visual_query_key_value, text_query_key_value, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type,
+         return_attn_probs=False
+     ):
+         if self.attention_type == 'flash':
+             visual_shape, text_len = visual_query_key_value.shape[:3], text_cu_seqlens[1]
+             visual_query_key_value, visual_cu_seqlens = to_1dimension(
+                 visual_query_key_value, visual_cu_seqlens, visual_shape, num_groups, attention_type
+             )
+             text_query_key_value = text_query_key_value.unsqueeze(0).expand(math.prod(num_groups), *text_query_key_value.size())
+             query_key_value = cat_interleave(visual_query_key_value, text_query_key_value, visual_cu_seqlens, text_cu_seqlens)
+             cu_seqlens = visual_cu_seqlens + text_cu_seqlens
+
+             max_seqlen = torch.diff(cu_seqlens).max()
+             query_key_value = query_key_value.flatten(0, 1)
+             large_cu_seqlens = torch.cat([cu_seqlens + i * cu_seqlens[-1] for i in range(math.prod(num_groups))])
+             out, softmax_lse, _ = flash_attn_varlen_qkvpacked_func(query_key_value, large_cu_seqlens, max_seqlen, return_attn_probs=True)
+             out = out.reshape(math.prod(num_groups), -1, *out.shape[1:]).flatten(-2, -1)
+
+             visual_out, text_out = split_interleave(out, cu_seqlens, text_len)
+             visual_out = to_3dimension(visual_out, visual_shape, num_groups, attention_type)
+             if return_attn_probs:
+                 return (visual_out, text_out), softmax_lse, None
+             return visual_out, text_out
+
+     def forward(self, visual_embed, text_embed, rope, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type):
+         visual_shape = visual_embed.shape[:-1]
+         visual_query, visual_key, visual_value = self.to_query(visual_embed), self.to_key(visual_embed), self.to_value(visual_embed)
+         visual_query = self.query_norm(visual_query.reshape(*visual_shape, self.num_heads, -1)).type_as(visual_query)
+         visual_key = self.key_norm(visual_key.reshape(*visual_shape, self.num_heads, -1)).type_as(visual_key)
+         visual_value = visual_value.reshape(*visual_shape, self.num_heads, -1)
+         visual_query = apply_rotary(visual_query, rope).type_as(visual_query)
+         visual_key = apply_rotary(visual_key, rope).type_as(visual_key)
+         visual_query_key_value = torch.stack([visual_query, visual_key, visual_value], dim=3)
+
+         text_len = text_embed.shape[0]
+         text_query, text_key, text_value = self.to_query(text_embed), self.to_key(text_embed), self.to_value(text_embed)
+         text_query = self.query_norm(text_query.reshape(text_len, self.num_heads, -1)).type_as(text_query)
+         text_key = self.key_norm(text_key.reshape(text_len, self.num_heads, -1)).type_as(text_key)
+         text_value = text_value.reshape(text_len, self.num_heads, -1)
+         text_query_key_value = torch.stack([text_query, text_key, text_value], dim=1)
+
+         visual_out, text_out = self.scaled_dot_product_attention(
+             visual_query_key_value, text_query_key_value, visual_cu_seqlens, text_cu_seqlens, num_groups, attention_type
+         )
+         visual_out = self.output_layer(visual_out)
+         text_out = self.output_layer(text_out)
+
+         return visual_out, text_out
+
+
+ class FeedForward(nn.Module):
+
+     def __init__(self, dim, ff_dim):
+         super().__init__()
+         self.in_layer = nn.Linear(dim, ff_dim, bias=True)
+         self.activation = nn.GELU()
+         self.out_layer = nn.Linear(ff_dim, dim, bias=True)
+
+     def forward(self, x):
+         return self.out_layer(self.activation(self.in_layer(x)))
+
+
+ class OutLayer(nn.Module):
+
+     def __init__(self, model_dim, time_dim, visual_dim, patch_size):
+         super().__init__()
+         self.patch_size = patch_size
+         self.norm = nn.LayerNorm(model_dim, elementwise_affine=True)
+         self.out_layer = nn.Linear(model_dim, math.prod(patch_size) * visual_dim, bias=True)
+
+         self.modulation_activation = nn.SiLU()
+         self.modulation_out = nn.Linear(time_dim, 2 * model_dim, bias=True)
+         self.modulation_out.weight.data.zero_()
+         self.modulation_out.bias.data.zero_()
+
+     def forward(self, visual_embed, text_embed, time_embed, visual_cu_seqlens):
+         modulation_params = self.modulation_out(self.modulation_activation(time_embed))
+         modulation_params = modulation_params.repeat_interleave(torch.diff(visual_cu_seqlens), dim=0)
+         shift, scale = torch.chunk(modulation_params, 2, dim=-1)
+         visual_embed = self.norm(visual_embed) * (scale[:, None, None, :] + 1) + shift[:, None, None, :]
+         x = self.out_layer(visual_embed)
+
+         duration, height, width, dim = x.shape
+         x = x.view(
+             duration, height, width,
+             -1, self.patch_size[0], self.patch_size[1], self.patch_size[2]
+         ).permute(0, 4, 1, 5, 2, 6, 3).flatten(0, 1).flatten(1, 2).flatten(2, 3)
+         return x
kandinsky/model/text_embedders.py ADDED
@@ -0,0 +1,62 @@
+ import torch
+ import numpy as np
+ import sys
+ import os
+
+ from .utils import freeze
+
+
+ class BaseEmbedder:
+     def __init__(self, conf):
+         self.checkpoint_path = conf.text_embedder.params.checkpoint_path
+         self.tokenizer_path = conf.text_embedder.params.tokenizer_path
+         self.max_length = conf.text_embedder.tokens_lenght
+         self.llm = None
+
+     def to(self, device='cpu', dtype=torch.float32):
+         self.llm = self.llm.to(device=device, dtype=dtype)
+         return self
+
+     def freeze(self):
+         self.llm = freeze(self.llm)
+         return self
+
+     def compile(self):
+         self.llm = torch.compile(self.llm)
+         return self
+
+
+ class EmbedderWithTokenizer(BaseEmbedder):
+
+     def __init__(self, conf):
+         super().__init__(conf)
+         self.tokenizer = None
+
+     def tokenize(self, text):
+         model_input = self.tokenizer(
+             text,
+             max_length=self.max_length,
+             truncation=True,
+             add_special_tokens=True,
+             padding='max_length',
+             return_tensors='pt'
+         )
+         return model_input.input_ids.to(self.llm.device)
+
+     def __call__(self, text):
+         return self.llm(self.tokenize(text), output_hidden_states=True)[0]
+
+
+ class T5TextEmbedder(EmbedderWithTokenizer):
+
+     def __init__(self, conf):
+         from transformers import T5EncoderModel, T5Tokenizer
+
+         super().__init__(conf)
+
+         self.llm = T5EncoderModel.from_pretrained(self.checkpoint_path)
+         self.tokenizer = T5Tokenizer.from_pretrained(self.tokenizer_path, clean_up_tokenization_spaces=False)
+
+
+ def get_text_embedder(conf):
+     return T5TextEmbedder(conf)
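
A hedged sketch (not part of the commit) of the minimal config this embedder reads, mirroring the keys used by `BaseEmbedder` and `get_default_conf` in `kandinsky/__init__.py` (including the `tokens_lenght` spelling); the local paths are placeholders.

```python
from omegaconf import OmegaConf

# Only the keys accessed by BaseEmbedder are required here.
conf = OmegaConf.create({
    "text_embedder": {
        "tokens_lenght": 224,  # max token length, key spelled as in the repo
        "params": {
            "checkpoint_path": "./weights/text_encoder/",  # placeholder local path
            "tokenizer_path": "./weights/tokenizer/",      # placeholder local path
        },
    },
})

embedder = get_text_embedder(conf).freeze()
emb = embedder("a test prompt")  # last hidden state, shape (1, 224, hidden_dim)
```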
kandinsky/model/utils.py ADDED
@@ -0,0 +1,107 @@
+ import math
+
+ import torch
+
+
+ def exist(item):
+     return item is not None
+
+ def freeze(model):
+     for p in model.parameters():
+         p.requires_grad = False
+     return model
+
+ def get_freqs(dim, max_period=10000.):
+     freqs = torch.exp(
+         -math.log(max_period) * torch.arange(start=0, end=dim, dtype=torch.float32) / dim
+     )
+     return freqs
+
+
+ def get_group_sizes(shape, num_groups):
+     return [*map(lambda x: x[0] // x[1], zip(shape, num_groups))]
+
+
+ def rescale_group_rope(num_groups, scale_factor, rescale_factor):
+     num_groups = [*map(lambda x: int(x[0] / x[1]), zip(num_groups, rescale_factor))]
+     scale_factor = [*map(lambda x: x[0] / x[1], zip(scale_factor, rescale_factor))]
+     return num_groups, scale_factor
+
+
+ def cat_interleave(visual_query_key_value, text_query_key_value, visual_cu_seqlens, text_cu_seqlens):
+     query_key_value = []
+     for local_visual_query_key_value, local_text_query_key_value in zip(
+         torch.split(visual_query_key_value, torch.diff(visual_cu_seqlens).tolist(), dim=1),
+         torch.split(text_query_key_value, torch.diff(text_cu_seqlens).tolist(), dim=1)
+     ):
+         query_key_value += [local_visual_query_key_value, local_text_query_key_value]
+     query_key_value = torch.cat(query_key_value, dim=1)
+     return query_key_value
+
+
+ def split_interleave(out, cu_seqlens, split_len):
+     visual_out, text_out = [], []
+     for local_out in torch.split(out, torch.diff(cu_seqlens).tolist(), dim=1):
+         visual_out.append(local_out[:, :-split_len])
+         text_out.append(local_out[0, -split_len:])
+     visual_out, text_out = torch.cat(visual_out, dim=1), torch.cat(text_out, dim=0)
+     return visual_out, text_out
+
+
+ def local_patching(x, shape, group_size, dim=0):
+     duration, height, width = shape
+     g1, g2, g3 = group_size
+     x = x.reshape(*x.shape[:dim], duration//g1, g1, height//g2, g2, width//g3, g3, *x.shape[dim+3:])
+     x = x.permute(
+         *range(len(x.shape[:dim])),
+         dim, dim+2, dim+4, dim+1, dim+3, dim+5,
+         *range(dim+6, len(x.shape))
+     )
+     x = x.flatten(dim, dim+2).flatten(dim+1, dim+3)
+     return x
+
+
+ def local_merge(x, shape, group_size, dim=0):
+     duration, height, width = shape
+     g1, g2, g3 = group_size
+     x = x.reshape(*x.shape[:dim], duration//g1, height//g2, width//g3, g1, g2, g3, *x.shape[dim+2:])
+     x = x.permute(
+         *range(len(x.shape[:dim])),
+         dim, dim+3, dim+1, dim+4, dim+2, dim+5,
+         *range(dim+6, len(x.shape))
+     )
+     x = x.flatten(dim, dim+1).flatten(dim+1, dim+2).flatten(dim+2, dim+3)
+     return x
+
+
+ def global_patching(x, shape, group_size, dim=0):
+     latent_group_size = [axis // axis_group_size for axis, axis_group_size in zip(shape, group_size)]
+     x = local_patching(x, shape, latent_group_size, dim)
+     x = x.transpose(dim, dim+1)
+     return x
+
+
+ def global_merge(x, shape, group_size, dim=0):
+     latent_group_size = [axis // axis_group_size for axis, axis_group_size in zip(shape, group_size)]
+     x = x.transpose(dim, dim+1)
+     x = local_merge(x, shape, latent_group_size, dim)
+     return x
+
+
+ def to_1dimension(visual_embed, visual_cu_seqlens, visual_shape, num_groups, attention_type):
+     group_size = get_group_sizes(visual_shape, num_groups)
+     if attention_type == 'local':
+         visual_embed = local_patching(visual_embed, visual_shape, group_size, dim=0)
+     if attention_type == 'global':
+         visual_embed = global_patching(visual_embed, visual_shape, group_size, dim=0)
+     visual_cu_seqlens = visual_cu_seqlens * math.prod(group_size[1:])
+     return visual_embed, visual_cu_seqlens
+
+
+ def to_3dimension(visual_embed, visual_shape, num_groups, attention_type):
+     group_size = get_group_sizes(visual_shape, num_groups)
+     if attention_type == 'local':
+         x = local_merge(visual_embed, visual_shape, group_size, dim=0)
+     if attention_type == 'global':
+         x = global_merge(visual_embed, visual_shape, group_size, dim=0)
+     return x
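
A small self-contained check (not part of the commit) illustrating that `local_patching` and `local_merge` are inverses on a dummy latent of shape `(duration, height, width, dim)`; the sizes below are arbitrary.

```python
import torch

# Dummy latent: 4 frames of an 8x8 grid with 16 channels, grouped into 2x4x4 blocks.
x = torch.randn(4, 8, 8, 16)
shape, group_size = (4, 8, 8), (2, 4, 4)

patched = local_patching(x, shape, group_size, dim=0)   # -> (num_blocks, elems_per_block, 16)
restored = local_merge(patched, shape, group_size, dim=0)

assert patched.shape == (2 * 2 * 2, 2 * 4 * 4, 16)
assert torch.equal(x, restored)
```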
kandinsky/t2v_pipeline.py ADDED
@@ -0,0 +1,201 @@
+ from typing import Union, List
+
+ import PIL
+ from PIL import Image
+
+ import numpy as np
+ from tqdm.auto import tqdm
+ import torch
+ import torchvision
+ from torchvision.transforms import ToPILImage
+ from einops import repeat
+ from diffusers import AutoencoderKLCogVideoX
+ from diffusers import CogVideoXDDIMScheduler
+
+ from .model.dit import DiffusionTransformer3D
+ from .model.text_embedders import T5TextEmbedder
+
+
+ @torch.no_grad()
+ def predict_x_0(noise_scheduler, model_output, timesteps, sample, device):
+     init_alpha_device = noise_scheduler.alphas_cumprod.device
+     alphas = noise_scheduler.alphas_cumprod.to(device)
+
+     alpha_prod_t = alphas[timesteps][:, None, None, None]
+     beta_prod_t = 1 - alpha_prod_t
+
+     pred_original_sample = (alpha_prod_t ** 0.5) * sample - (beta_prod_t ** 0.5) * model_output
+     noise_scheduler.alphas_cumprod.to(init_alpha_device)
+     return pred_original_sample
+
+
+ @torch.no_grad()
+ def get_velocity(
+     model, x, t, text_embed, visual_cu_seqlens, text_cu_seqlens,
+     num_goups=(1, 1, 1), scale_factor=(1., 1., 1.)
+ ):
+     pred_velocity = model(x, text_embed, t, visual_cu_seqlens, text_cu_seqlens, num_goups, scale_factor)
+
+     return pred_velocity
+
+
+ @torch.no_grad()
+ def diffusion_generate_renoise(
+     model, noise_scheduler, shape, device, num_steps, text_embed, visual_cu_seqlens, text_cu_seqlens,
+     num_goups=(1, 1, 1), scale_factor=(1., 1., 1.), progress=False, seed=6554
+ ):
+     generator = torch.Generator()
+     if seed is not None:
+         generator.manual_seed(seed)
+
+     img = torch.randn(*shape, generator=generator).to(torch.bfloat16).to(device)
+     noise_scheduler.set_timesteps(num_steps, device=device)
+
+     timesteps = noise_scheduler.timesteps
+     if progress:
+         timesteps = tqdm(timesteps)
+     for time in timesteps:
+         model_time = time.unsqueeze(0).repeat(visual_cu_seqlens.shape[0] - 1)
+         noise = torch.randn(img.shape, generator=generator).to(torch.bfloat16).to(device)
+         img = noise_scheduler.add_noise(img, noise, time)
+
+         pred_velocity = get_velocity(
+             model, img.to(torch.bfloat16), model_time,
+             text_embed.to(torch.bfloat16), visual_cu_seqlens,
+             text_cu_seqlens, num_goups, scale_factor
+         )
+
+         img = predict_x_0(noise_scheduler=noise_scheduler, model_output=pred_velocity.to(device), timesteps=model_time.to(device), sample=img.to(device), device=device)
+
+     return img
+
+
+ class Kandinsky4T2VPipeline:
+     def __init__(
+         self,
+         device_map: Union[str, torch.device, dict],  # {"dit": cuda:0, "vae": cuda:1, "text_embedder": cuda:1}
+         dit: DiffusionTransformer3D,
+         text_embedder: T5TextEmbedder,
+         vae: AutoencoderKLCogVideoX,
+         noise_scheduler: CogVideoXDDIMScheduler,  # TODO base class
+         resolution: int = 512,
+         local_dit_rank=0,
+         world_size=1,
+     ):
+         if resolution not in [512]:
+             raise ValueError("Resolution can only be 512")
+
+         self.dit = dit
+         self.noise_scheduler = noise_scheduler
+         self.text_embedder = text_embedder
+         self.vae = vae
+
+         self.resolution = resolution
+
+         self.device_map = device_map
+         self.local_dit_rank = local_dit_rank
+         self.world_size = world_size
+
+         self.RESOLUTIONS = {
+             512: [(512, 512), (352, 736), (736, 352), (384, 672), (672, 384), (480, 544), (544, 480)],
+         }
+
+     def __call__(
+         self,
+         text: str,
+         save_path: str = "./test.mp4",
+         bs: int = 1,
+         time_length: int = 12,  # time in seconds; 0 if you want to generate an image
+         width: int = 512,
+         height: int = 512,
+         seed: int = None,
+         return_frames: bool = False
+     ):
+         num_steps = 4
+
+         # SEED
+         if seed is None:
+             if self.local_dit_rank == 0:
+                 seed = torch.randint(2 ** 63 - 1, (1,)).to(self.local_dit_rank)
+             else:
+                 seed = torch.empty((1,), dtype=torch.int64).to(self.local_dit_rank)
+
+             if self.world_size > 1:
+                 torch.distributed.broadcast(seed, 0)
+
+             seed = seed.item()
+
+         assert bs == 1
+
+         if self.resolution != 512:
+             raise NotImplementedError("Only 512 resolution is available for now")
+
+         if (height, width) not in self.RESOLUTIONS[self.resolution]:
+             raise ValueError(f"Wrong height, width pair. Available (height, width) are: {self.RESOLUTIONS[self.resolution]}")
+
+         if num_steps != 4:
+             raise NotImplementedError("In the distilled version the number of steps has to be strictly equal to 4")
+
+         # PREPARATION
+         num_frames = 1 if time_length == 0 else time_length * 8 // 4 + 1
+
+         num_groups = (1, 1, 1) if self.resolution == 512 else (1, 2, 2)
+         scale_factor = (1., 1., 1.) if self.resolution == 512 else (1., 2., 2.)
+
+         # TEXT EMBEDDER
+         if self.local_dit_rank == 0:
+             with torch.no_grad():
+                 text_embed = self.text_embedder(text).squeeze(0).to(self.local_dit_rank, dtype=torch.bfloat16)
+         else:
+             text_embed = torch.empty(224, 4096, dtype=torch.bfloat16).to(self.local_dit_rank)
+
+         if self.world_size > 1:
+             torch.distributed.broadcast(text_embed, 0)
+
+         torch.cuda.empty_cache()
+
+         visual_cu_seqlens = num_frames * torch.arange(bs + 1, dtype=torch.int32, device=self.device_map["dit"])
+         text_cu_seqlens = text_embed.shape[0] * torch.arange(bs + 1, dtype=torch.int32, device=self.device_map["dit"])
+         bs_text_embed = text_embed.repeat(bs, 1).to(self.device_map["dit"])
+         shape = (bs * num_frames, height // 8, width // 8, 16)
+
+         # DIT
+         with torch.no_grad():
+             with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+                 images = diffusion_generate_renoise(
+                     self.dit, self.noise_scheduler, shape, self.device_map["dit"],
+                     num_steps, bs_text_embed, visual_cu_seqlens, text_cu_seqlens,
+                     num_groups, scale_factor, progress=True, seed=seed,
+                 )
+
+         torch.cuda.empty_cache()
+
+         # VAE
+         if self.local_dit_rank == 0:
+             self.vae.num_latent_frames_batch_size = 1 if time_length == 0 else 2
+             with torch.no_grad():
+                 images = 1 / self.vae.config.scaling_factor * images.to(device=self.device_map["vae"], dtype=torch.bfloat16)
+                 images = images.permute(0, 3, 1, 2) if time_length == 0 else images.permute(3, 0, 1, 2)
+                 images = self.vae.decode(images.unsqueeze(2 if time_length == 0 else 0)).sample.float()
+                 images = torch.clip((images + 1.) / 2., 0., 1.)
+
+             torch.cuda.empty_cache()
+
+         if self.local_dit_rank == 0:
+             # RESULTS
+             if time_length == 0:
+                 return_images = []
+                 for i, image in enumerate(images.squeeze(2).cpu()):
+                     return_images.append(ToPILImage()(image))
+                 return return_images
+             else:
+                 if return_frames:
+                     return_images = []
+                     for i, image in enumerate(images.squeeze(0).float().permute(1, 0, 2, 3).cpu()):
+                         return_images.append(ToPILImage()(image))
+                     return return_images
+                 else:
+                     torchvision.io.write_video(save_path, 255. * images.squeeze(0).float().permute(1, 2, 3, 0).cpu().numpy(), fps=8, options={"crf": "5"})