|
from abc import ABC, abstractmethod |
|
from typing import Tuple |
|
|
|
import torch |
|
from diffusers.configuration_utils import ConfigMixin |
|
from einops import rearrange |
|
from torch import Tensor |
|
|
|
from xora.utils.torch_utils import append_dims |
|
|
|
|
|
class Patchifier(ConfigMixin, ABC): |
|
def __init__(self, patch_size: int): |
|
super().__init__() |
|
self._patch_size = (1, patch_size, patch_size) |
|
|
|
@abstractmethod |
|
def patchify( |
|
self, latents: Tensor, frame_rates: Tensor, scale_grid: bool |
|
) -> Tuple[Tensor, Tensor]: |
|
pass |
|
|
|
@abstractmethod |
|
def unpatchify( |
|
self, |
|
latents: Tensor, |
|
output_height: int, |
|
output_width: int, |
|
output_num_frames: int, |
|
out_channels: int, |
|
) -> Tuple[Tensor, Tensor]: |
|
pass |
|
|
|
@property |
|
def patch_size(self): |
|
return self._patch_size |
|
|
|
def get_grid( |
|
self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device |
|
): |
|
f = orig_num_frames // self._patch_size[0] |
|
h = orig_height // self._patch_size[1] |
|
w = orig_width // self._patch_size[2] |
|
grid_h = torch.arange(h, dtype=torch.float32, device=device) |
|
grid_w = torch.arange(w, dtype=torch.float32, device=device) |
|
grid_f = torch.arange(f, dtype=torch.float32, device=device) |
|
grid = torch.meshgrid(grid_f, grid_h, grid_w) |
|
grid = torch.stack(grid, dim=0) |
|
grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1) |
|
|
|
if scale_grid is not None: |
|
for i in range(3): |
|
if isinstance(scale_grid[i], Tensor): |
|
scale = append_dims(scale_grid[i], grid.ndim - 1) |
|
else: |
|
scale = scale_grid[i] |
|
grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i] |
|
|
|
grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size) |
|
return grid |
|
|
|
|
|
class SymmetricPatchifier(Patchifier): |
|
def patchify( |
|
self, |
|
latents: Tensor, |
|
) -> Tuple[Tensor, Tensor]: |
|
latents = rearrange( |
|
latents, |
|
"b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)", |
|
p1=self._patch_size[0], |
|
p2=self._patch_size[1], |
|
p3=self._patch_size[2], |
|
) |
|
return latents |
|
|
|
def unpatchify( |
|
self, |
|
latents: Tensor, |
|
output_height: int, |
|
output_width: int, |
|
output_num_frames: int, |
|
out_channels: int, |
|
) -> Tuple[Tensor, Tensor]: |
|
output_height = output_height // self._patch_size[1] |
|
output_width = output_width // self._patch_size[2] |
|
latents = rearrange( |
|
latents, |
|
"b (f h w) (c p q) -> b c f (h p) (w q) ", |
|
f=output_num_frames, |
|
h=output_height, |
|
w=output_width, |
|
p=self._patch_size[1], |
|
q=self._patch_size[2], |
|
) |
|
return latents |
|
|