# File: Michelangelo/michelangelo/models/tsal/sal_transformer.py
# Repository page header (not code): Maikou's picture · all files first commit · 9c3a994
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from torch_cluster import fps
from typing import Optional
import math
from michelangelo.models.modules import checkpoint
from michelangelo.models.modules.embedder import FourierEmbedder
from michelangelo.models.modules.distributions import DiagonalGaussianDistribution
from michelangelo.models.modules.transformer_blocks import (
ResidualCrossAttentionBlock,
Transformer
)
from .tsal_base import ShapeAsLatentModule
class CrossAttentionEncoder(nn.Module):
    """Encode a point cloud into ``num_latents`` latent tokens.

    Points are passed through the Fourier embedder, optionally concatenated
    with per-point features, and linearly projected to ``width`` channels.
    ``torch_cluster.fps`` then selects ``num_latents`` points per batch
    element; their projected embeddings are handed to a
    ``ResidualCrossAttentionBlock`` as the first argument, with the full set
    of projected point embeddings as the second.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 fourier_embedder: FourierEmbedder,
                 point_feats: int,
                 width: int,
                 heads: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False):
        """
        Args:
            device: device on which the layers created here are allocated.
            dtype: dtype of the layers created here.
            num_latents: number of points kept by the sampling step, i.e. the
                number of latent tokens per batch element.
            fourier_embedder: positional embedder applied to ``pc``; its
                ``out_dim`` attribute sizes the input projection.
            point_feats: channel count of the optional per-point features
                concatenated after the positional embedding.
            width: channel width of the projection and attention block.
            heads: forwarded to ``ResidualCrossAttentionBlock``.
            init_scale: forwarded to ``ResidualCrossAttentionBlock``.
            qkv_bias: forwarded to ``ResidualCrossAttentionBlock``.
            use_ln_post: when True, apply a ``LayerNorm`` to the latents.
            use_checkpoint: forwarded as the last argument of ``checkpoint``
                in :meth:`forward`.
        """
        super().__init__()

        self.use_checkpoint = use_checkpoint
        self.num_latents = num_latents

        self.fourier_embedder = fourier_embedder
        # The projection input is the positional embedding plus the raw
        # per-point features, so ``feats`` must carry ``point_feats`` channels.
        self.input_proj = nn.Linear(self.fourier_embedder.out_dim + point_feats, width, device=device, dtype=dtype)
        self.cross_attn_encoder = ResidualCrossAttentionBlock(
            device=device,
            dtype=dtype,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias
        )

        if use_ln_post:
            self.ln_post = nn.LayerNorm(width, dtype=dtype, device=device)
        else:
            self.ln_post = None

    def _forward(self, pc, feats):
        """
        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]

        Returns:
            latents (torch.FloatTensor): [B, num_latents, width]
            center_pos (torch.FloatTensor): [B, num_latents, 3], the
                coordinates of the sampled points.
        """
        B, N, _ = pc.shape

        # Per-point batch index for ``fps``: 0 repeated N times, then 1, ...
        # Built directly on the input's device to avoid a host-side
        # allocation followed by a copy.
        batch = torch.arange(B, device=pc.device)
        batch = torch.repeat_interleave(batch, N)

        data = self.fourier_embedder(pc)
        if feats is not None:
            data = torch.cat([data, feats], dim=-1)
        data = self.input_proj(data)

        # NOTE(review): ``fps`` derives its sample count from ``ratio``; the
        # views below assume it yields exactly ``num_latents`` indices per
        # batch element (requires N >= num_latents) — confirm for point
        # counts where num_latents / N is not exactly representable.
        ratio = self.num_latents / N

        # ``reshape`` instead of ``view`` so that inputs whose memory layout
        # cannot be viewed (e.g. transposed tensors) are still accepted; for
        # viewable inputs the result is identical.
        flatten_pos = pc.reshape(B * N, -1)  # [B * N, 3]
        flatten_data = data.reshape(B * N, -1)  # [B * N, C]

        idx = fps(flatten_pos, batch, ratio=ratio)
        center_pos = flatten_pos[idx].view(B, self.num_latents, -1)
        query = flatten_data[idx].view(B, self.num_latents, -1)

        latents = self.cross_attn_encoder(query, data)

        if self.ln_post is not None:
            latents = self.ln_post(latents)

        return latents, center_pos

    def forward(self, pc: torch.FloatTensor, feats: Optional[torch.FloatTensor] = None):
        """Run :meth:`_forward` through the project's ``checkpoint`` helper.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]

        Returns:
            Whatever ``checkpoint`` returns for :meth:`_forward` — presumably
            the same ``(latents, center_pos)`` tuple; verify against the
            helper.
        """
        return checkpoint(self._forward, (pc, feats), self.parameters(), self.use_checkpoint)
class CrossAttentionDecoder(nn.Module):
    """Decode per-query outputs from a set of latent tokens.

    Query coordinates are Fourier-embedded and projected to ``width``
    channels, passed together with the latents through a
    ``ResidualCrossAttentionBlock``, layer-normalised, and finally projected
    to ``out_channels`` values per query.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 out_channels: int,
                 fourier_embedder: FourierEmbedder,
                 width: int,
                 heads: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 use_checkpoint: bool = False):
        """
        Args:
            device: device on which the layers created here are allocated.
            dtype: dtype of the layers created here.
            num_latents: forwarded to the attention block as ``n_data``.
            out_channels: number of output values per query.
            fourier_embedder: embedder applied to the query coordinates; its
                ``out_dim`` attribute sizes the query projection.
            width: channel width of the projection and attention block.
            heads: forwarded to ``ResidualCrossAttentionBlock``.
            init_scale: forwarded to ``ResidualCrossAttentionBlock``.
            qkv_bias: forwarded to ``ResidualCrossAttentionBlock``.
            use_checkpoint: forwarded as the last argument of ``checkpoint``
                in :meth:`forward`.
        """
        super().__init__()

        # Shared placement arguments for every layer built below.
        placement = dict(device=device, dtype=dtype)

        self.use_checkpoint = use_checkpoint
        self.fourier_embedder = fourier_embedder

        # Layers are created in the order they are applied in ``_forward``.
        embed_dim = self.fourier_embedder.out_dim
        self.query_proj = nn.Linear(embed_dim, width, **placement)
        self.cross_attn_decoder = ResidualCrossAttentionBlock(
            n_data=num_latents,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            **placement
        )
        self.ln_post = nn.LayerNorm(width, **placement)
        self.output_proj = nn.Linear(width, out_channels, **placement)

    def _forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        """Embed ``queries``, combine them with ``latents``, and project.

        Args:
            queries (torch.FloatTensor): query coordinates.
            latents (torch.FloatTensor): latent tokens given to the attention
                block as its second argument.

        Returns:
            torch.FloatTensor: output with ``out_channels`` as last dimension.
        """
        embedded = self.fourier_embedder(queries)
        projected = self.query_proj(embedded)
        attended = self.cross_attn_decoder(projected, latents)
        normed = self.ln_post(attended)
        return self.output_proj(normed)

    def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        """Run :meth:`_forward` through the project's ``checkpoint`` helper."""
        inputs = (queries, latents)
        return checkpoint(self._forward, inputs, self.parameters(), self.use_checkpoint)
class ShapeAsLatentTransformer(ShapeAsLatentModule):
    """Point-cloud auto-encoder built from a latent-token bottleneck.

    Pipeline (see :meth:`forward`):

    1. :class:`CrossAttentionEncoder` turns the point cloud into
       ``num_latents`` tokens of ``width`` channels.
    2. When ``embed_dim > 0`` the tokens go through a Gaussian bottleneck
       (``pre_kl`` -> ``DiagonalGaussianDistribution`` -> ``post_kl``).
    3. A ``Transformer`` processes the tokens.
    4. :class:`CrossAttentionDecoder` produces one value per query point.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 point_feats: int = 0,
                 embed_dim: int = 0,
                 num_freqs: int = 8,
                 include_pi: bool = True,
                 width: int,
                 layers: int,
                 heads: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False):
        """
        Args:
            device: device on which the layers created here are allocated.
            dtype: dtype of the layers created here.
            num_latents: number of latent tokens per shape.
            point_feats: channel count of the optional per-point features.
            embed_dim: channel count of the bottleneck latents; ``0`` disables
                the bottleneck and keeps the latents at ``width`` channels.
            num_freqs: forwarded to ``FourierEmbedder``.
            include_pi: forwarded to ``FourierEmbedder``.
            width: channel width shared by encoder, transformer and decoder.
            layers: forwarded to ``Transformer``.
            heads: number of attention heads, forwarded to every sub-module.
            init_scale: base scale; multiplied by ``sqrt(1 / width)`` before
                being forwarded to the sub-modules.
            qkv_bias: forwarded to every attention sub-module.
            use_ln_post: forwarded to the encoder.
            use_checkpoint: forwarded to every sub-module.
        """
        super().__init__()

        self.use_checkpoint = use_checkpoint

        self.num_latents = num_latents
        # One embedder instance is shared by the encoder and the decoder.
        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)

        init_scale = init_scale * math.sqrt(1.0 / width)
        self.encoder = CrossAttentionEncoder(
            device=device,
            dtype=dtype,
            fourier_embedder=self.fourier_embedder,
            num_latents=num_latents,
            point_feats=point_feats,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            use_ln_post=use_ln_post,
            use_checkpoint=use_checkpoint
        )

        self.embed_dim = embed_dim
        if embed_dim > 0:
            # VAE embed: pre_kl emits 2 * embed_dim channels that
            # parameterise the posterior; post_kl maps a latent back to width.
            self.pre_kl = nn.Linear(width, embed_dim * 2, device=device, dtype=dtype)
            self.post_kl = nn.Linear(embed_dim, width, device=device, dtype=dtype)
            self.latent_shape = (num_latents, embed_dim)
        else:
            # No bottleneck: pre_kl / post_kl are intentionally not created.
            self.latent_shape = (num_latents, width)

        self.transformer = Transformer(
            device=device,
            dtype=dtype,
            n_ctx=num_latents,
            width=width,
            layers=layers,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            use_checkpoint=use_checkpoint
        )

        # geometry decoder
        self.geo_decoder = CrossAttentionDecoder(
            device=device,
            dtype=dtype,
            fourier_embedder=self.fourier_embedder,
            out_channels=1,
            num_latents=num_latents,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            use_checkpoint=use_checkpoint
        )

    def encode(self,
               pc: torch.FloatTensor,
               feats: Optional[torch.FloatTensor] = None,
               sample_posterior: bool = True):
        """Encode a point cloud into latents.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]
            sample_posterior (bool): when the bottleneck is enabled, draw a
                sample from the posterior if True, otherwise use its mode.

        Returns:
            latents (torch.FloatTensor): encoder tokens, or the bottleneck
                latents when ``embed_dim > 0``.
            center_pos (torch.FloatTensor): positions returned by the encoder.
            posterior (DiagonalGaussianDistribution or None): ``None`` when
                ``embed_dim == 0``.
        """
        latents, center_pos = self.encoder(pc, feats)

        posterior = None
        if self.embed_dim > 0:
            moments = self.pre_kl(latents)
            posterior = DiagonalGaussianDistribution(moments, feat_dim=-1)

            if sample_posterior:
                latents = posterior.sample()
            else:
                latents = posterior.mode()

        return latents, center_pos, posterior

    def decode(self, latents: torch.FloatTensor):
        """Map latents from :meth:`encode` through the transformer.

        Args:
            latents (torch.FloatTensor): latents with ``latent_shape``
                trailing dimensions.

        Returns:
            The output of ``self.transformer``.
        """
        # post_kl only exists when the bottleneck is enabled; without this
        # guard the default configuration (embed_dim == 0) raised
        # AttributeError here. In that case the latents already have
        # ``width`` channels and are passed through unchanged.
        if self.embed_dim > 0:
            latents = self.post_kl(latents)
        return self.transformer(latents)

    def query_geometry(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        """Evaluate the geometry decoder at ``queries``.

        Args:
            queries (torch.FloatTensor): [B, P, 3]
            latents (torch.FloatTensor): output of :meth:`decode`.

        Returns:
            logits (torch.FloatTensor): decoder output with the trailing
                single-channel dimension squeezed away.
        """
        logits = self.geo_decoder(queries, latents).squeeze(-1)
        return logits

    def forward(self,
                pc: torch.FloatTensor,
                feats: Optional[torch.FloatTensor],
                volume_queries: torch.FloatTensor,
                sample_posterior: bool = True):
        """Encode, decode and query in one call.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]
            volume_queries (torch.FloatTensor): [B, P, 3]
            sample_posterior (bool): forwarded to :meth:`encode`.

        Returns:
            logits (torch.FloatTensor): [B, P]
            center_pos (torch.FloatTensor): [B, M, 3]
            posterior (DiagonalGaussianDistribution or None).
        """
        latents, center_pos, posterior = self.encode(pc, feats, sample_posterior=sample_posterior)

        latents = self.decode(latents)
        logits = self.query_geometry(volume_queries, latents)

        return logits, center_pos, posterior