qitaoz committed
Commit 4f54ccd · verified · 1 Parent(s): ee4a9d9

init commit

sparseags/cam_utils.py ADDED
@@ -0,0 +1,472 @@
import numpy as np
from scipy.spatial.transform import Rotation as R

# import ipdb
import math
import torch
import torch.nn.functional as F
from pytorch3d.transforms import Rotate, Translate


def intersect_skew_line_groups(p, r, mask):
    # p, r both of shape (B, N, n_intersected_lines, 3)
    # mask of shape (B, N, n_intersected_lines)
    p_intersect, r = intersect_skew_lines_high_dim(p, r, mask=mask)
    if p_intersect is None:
        return None, None, None, None
    _, p_line_intersect = point_line_distance(
        p, r, p_intersect[..., None, :].expand_as(p)
    )
    intersect_dist_squared = ((p_line_intersect - p_intersect[..., None, :]) ** 2).sum(
        dim=-1
    )
    return p_intersect, p_line_intersect, intersect_dist_squared, r


def intersect_skew_lines_high_dim(p, r, mask=None):
    # Implements https://en.wikipedia.org/wiki/Skew_lines In more than two dimensions
    dim = p.shape[-1]
    # make sure the heading vectors are l2-normed
    if mask is None:
        mask = torch.ones_like(p[..., 0])
    r = torch.nn.functional.normalize(r, dim=-1)

    eye = torch.eye(dim, device=p.device, dtype=p.dtype)[None, None]
    I_min_cov = (eye - (r[..., None] * r[..., None, :])) * mask[..., None, None]
    sum_proj = I_min_cov.matmul(p[..., None]).sum(dim=-3)

    # I_eps = torch.zeros_like(I_min_cov.sum(dim=-3)) + 1e-10
    # p_intersect = torch.pinverse(I_min_cov.sum(dim=-3) + I_eps).matmul(sum_proj)[..., 0]
    p_intersect = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0]

    # I_min_cov.sum(dim=-3): torch.Size([1, 1, 3, 3])
    # sum_proj: torch.Size([1, 1, 3, 1])

    # p_intersect = np.linalg.lstsq(I_min_cov.sum(dim=-3).numpy(), sum_proj.numpy(), rcond=None)[0]

    if torch.any(torch.isnan(p_intersect)):
        print(p_intersect)
        return None, None
        # unreachable debug leftovers (ipdb import is commented out above)
        # ipdb.set_trace()
        # assert False
    return p_intersect, r


def point_line_distance(p1, r1, p2):
    df = p2 - p1
    proj_vector = df - ((df * r1).sum(dim=-1, keepdim=True) * r1)
    line_pt_nearest = p2 - proj_vector
    d = (proj_vector).norm(dim=-1)
    return d, line_pt_nearest


def compute_optical_axis_intersection(cameras, in_ndc=True):
    centers = cameras.get_camera_center()
    principal_points = cameras.principal_point

    one_vec = torch.ones((len(cameras), 1), device=centers.device)
    optical_axis = torch.cat((principal_points, one_vec), -1)

    # optical_axis = torch.cat(
    #     (principal_points, cameras.focal_length[:, 0].unsqueeze(1)), -1
    # )

    pp = cameras.unproject_points(optical_axis, from_ndc=in_ndc, world_coordinates=True)
    pp2 = torch.diagonal(pp, dim1=0, dim2=1).T

    directions = pp2 - centers
    centers = centers.unsqueeze(0).unsqueeze(0)
    directions = directions.unsqueeze(0).unsqueeze(0)

    p_intersect, p_line_intersect, _, r = intersect_skew_line_groups(
        p=centers, r=directions, mask=None
    )

    if p_intersect is None:
        dist = None
    else:
        p_intersect = p_intersect.squeeze().unsqueeze(0)
        dist = (p_intersect - centers).norm(dim=-1)

    return p_intersect, dist, p_line_intersect, pp2, r


def normalize_cameras_with_up_axis(cameras, sequence_name, scale=1.0, in_ndc=True):
    """
    Normalizes cameras such that the optical axes point to the origin and the average
    distance to the origin is 1.

    Args:
        cameras (List[camera]).
    """

    # Let distance from first camera to origin be unit
    new_cameras = cameras.clone()
    new_transform = new_cameras.get_world_to_view_transform()

    p_intersect, dist, p_line_intersect, pp, r = compute_optical_axis_intersection(
        cameras,
        in_ndc=in_ndc
    )
    t = Translate(p_intersect)

    # scale = dist.squeeze()[0]
    scale = dist.squeeze().mean()

    # Degenerate case
    if scale == 0:
        print(cameras.T)
        print(new_transform.get_matrix()[:, 3, :3])
        return -1
    assert scale != 0

    new_transform = t.compose(new_transform)
    new_cameras.R = new_transform.get_matrix()[:, :3, :3]
    new_cameras.T = new_transform.get_matrix()[:, 3, :3] / scale * 1.85

    needs_checking = False

    # ===== Rotation normalization
    # Estimate the world 'up' direction assuming that yaw is small
    # and running SVD on the x-vectors of the cameras
    x_vectors = new_cameras.R.transpose(1, 2)[:, 0, :].clone()
    x_vectors -= x_vectors.mean(dim=0, keepdim=True)
    U, S, Vh = torch.linalg.svd(x_vectors)
    V = Vh.mH
    # vector with the smallest variation is to the normal to
    # the plane of x-vectors (assume this to be the up direction)
    if S[0] / S[1] > S[1] / S[2]:
        print('Warning: unexpected singular values in sequence {}: {}'.format(sequence_name, S))
        needs_checking = True
        # return None, None, None, None, None
    estimated_world_up = V[:, 2:]
    # check all cameras have the same y-direction
    for camera_idx in range(len(new_cameras.T)):
        if torch.sign(torch.dot(estimated_world_up[:, 0],
                                new_cameras.R[0].transpose(0, 1)[1, :])) != torch.sign(torch.dot(estimated_world_up[:, 0],
                                new_cameras.R[camera_idx].transpose(0, 1)[1, :])):
            print("Some cameras appear to be flipped in sequence {}".format(sequence_name))
            needs_checking = True
            # return None, None, None, None, None
    flip = torch.sign(torch.dot(estimated_world_up[:, 0], new_cameras.R[0].transpose(0, 1)[1, :])) < 0
    if flip:
        estimated_world_up = V[:, 2:] * -1
    # build the target coordinate basis using the estimated world up
    target_coordinate_basis = torch.cat([V[:, :1],
                                         estimated_world_up,
                                         torch.linalg.cross(V[:, :1], estimated_world_up, dim=0)],
                                        dim=1)
    new_cameras.R = torch.matmul(target_coordinate_basis.T, new_cameras.R)
    return new_cameras, p_intersect, p_line_intersect, pp, r, needs_checking


def dot(x, y):
    if isinstance(x, np.ndarray):
        return np.sum(x * y, -1, keepdims=True)
    else:
        return torch.sum(x * y, -1, keepdim=True)


def length(x, eps=1e-20):
    if isinstance(x, np.ndarray):
        return np.sqrt(np.maximum(np.sum(x * x, axis=-1, keepdims=True), eps))
    else:
        return torch.sqrt(torch.clamp(dot(x, x), min=eps))


def safe_normalize(x, eps=1e-20):
    return x / length(x, eps)


def look_at(campos, target, opengl=True):
    # campos: [N, 3], camera/eye position
    # target: [N, 3], object to look at
    # return: [N, 3, 3], rotation matrix
    if not opengl:
        # camera forward aligns with -z
        forward_vector = safe_normalize(target - campos)
        up_vector = np.array([0, 1, 0], dtype=np.float32)
        right_vector = safe_normalize(np.cross(forward_vector, up_vector))
        up_vector = safe_normalize(np.cross(right_vector, forward_vector))
    else:
        # camera forward aligns with +z
        forward_vector = safe_normalize(campos - target)
        up_vector = np.array([0, 1, 0], dtype=np.float32)
        right_vector = safe_normalize(np.cross(up_vector, forward_vector))
        up_vector = safe_normalize(np.cross(forward_vector, right_vector))
    R = np.stack([right_vector, up_vector, forward_vector], axis=1)
    return R


# elevation & azimuth to pose (cam2world) matrix
def orbit_camera(elevation, azimuth, radius=1, is_degree=True, target=None, opengl=True):
    # radius: scalar
    # elevation: scalar, in (-90, 90), from +y to -y is (-90, 90)
    # azimuth: scalar, in (-180, 180), from +z to +x is (0, 90)
    # return: [4, 4], camera pose matrix
    if is_degree:
        elevation = np.deg2rad(elevation)
        azimuth = np.deg2rad(azimuth)
    x = radius * np.cos(elevation) * np.sin(azimuth)
    y = - radius * np.sin(elevation)
    z = radius * np.cos(elevation) * np.cos(azimuth)
    if target is None:
        target = np.zeros([3], dtype=np.float32)
    campos = np.array([x, y, z]) + target  # [3]
    T = np.eye(4, dtype=np.float32)
    T[:3, :3] = look_at(campos, target, opengl)
    T[:3, 3] = campos

    return T


def mat2latlon(T):
    if not isinstance(T, np.ndarray):
        xyz = T.cpu().detach().numpy()
    else:
        xyz = T.copy()
    r = np.linalg.norm(xyz)
    xyz = xyz / r
    theta = -np.arcsin(xyz[1])
    azi = np.arctan2(xyz[0], xyz[2])
    return np.rad2deg(theta), np.rad2deg(azi), r


def extract_camera_properties(camera_to_world_matrix):
    # Camera position is the translation part of the matrix
    camera_position = camera_to_world_matrix[:3, 3]

    # Extracting the forward direction vector (third column of rotation matrix)
    forward = camera_to_world_matrix[:3, 2]

    return camera_position, forward


def compute_angular_error_batch(rotation1, rotation2):
    R_rel = np.einsum("Bij,Bjk ->Bik", rotation1.transpose(0, 2, 1), rotation2)
    t = (np.trace(R_rel, axis1=1, axis2=2) - 1) / 2
    theta = np.arccos(np.clip(t, -1, 1))
    return theta * 180 / np.pi


def find_mask_center_and_translate(image, mask):
    """
    Calculate the center of mass of the mask and return its offset
    (delta_x, delta_y) from the image center.

    Args:
    - image (torch.Tensor): Input image tensor of shape (N, C, H, W)
    - mask (torch.Tensor): Mask tensor of shape (N, 1, H, W)

    Returns:
    - torch.Tensor of shape (2,) holding the (delta_x, delta_y) offset in pixels
    """
    _, _, h, w = image.shape

    # Calculate the center of mass of the mask
    # Note: mask should be a binary mask of the same spatial dimensions as the image
    y_coords, x_coords = torch.meshgrid(torch.arange(0, h), torch.arange(0, w), indexing='ij')
    total_mass = mask.sum(dim=[2, 3], keepdim=True)
    x_center = (mask * x_coords.to(image.device)).sum(dim=[2, 3], keepdim=True) / total_mass
    y_center = (mask * y_coords.to(image.device)).sum(dim=[2, 3], keepdim=True) / total_mass

    # Calculate the translation needed to move the mask center to the image center
    image_center_x, image_center_y = w // 2, h // 2
    delta_x = x_center.squeeze() - image_center_x
    delta_y = y_center.squeeze() - image_center_y

    return torch.tensor([delta_x, delta_y])


def create_voxel_grid(length, resolution=64):
    """
    Creates a voxel grid.
    length: half-extent of the grid; each axis spans [-length, length].
    resolution: The number of divisions along each axis.
    Returns a 4D tensor of voxel-center positions in homogeneous coordinates,
    of shape (resolution, resolution, resolution, 4).
    """
    x = torch.linspace(-length, length, resolution)
    y = torch.linspace(-length, length, resolution)
    z = torch.linspace(-length, length, resolution)

    xx, yy, zz = torch.meshgrid(x, y, z, indexing='ij')
    voxels = torch.stack([xx, yy, zz, torch.ones_like(xx)], dim=-1)  # Homogeneous coordinates
    return voxels


def project_voxels_to_image(voxels, camera):
    """
    Projects voxel centers into the camera's image plane.
    voxels: 4D tensor of voxel grid in homogeneous coordinates.
    camera: camera object providing projection_matrix and world_view_transform.
    Returns a tensor of projected 2D points in image coordinates.
    """
    device = voxels.device
    # K, R, t = torch.tensor(K, device=device), torch.tensor(R, device=device), torch.tensor(t, device=device)

    # Flatten voxels to shape (N, 4) for matrix multiplication
    N = voxels.nelement() // 4  # Total number of voxels
    voxels_flat = voxels.reshape(-1, 4).t()  # Shape (4, N)

    # # Apply extrinsic parameters (rotation and translation)
    # transformed_voxels = R @ voxels_flat[:3, :] + t[:, None]

    # # Apply intrinsic parameters
    # projected_voxels = K @ transformed_voxels

    projected_voxels = camera.projection_matrix.transpose(0, 1) @ camera.world_view_transform.transpose(0, 1) @ voxels_flat

    # Convert from homogeneous coordinates to 2D
    projected_voxels_2d = (projected_voxels[:2, :] / projected_voxels[3, :]).t()  # Reshape to grid dimensions with 2D points
    projected_voxels_2d = (projected_voxels_2d.reshape(*voxels.shape[:-1], 2) + 1.) * 255 * 0.5

    return projected_voxels_2d


def carve_voxels(voxel_grid, projected_points, mask):
    """
    Updates the voxel grid based on the comparison with the mask.
    voxel_grid: 3D tensor representing the voxel grid.
    projected_points: Projected 2D points in image coordinates.
    mask: Binary mask image.
    """
    # Convert projected points to indices in the mask
    indices_x = torch.clamp(projected_points[..., 0], 0, mask.shape[1] - 1).long()
    indices_y = torch.clamp(projected_points[..., 1], 0, mask.shape[0] - 1).long()

    # Check if projected points are within the object in the mask
    in_object = mask[indices_y, indices_x]

    # Carve out voxels where the projection does not fall within the object
    voxel_grid[in_object == 0] = 0


def sample_points_from_voxel(cameras, masks, length=1, resolution=64, N=5000, inverse=False, device="cuda"):
    """
    Randomly sample N points from solid regions in a voxel grid.

    Args:
    - cameras: cameras used to carve the visual hull.
    - masks (np.ndarray): binary object masks, one per camera.
    - length (float): half-extent of the voxel grid along each axis.
    - resolution (int): number of voxels along each axis.
    - N (int): The number of points to sample.
    - inverse (bool): if True, sample from carved (empty) voxels instead of solid ones.

    Returns:
    - sampled_points (torch.Tensor): A tensor of shape (N, 3) representing the sampled 3D coordinates.
    """
    voxel_grid = create_voxel_grid(length, resolution).to(device)
    voxel_grid_indicator = torch.ones(resolution, resolution, resolution)

    masks = torch.from_numpy(masks).to(device).squeeze()

    for idx, cam in enumerate(cameras):
        projected_points = project_voxels_to_image(voxel_grid, cam)
        carve_voxels(voxel_grid_indicator, projected_points, masks[idx])

    voxel_grid_indicator = voxel_grid_indicator.reshape(resolution, resolution, resolution)

    # Identify the indices of solid voxels
    if inverse:
        solid_indices = torch.nonzero(voxel_grid_indicator == 0)
    else:
        solid_indices = torch.nonzero(voxel_grid_indicator == 1)

    # Randomly select N indices from the solid indices
    if N <= solid_indices.size(0):
        # Randomly select N indices from the solid indices if there are enough solid voxels
        sampled_indices = solid_indices[torch.randperm(solid_indices.size(0))[:N]]
    else:
        # If there are not enough solid voxels, sample with replacement
        sampled_indices = solid_indices[torch.randint(0, solid_indices.size(0), (N,))]

    # Convert indices to coordinates
    # Note: This step assumes the voxel grid spans from 0 to 1 in each dimension.
    # Adjust accordingly if your grid spans a different range.
    sampled_points = sampled_indices.float() / (voxel_grid.size(0) - 1) * 2 * length - length

    return sampled_points


class OrbitCamera:
    def __init__(self, W, H, r=2, fovy=60, near=0.01, far=100):
        self.W = W
        self.H = H
        self.radius = r  # camera distance from center
        self.fovy = np.deg2rad(fovy)  # deg 2 rad
        self.near = near
        self.far = far
        self.center = np.array([0, 0, 0], dtype=np.float32)  # look at this point
        self.rot = R.from_matrix(np.eye(3))
        self.up = np.array([0, 1, 0], dtype=np.float32)  # need to be normalized!

    @property
    def fovx(self):
        return 2 * np.arctan(np.tan(self.fovy / 2) * self.W / self.H)

    @property
    def campos(self):
        return self.pose[:3, 3]

    # pose (c2w)
    @property
    def pose(self):
        # first move camera to radius
        res = np.eye(4, dtype=np.float32)
        res[2, 3] = self.radius  # opengl convention...
        # rotate
        rot = np.eye(4, dtype=np.float32)
        rot[:3, :3] = self.rot.as_matrix()
        res = rot @ res
        # translate
        res[:3, 3] -= self.center
        return res

    # view (w2c)
    @property
    def view(self):
        return np.linalg.inv(self.pose)

    # projection (perspective)
    @property
    def perspective(self):
        y = np.tan(self.fovy / 2)
        aspect = self.W / self.H
        return np.array(
            [
                [1 / (y * aspect), 0, 0, 0],
                [0, -1 / y, 0, 0],
                [
                    0,
                    0,
                    -(self.far + self.near) / (self.far - self.near),
                    -(2 * self.far * self.near) / (self.far - self.near),
                ],
                [0, 0, -1, 0],
            ],
            dtype=np.float32,
        )

    # intrinsics
    @property
    def intrinsics(self):
        focal = self.H / (2 * np.tan(self.fovy / 2))
        return np.array([focal, focal, self.W // 2, self.H // 2], dtype=np.float32)

    @property
    def mvp(self):
        return self.perspective @ np.linalg.inv(self.pose)  # [4, 4]

    def orbit(self, dx, dy):
        # rotate along camera up/side axis!
        side = self.rot.as_matrix()[:3, 0]
        rotvec_x = self.up * np.radians(-0.05 * dx)
        rotvec_y = side * np.radians(-0.05 * dy)
        self.rot = R.from_rotvec(rotvec_x) * R.from_rotvec(rotvec_y) * self.rot

    def scale(self, delta):
        self.radius *= 1.1 ** (-delta)

    def pan(self, dx, dy, dz=0):
        # pan in camera coordinate system (careful on the sensitivity!)
        self.center += 0.0005 * self.rot.as_matrix()[:3, :3] @ np.array([-dx, -dy, dz])
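A minimal usage sketch of the camera helpers above, assuming the package is importable as sparseags.cam_utils (as the file path suggests) and with illustrative argument values:

from sparseags.cam_utils import orbit_camera, OrbitCamera

# [4, 4] cam2world pose (OpenGL convention) for the given elevation/azimuth/radius
pose = orbit_camera(elevation=-30, azimuth=45, radius=2.0)
print(pose[:3, 3])  # camera position on the orbit

# interactive-style orbit camera: pose, projection and pinhole intrinsics
cam = OrbitCamera(W=512, H=512, r=2.0, fovy=49.1)
cam.orbit(dx=100, dy=0)   # drag horizontally -> rotate about the up axis
mvp = cam.mvp             # perspective @ world-to-camera, [4, 4]
print(cam.intrinsics)     # [fx, fy, cx, cy]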
sparseags/dust3r_utils.py ADDED
@@ -0,0 +1,66 @@
import torch
from pytorch3d.renderer import PerspectiveCameras

import sys
sys.path.append('./')
from sparseags.cam_utils import normalize_cameras_with_up_axis

sys.path[0] = sys.path[0] + '/dust3r'
from dust3r.inference import inference
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode


def infer_dust3r(dust3r_model, file_names, device='cuda'):
    batch_size = 1
    schedule = 'cosine'
    lr = 0.01
    niter = 300

    images = load_images(file_names, size=224)
    pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True)
    output = inference(pairs, dust3r_model, device, batch_size=batch_size)

    scene = global_aligner(output, optimize_pp=True, device=device, mode=GlobalAlignerMode.PointCloudOptimizer)
    loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)

    # retrieve useful values from scene:
    imgs = scene.imgs
    cams2world = scene.get_im_poses()
    w2c = torch.linalg.inv(cams2world)
    pps = scene.get_principal_points() * 256 / 224
    focals = scene.get_focals() * 256 / 224

    w2c[:, :2] *= -1  # OpenCV to PyTorch3D
    Rs = w2c[:, :3, :3].transpose(1, 2)
    Ts = w2c[:, :3, 3]

    cameras = PerspectiveCameras(
        focal_length=focals,
        principal_point=pps,
        in_ndc=False,
        R=Rs,
        T=Ts,
    )
    normalized_cameras, _, _, _, _, needs_checking = normalize_cameras_with_up_axis(cameras, None, in_ndc=False)

    if normalized_cameras is None:
        print("It seems something went wrong during camera normalization...")
        return 0

    data = {}
    base_names = [file_name.split('/')[-1].split('.')[0] for file_name in file_names]
    file_names = [file_name.replace('source', 'processed').replace('.png', '_rgba.png') for file_name in file_names]

    for idx, base_name in enumerate(base_names):
        data[base_name] = {}
        data[base_name]["R"] = normalized_cameras.R[idx].cpu().tolist()
        data[base_name]["T"] = normalized_cameras.T[idx].cpu().tolist()
        data[base_name]["needs_checking"] = needs_checking
        data[base_name]["principal_point"] = normalized_cameras.principal_point[idx].cpu().tolist()
        data[base_name]["focal_length"] = normalized_cameras.focal_length[idx].cpu().tolist()
        data[base_name]["flag"] = 1
        data[base_name]["filepath"] = file_names[idx]

    return data
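A minimal sketch of consuming the per-view dictionary returned by infer_dust3r above; the dust3r model handle and the list of image paths are assumed to already exist:

import torch
from pytorch3d.renderer import PerspectiveCameras

data = infer_dust3r(dust3r_model, file_names)  # dust3r_model, file_names: assumed inputs
views = [v for v in data.values() if v["flag"] == 1]
cameras = PerspectiveCameras(
    R=torch.tensor([v["R"] for v in views]),                              # (N, 3, 3)
    T=torch.tensor([v["T"] for v in views]),                              # (N, 3)
    focal_length=torch.tensor([v["focal_length"] for v in views]),        # (N, 1)
    principal_point=torch.tensor([v["principal_point"] for v in views]),  # (N, 2)
    in_ndc=False,
)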
sparseags/guidance_utils/zero123.py ADDED
@@ -0,0 +1,666 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import math
import warnings
from typing import Any, Callable, Dict, List, Optional, Union

import PIL
import torch
import torchvision.transforms.functional as TF
from diffusers.configuration_utils import ConfigMixin, FrozenDict, register_to_config
from diffusers.image_processor import VaeImageProcessor
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.modeling_utils import ModelMixin
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import (
    StableDiffusionSafetyChecker,
)
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import deprecate, is_accelerate_available, logging
from diffusers.utils.torch_utils import randn_tensor
from packaging import version
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class CLIPCameraProjection(ModelMixin, ConfigMixin):
    """
    A Projection layer for CLIP embedding and camera embedding.

    Parameters:
        embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `clip_embed`
        additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the
            projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
            additional_embeddings`.
    """

    @register_to_config
    def __init__(self, embedding_dim: int = 768, additional_embeddings: int = 4):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.additional_embeddings = additional_embeddings

        self.input_dim = self.embedding_dim + self.additional_embeddings
        self.output_dim = self.embedding_dim

        self.proj = torch.nn.Linear(self.input_dim, self.output_dim)

    def forward(
        self,
        embedding: torch.FloatTensor,
    ):
        """
        The [`PriorTransformer`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, input_dim)`):
                The currently input embeddings.

        Returns:
            The output embedding projection (`torch.FloatTensor` of shape `(batch_size, output_dim)`).
        """
        proj_embedding = self.proj(embedding)
        return proj_embedding


class Zero123Pipeline(DiffusionPipeline):
    r"""
    Pipeline to generate variations from an input image using Stable Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """
    # TODO: feature_extractor is required to encode images (if they are in PIL format),
    # we should give a descriptive message if the pipeline doesn't have one.
    _optional_components = ["safety_checker"]

    def __init__(
        self,
        vae: AutoencoderKL,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        clip_camera_projection: CLIPCameraProjection,
        requires_safety_checker: bool = True,
    ):
        super().__init__()

        if safety_checker is None and requires_safety_checker:
            logger.warn(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
            )

        if safety_checker is not None and feature_extractor is None:
            raise ValueError(
                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
            )

        is_unet_version_less_0_9_0 = hasattr(
            unet.config, "_diffusers_version"
        ) and version.parse(
            version.parse(unet.config._diffusers_version).base_version
        ) < version.parse(
            "0.9.0.dev0"
        )
        is_unet_sample_size_less_64 = (
            hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
        )
        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
            deprecation_message = (
                "The configuration file of the unet has set the default `sample_size` to smaller than"
                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
                " in the config might lead to incorrect results in future versions. If you have downloaded this"
                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
                " the `unet/config.json` file"
            )
            deprecate(
                "sample_size<64", "1.0.0", deprecation_message, standard_warn=False
            )
            new_config = dict(unet.config)
            new_config["sample_size"] = 64
            unet._internal_dict = FrozenDict(new_config)

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            clip_camera_projection=clip_camera_projection,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.register_to_config(requires_safety_checker=requires_safety_checker)

    def enable_sequential_cpu_offload(self, gpu_id=0):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
        """
        if is_accelerate_available():
            from accelerate import cpu_offload
        else:
            raise ImportError("Please install accelerate via `pip install accelerate`")

        device = torch.device(f"cuda:{gpu_id}")

        for cpu_offloaded_model in [
            self.unet,
            self.image_encoder,
            self.vae,
            self.safety_checker,
        ]:
            if cpu_offloaded_model is not None:
                cpu_offload(cpu_offloaded_model, device)

    @property
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
    def _execution_device(self):
        r"""
        Returns the device on which the pipeline's models will be executed. After calling
        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
        hooks.
        """
        if not hasattr(self.unet, "_hf_hook"):
            return self.device
        for module in self.unet.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)
        return self.device

    def _encode_image(
        self,
        image,
        elevation,
        azimuth,
        distance,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        clip_image_embeddings=None,
        image_camera_embeddings=None,
    ):
        dtype = next(self.image_encoder.parameters()).dtype

        if image_camera_embeddings is None:
            if image is None:
                assert clip_image_embeddings is not None
                image_embeddings = clip_image_embeddings.to(device=device, dtype=dtype)
            else:
                if not isinstance(image, torch.Tensor):
                    image = self.feature_extractor(
                        images=image, return_tensors="pt"
                    ).pixel_values

                image = image.to(device=device, dtype=dtype)
                image_embeddings = self.image_encoder(image).image_embeds
            image_embeddings = image_embeddings.unsqueeze(1)

            bs_embed, seq_len, _ = image_embeddings.shape

            if isinstance(elevation, float):
                elevation = torch.as_tensor(
                    [elevation] * bs_embed, dtype=dtype, device=device
                )
            if isinstance(azimuth, float):
                azimuth = torch.as_tensor(
                    [azimuth] * bs_embed, dtype=dtype, device=device
                )
            if isinstance(distance, float):
                distance = torch.as_tensor(
                    [distance] * bs_embed, dtype=dtype, device=device
                )

            camera_embeddings = torch.stack(
                [
                    torch.deg2rad(elevation),
                    torch.sin(torch.deg2rad(azimuth)),
                    torch.cos(torch.deg2rad(azimuth)),
                    distance,
                ],
                dim=-1,
            )[:, None, :]

            image_embeddings = torch.cat([image_embeddings, camera_embeddings], dim=-1)

            # project (image, camera) embeddings to the same dimension as clip embeddings
            image_embeddings = self.clip_camera_projection(image_embeddings)
        else:
            image_embeddings = image_camera_embeddings.to(device=device, dtype=dtype)
            bs_embed, seq_len, _ = image_embeddings.shape

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
        image_embeddings = image_embeddings.view(
            bs_embed * num_images_per_prompt, seq_len, -1
        )

        if do_classifier_free_guidance:
            negative_prompt_embeds = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])

        return image_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
            has_nsfw_concept = None
        else:
            if torch.is_tensor(image):
                feature_extractor_input = self.image_processor.postprocess(
                    image, output_type="pil"
                )
            else:
                feature_extractor_input = self.image_processor.numpy_to_pil(image)
            safety_checker_input = self.feature_extractor(
                feature_extractor_input, return_tensors="pt"
            ).to(device)
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
            )
        return image, has_nsfw_concept

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
    def decode_latents(self, latents):
        warnings.warn(
            "The decode_latents method is deprecated and will be removed in a future version. Please"
            " use VaeImageProcessor instead",
            FutureWarning,
        )
        latents = 1 / self.vae.config.scaling_factor * latents
        image = self.vae.decode(latents, return_dict=False)[0]
        image = (image / 2 + 0.5).clamp(0, 1)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
        return image

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(
            inspect.signature(self.scheduler.step).parameters.keys()
        )
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(
            inspect.signature(self.scheduler.step).parameters.keys()
        )
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(self, image, height, width, callback_steps):
        # TODO: check image size or adjust image size to (height, width)

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(
                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
            )

        if (callback_steps is None) or (
            callback_steps is not None
            and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        shape = (
            batch_size,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(
                shape, generator=generator, device=device, dtype=dtype
            )
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def _get_latent_model_input(
        self,
        latents: torch.FloatTensor,
        image: Optional[
            Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]
        ],
        num_images_per_prompt: int,
        do_classifier_free_guidance: bool,
        image_latents: Optional[torch.FloatTensor] = None,
    ):
        if isinstance(image, PIL.Image.Image):
            image_pt = TF.to_tensor(image).unsqueeze(0).to(latents)
        elif isinstance(image, list):
            image_pt = torch.stack([TF.to_tensor(img) for img in image], dim=0).to(
                latents
            )
        elif isinstance(image, torch.Tensor):
            image_pt = image
        else:
            image_pt = None

        if image_pt is None:
            assert image_latents is not None
            image_pt = image_latents.repeat_interleave(num_images_per_prompt, dim=0)
        else:
            image_pt = image_pt * 2.0 - 1.0  # scale to [-1, 1]
            # FIXME: encoded latents should be multiplied with self.vae.config.scaling_factor
            # but zero123 was not trained this way
            image_pt = self.vae.encode(image_pt).latent_dist.mode()
            image_pt = image_pt.repeat_interleave(num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            latent_model_input = torch.cat(
                [
                    torch.cat([latents, latents], dim=0),
                    torch.cat([torch.zeros_like(image_pt), image_pt], dim=0),
                ],
                dim=1,
            )
        else:
            latent_model_input = torch.cat([latents, image_pt], dim=1)

        return latent_model_input

    @torch.no_grad()
    def __call__(
        self,
        image: Optional[
            Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]
        ] = None,
        elevation: Optional[Union[float, torch.FloatTensor]] = None,
        azimuth: Optional[Union[float, torch.FloatTensor]] = None,
        distance: Optional[Union[float, torch.FloatTensor]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 3.0,
        num_images_per_prompt: int = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        clip_image_embeddings: Optional[torch.FloatTensor] = None,
        image_camera_embeddings: Optional[torch.FloatTensor] = None,
        image_latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
                The image or images to guide the image generation. If you provide a tensor, it needs to comply with the
                configuration of
                [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
                `CLIPImageProcessor`
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 3.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        # TODO: check input elevation, azimuth, and distance
        # TODO: check image, clip_image_embeddings, image_latents
        self.check_inputs(image, height, width, callback_steps)

        # 2. Define call parameters
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        elif isinstance(image, torch.Tensor):
            batch_size = image.shape[0]
        else:
            assert image_latents is not None
            assert (
                clip_image_embeddings is not None or image_camera_embeddings is not None
            )
            batch_size = image_latents.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input image
        if isinstance(image, PIL.Image.Image) or isinstance(image, list):
            pil_image = image
        elif isinstance(image, torch.Tensor):
            pil_image = [TF.to_pil_image(image[i]) for i in range(image.shape[0])]
        else:
            pil_image = None
        image_embeddings = self._encode_image(
            pil_image,
            elevation,
            azimuth,
            distance,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            clip_image_embeddings,
            image_camera_embeddings,
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        # num_channels_latents = self.unet.config.in_channels
        num_channels_latents = 4  # FIXME: hard-coded
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = self._get_latent_model_input(
                    latents,
                    image,
                    num_images_per_prompt,
                    do_classifier_free_guidance,
                    image_latents,
                )
                latent_model_input = self.scheduler.scale_model_input(
                    latent_model_input, t
                )

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=image_embeddings,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (
                        noise_pred_text - noise_pred_uncond
                    )

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(
                    noise_pred, t, latents, **extra_step_kwargs
                ).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or (
                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
                ):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if not output_type == "latent":
            image = self.vae.decode(
                latents / self.vae.config.scaling_factor, return_dict=False
            )[0]
            image, has_nsfw_concept = self.run_safety_checker(
                image, device, image_embeddings.dtype
            )
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(
            images=image, nsfw_content_detected=has_nsfw_concept
        )
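A minimal sketch of calling the pipeline above for novel-view synthesis; the checkpoint id mirrors the one used in zero123_6d_utils.py below, and the input/output file paths are placeholders:

import torch
from PIL import Image
from sparseags.guidance_utils.zero123 import Zero123Pipeline

pipe = Zero123Pipeline.from_pretrained(
    "ashawkey/zero123-xl-diffusers",  # same checkpoint id as in zero123_6d_utils.py
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda")

cond = Image.open("cond_view.png").convert("RGB").resize((256, 256))  # placeholder path
result = pipe(
    image=cond,
    elevation=0.0,   # relative elevation in degrees
    azimuth=30.0,    # relative azimuth in degrees
    distance=0.0,    # relative radius
    num_inference_steps=50,
    guidance_scale=3.0,
)
result.images[0].save("novel_view.png")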
sparseags/guidance_utils/zero123_6d_utils.py ADDED
@@ -0,0 +1,389 @@
1
+ from diffusers import DDIMScheduler
2
+ import torchvision.transforms.functional as TF
3
+
4
+ import numpy as np
5
+ from PIL import Image
6
+ import matplotlib.pyplot as plt
7
+ import torch
8
+ import torch.nn as nn
9
+ import torchvision
10
+ from torchvision.utils import save_image
11
+ from torchvision import transforms
12
+ import torch.nn.functional as F
13
+ from einops import rearrange
14
+
15
+ import sys
16
+ sys.path.append('./')
17
+
18
+ from sparseags.guidance_utils.zero123 import Zero123Pipeline
19
+
20
+
21
+ name_mapping = {
22
+ "model.diffusion_model.input_blocks.1.1.": "down_blocks.0.attentions.0.",
23
+ "model.diffusion_model.input_blocks.2.1.": "down_blocks.0.attentions.1.",
24
+ "model.diffusion_model.input_blocks.4.1.": "down_blocks.1.attentions.0.",
25
+ "model.diffusion_model.input_blocks.5.1.": "down_blocks.1.attentions.1.",
26
+ "model.diffusion_model.input_blocks.7.1.": "down_blocks.2.attentions.0.",
27
+ "model.diffusion_model.input_blocks.8.1.": "down_blocks.2.attentions.1.",
28
+ "model.diffusion_model.middle_block.1.": "mid_block.attentions.0.",
29
+ "model.diffusion_model.output_blocks.3.1.": "up_blocks.1.attentions.0.",
30
+ "model.diffusion_model.output_blocks.4.1.": "up_blocks.1.attentions.1.",
31
+ "model.diffusion_model.output_blocks.5.1.": "up_blocks.1.attentions.2.",
32
+ "model.diffusion_model.output_blocks.6.1.": "up_blocks.2.attentions.0.",
33
+ "model.diffusion_model.output_blocks.7.1.": "up_blocks.2.attentions.1.",
34
+ "model.diffusion_model.output_blocks.8.1.": "up_blocks.2.attentions.2.",
35
+ "model.diffusion_model.output_blocks.9.1.": "up_blocks.3.attentions.0.",
36
+ "model.diffusion_model.output_blocks.10.1.": "up_blocks.3.attentions.1.",
37
+ "model.diffusion_model.output_blocks.11.1.": "up_blocks.3.attentions.2.",
38
+ }
39
+
40
+ class Zero123(nn.Module):
41
+ def __init__(self, device, fp16=True, t_range=[0.02, 0.98], model_key="ashawkey/zero123-xl-diffusers"):
42
+ super().__init__()
43
+
44
+ self.device = device
45
+ self.fp16 = fp16
46
+ self.dtype = torch.float16 if fp16 else torch.float32
47
+
48
+ self.pipe = Zero123Pipeline.from_pretrained(
49
+ model_key,
50
+ trust_remote_code=True,
51
+ torch_dtype=self.dtype,
52
+ ).to(self.device)
53
+
54
+ # load weights from the checkpoint
55
+ ckpt_path = "checkpoints/zero123_6dof_23k.ckpt"
56
+ print(f'[INFO] loading checkpoint from {ckpt_path} ...')
57
+ old_state = torch.load(ckpt_path)
58
+ pretrained_weights = old_state['state_dict']['cc_projection.weight']
59
+ pretrained_biases = old_state['state_dict']['cc_projection.bias']
60
+ linear_layer = torch.nn.Linear(768 + 18, 768)
61
+ linear_layer.weight.data = pretrained_weights
62
+ linear_layer.bias.data = pretrained_biases
63
+ self.pipe.clip_camera_projection.proj = linear_layer.to(dtype=self.dtype, device=self.device)
64
+
65
+ for name in list(old_state['state_dict'].keys()):
66
+ for k, v in name_mapping.items():
67
+ if k in name:
68
+ old_state['state_dict'][name.replace(k, name_mapping[k])] = old_state['state_dict'][name].to(dtype=self.dtype, device=self.device)
69
+
70
+ m, u = self.pipe.unet.load_state_dict(old_state['state_dict'], strict=False)
71
+
72
+ # stable-zero123 has a different camera embedding
73
+ self.use_stable_zero123 = 'stable' in model_key
74
+
75
+ self.pipe.image_encoder.eval()
76
+ self.pipe.vae.eval()
77
+ self.pipe.unet.eval()
78
+ self.pipe.clip_camera_projection.eval()
79
+
80
+ self.vae = self.pipe.vae
81
+ self.unet = self.pipe.unet
82
+
83
+ self.pipe.set_progress_bar_config(disable=True)
84
+
85
+ self.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
86
+ self.num_train_timesteps = self.scheduler.config.num_train_timesteps
87
+
88
+ self.min_step = int(self.num_train_timesteps * t_range[0])
89
+ self.max_step = int(self.num_train_timesteps * t_range[1])
90
+ self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience
91
+
92
+ self.embeddings = None
93
+
94
+ @torch.no_grad()
95
+ def get_img_embeds(self, x):
96
+ # x: image tensor in [0, 1]
97
+ x = F.interpolate(x, (256, 256), mode='bilinear', align_corners=False)
98
+ x_pil = [TF.to_pil_image(image) for image in x]
99
+ x_clip = self.pipe.feature_extractor(images=x_pil, return_tensors="pt").pixel_values.to(device=self.device, dtype=self.dtype)
100
+ c = self.pipe.image_encoder(x_clip).image_embeds
101
+ v = self.encode_imgs(x.to(self.dtype)) / self.vae.config.scaling_factor
102
+ self.embeddings = [c, v]
103
+
104
+ def get_cam_embeddings(self, polar, azimuth, radius, default_elevation=0):
105
+ if self.use_stable_zero123:
106
+ T = np.stack([np.deg2rad(polar), np.sin(np.deg2rad(azimuth)), np.cos(np.deg2rad(azimuth)), np.deg2rad([90 + default_elevation] * len(polar))], axis=-1)
107
+ else:
108
+ # original zero123 camera embedding
109
+ T = np.stack([np.deg2rad(polar), np.sin(np.deg2rad(azimuth)), np.cos(np.deg2rad(azimuth)), radius], axis=-1)
110
+ T = torch.from_numpy(T).unsqueeze(1).to(dtype=self.dtype, device=self.device) # [8, 1, 4]
111
+ return T
112
+
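+ # A minimal sketch (an assumption for illustration, not part of the pipeline) of how the
+ # (polar, azimuth, radius) deltas fed to get_cam_embeddings could be formed from a target
+ # view and the reference view, in degrees:
+ #   d_polar   = target_polar - ref_polar
+ #   d_azimuth = target_azimuth - ref_azimuth
+ #   d_radius  = target_radius - ref_radius
+ #   T = self.get_cam_embeddings([d_polar], [d_azimuth], [d_radius])  # [1, 1, 4]
+ # For the original zero123 branch the embedding packs
+ # [deg2rad(d_polar), sin(deg2rad(d_azimuth)), cos(deg2rad(d_azimuth)), d_radius].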
113
+ def get_cam_embeddings_6D(self, target_RT, cond_RT):
114
+ T_target = torch.from_numpy(target_RT["c2w"])
115
+ focal_len_target = torch.from_numpy(target_RT["focal_length"])
116
+
117
+ T_cond = torch.from_numpy(cond_RT["c2w"])
118
+ focal_len_cond = torch.from_numpy(cond_RT["focal_length"])
119
+
120
+ focal_len = focal_len_target / focal_len_cond
121
+
122
+ d_T = torch.linalg.inv(T_target) @ T_cond
123
+ d_T = torch.cat([d_T.flatten(), torch.log(focal_len)])
124
+ return d_T.unsqueeze(0).unsqueeze(0).to(dtype=self.dtype, device=self.device)
125
+
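+ # The relative 6-DoF embedding above is 16 values (flattened 4x4 inv(T_target) @ T_cond)
+ # plus 2 log focal-length ratios = 18, matching the cc_projection input size of 768 + 18.
+ # Hedged usage sketch (dict layout mirrors get_T_6d below; names are illustrative):
+ #   target_RT = {"c2w": c2w_target_4x4, "focal_length": np.array([fx_t, fy_t])}
+ #   cond_RT   = {"c2w": c2w_cond_4x4,   "focal_length": np.array([fx_c, fy_c])}
+ #   T = self.get_cam_embeddings_6D(target_RT, cond_RT)  # [1, 1, 18]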
126
+ @torch.no_grad()
127
+ def refine(self, pred_rgb, cam_embed,
128
+ guidance_scale=5, steps=50, strength=0.8, idx=None
129
+ ):
130
+
131
+ ######## Slight modification: infer the batch size from pred_rgb when it is provided ########
132
+ if pred_rgb is not None:
133
+ batch_size = pred_rgb.shape[0]
134
+ else:
135
+ batch_size = 1
136
+
137
+ self.scheduler.set_timesteps(steps)
138
+
139
+ if strength == 0:
140
+ init_step = 0
141
+ latents = torch.randn((1, 4, 32, 32), device=self.device, dtype=self.dtype)
142
+ else:
143
+ init_step = int(steps * strength)
144
+ pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
145
+ latents = self.encode_imgs(pred_rgb_256.to(self.dtype))
146
+ latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step])
147
+
148
+ T = cam_embed
149
+ if idx is not None:
150
+ cc_emb = torch.cat([self.embeddings[0][idx].repeat(batch_size, 1, 1), T], dim=-1)
151
+ else:
152
+ cc_emb = torch.cat([self.embeddings[0].repeat(batch_size, 1, 1), T], dim=-1)
153
+ cc_emb = self.pipe.clip_camera_projection(cc_emb)
154
+ cc_emb = torch.cat([cc_emb, torch.zeros_like(cc_emb)], dim=0)
155
+
156
+ if idx is not None:
157
+ vae_emb = self.embeddings[1][idx].repeat(batch_size, 1, 1, 1)
158
+ else:
159
+ vae_emb = self.embeddings[1].repeat(batch_size, 1, 1, 1)
160
+ vae_emb = torch.cat([vae_emb, torch.zeros_like(vae_emb)], dim=0)
161
+
162
+ for i, t in enumerate(self.scheduler.timesteps[init_step:]):
163
+
164
+ x_in = torch.cat([latents] * 2)
165
+ t_in = torch.cat([t.view(1)]).to(self.device)
166
+
167
+ noise_pred = self.unet(
168
+ torch.cat([x_in, vae_emb], dim=1),
169
+ t_in.to(self.unet.dtype),
170
+ encoder_hidden_states=cc_emb,
171
+ ).sample
172
+
173
+ noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
174
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
175
+
176
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
177
+
178
+ imgs = self.decode_latents(latents) # [1, 3, 256, 256]
179
+ return imgs
180
+
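+ # refine() above is an SDEdit-style partial re-denoising: the render is encoded to latents,
+ # noised to timestep index int(steps * strength), then denoised with classifier-free
+ # guidance conditioned on the selected reference view. Hedged usage sketch (names illustrative):
+ #   cam_embed = self.get_cam_embeddings_6D(target_RT, cond_RT)
+ #   refined = self.refine(render_256, cam_embed, guidance_scale=5, strength=0.8, idx=view_idx)
+ #   # refined: [B, 3, 256, 256] in [0, 1]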
181
+ def train_step(self, pred_rgb, polar, azimuth, radius, step_ratio=None, guidance_scale=5, as_latent=False):
182
+ # pred_rgb: tensor [1, 3, H, W] in [0, 1]
183
+
184
+ batch_size = pred_rgb.shape[0]
185
+
186
+ if as_latent:
187
+ latents = F.interpolate(pred_rgb, (32, 32), mode='bilinear', align_corners=False) * 2 - 1
188
+ else:
189
+ pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
190
+ latents = self.encode_imgs(pred_rgb_256.to(self.dtype))
191
+
192
+ if step_ratio is not None:
193
+ # dreamtime-like
194
+ # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio)
195
+ t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step)
196
+ t = torch.full((batch_size,), t, dtype=torch.long, device=self.device)
197
+ else:
198
+ t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)
199
+
200
+ w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1)
201
+
202
+ with torch.no_grad():
203
+ noise = torch.randn_like(latents)
204
+ latents_noisy = self.scheduler.add_noise(latents, noise, t)
205
+
206
+ x_in = torch.cat([latents_noisy] * 2)
207
+ t_in = torch.cat([t] * 2)
208
+
209
+ T = self.get_cam_embeddings(polar, azimuth, radius)
210
+ cc_emb = torch.cat([self.embeddings[0].repeat(batch_size, 1, 1), T], dim=-1)
211
+ cc_emb = self.pipe.clip_camera_projection(cc_emb)
212
+ cc_emb = torch.cat([cc_emb, torch.zeros_like(cc_emb)], dim=0)
213
+
214
+ vae_emb = self.embeddings[1].repeat(batch_size, 1, 1, 1)
215
+ vae_emb = torch.cat([vae_emb, torch.zeros_like(vae_emb)], dim=0)
216
+
217
+ noise_pred = self.unet(
218
+ torch.cat([x_in, vae_emb], dim=1),
219
+ t_in.to(self.unet.dtype),
220
+ encoder_hidden_states=cc_emb,
221
+ ).sample
222
+
223
+ noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
224
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
225
+
226
+ grad = w * (noise_pred - noise)
227
+ grad = torch.nan_to_num(grad)
228
+
229
+ target = (latents - grad).detach()
230
+ loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum')
231
+
232
+ return loss
233
+
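+ # Note on the loss above: with target = (latents - grad).detach(), the objective
+ # 0.5 * ||latents - target||^2 has gradient d(loss)/d(latents) = grad = w * (noise_pred - noise),
+ # so backpropagation applies exactly the score distillation (SDS) gradient without
+ # differentiating through the diffusion UNet.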
234
+ def angle_between(self, sph_v1, sph_v2):
235
+ def sph2cart(sv):
236
+ r, theta, phi = sv[0], sv[1], sv[2]
237
+ # The polar representation is different from Stable-DreamFusion
238
+ return torch.tensor([r * torch.cos(theta) * torch.cos(phi), r * torch.cos(theta) * torch.sin(phi), r * torch.sin(theta)])
239
+ def unit_vector(v):
240
+ return v / torch.linalg.norm(v)
241
+ def angle_between_2_sph(sv1, sv2):
242
+ v1, v2 = sph2cart(sv1), sph2cart(sv2)
243
+ v1_u, v2_u = unit_vector(v1), unit_vector(v2)
244
+ return torch.arccos(torch.clip(torch.dot(v1_u, v2_u), -1.0, 1.0))
245
+ angles = torch.empty(len(sph_v1), len(sph_v2))
246
+ for i, sv1 in enumerate(sph_v1):
247
+ for j, sv2 in enumerate(sph_v2):
248
+ angles[i][j] = angle_between_2_sph(sv1, sv2)
249
+ return angles
250
+
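+ # Hedged usage sketch for angle_between: each row is (radius, polar, azimuth) with the
+ # angles in radians; the result is the pairwise angle between the camera directions,
+ # used in stage 2 to pick the nearest conditioning view.
+ #   v1 = torch.tensor([[2.0, 0.0, 0.0]])                        # query view
+ #   v2 = torch.tensor([[2.0, 0.0, 0.0], [2.0, 0.0, np.pi / 2]]) # reference views
+ #   angles = self.angle_between(v1, v2)                         # [1, 2], approx. [0, pi/2]
+ #   nearest = torch.argmin(angles.squeeze()).item()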
251
+ def batch_train_step(self, pred_rgb, target_RT, cond_cams, step_ratio=None, guidance_scale=5, as_latent=False, step=None):
252
+ # pred_rgb: tensor [1, 3, H, W] in [0, 1]
253
+
254
+ batch_size = pred_rgb.shape[0]
255
+
256
+ if as_latent:
257
+ latents = F.interpolate(pred_rgb, (32, 32), mode='bilinear', align_corners=False) * 2 - 1
258
+ else:
259
+ pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
260
+ latents = self.encode_imgs(pred_rgb_256.to(self.dtype))
261
+
262
+ if step_ratio is not None:
263
+ # dreamtime-like
264
+ # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio)
265
+ t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step)
266
+ t = torch.full((batch_size,), t, dtype=torch.long, device=self.device)
267
+ else:
268
+ t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)
269
+
270
+ w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1)
271
+
272
+ with torch.no_grad():
273
+ noise = torch.randn_like(latents)
274
+ latents_noisy = self.scheduler.add_noise(latents, noise, t)
275
+
276
+ x_in = torch.cat([latents_noisy] * 2 * self.num_views)
277
+ t_in = torch.cat([t] * 2 * self.num_views)
278
+
279
+ cc_embs = []
280
+ vae_embs = []
281
+ noise_preds = []
282
+ for idx in range(self.num_views):
283
+ cond_RT = {
284
+ "c2w": cond_cams[idx].c2w,
285
+ "focal_length": cond_cams[idx].focal_length,
286
+ }
287
+ T = self.get_cam_embeddings_6D(target_RT, cond_RT)
288
+ cc_emb = torch.cat([self.embeddings[0][idx].repeat(batch_size, 1, 1), T], dim=-1)
289
+ cc_emb = self.pipe.clip_camera_projection(cc_emb)
290
+ cc_emb = torch.cat([cc_emb, torch.zeros_like(cc_emb)], dim=0)
291
+
292
+ vae_emb = self.embeddings[1][idx].repeat(batch_size, 1, 1, 1)
293
+ vae_emb = torch.cat([vae_emb, torch.zeros_like(vae_emb)], dim=0)
294
+
295
+ cc_embs.append(cc_emb)
296
+ vae_embs.append(vae_emb)
297
+
298
+ cc_emb = torch.cat(cc_embs, dim=0)
299
+ vae_emb = torch.cat(vae_embs, dim=0)
300
+ noise_pred = self.unet(
301
+ torch.cat([x_in, vae_emb], dim=1),
302
+ t_in.to(self.unet.dtype),
303
+ encoder_hidden_states=cc_emb,
304
+ ).sample
305
+
306
+ noise_pred_chunks = noise_pred.chunk(self.num_views)
307
+ for idx in range(self.num_views):
308
+ noise_pred_cond, noise_pred_uncond = noise_pred_chunks[idx][0], noise_pred_chunks[idx][1]
309
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
310
+ noise_preds.append(noise_pred)
311
+
312
+ noise_pred = torch.stack(noise_preds).mean(dim=0)  # average the guided noise predictions over all views
313
+
314
+ grad = w * (noise_pred - noise)
315
+ grad = torch.nan_to_num(grad)
316
+
317
+ target = (latents - grad).detach()
318
+ loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum')
319
+
320
+ return loss
321
+
322
+ def decode_latents(self, latents):
323
+ latents = 1 / self.vae.config.scaling_factor * latents
324
+
325
+ imgs = self.vae.decode(latents).sample
326
+ imgs = (imgs / 2 + 0.5).clamp(0, 1)
327
+
328
+ return imgs
329
+
330
+ def encode_imgs(self, imgs, mode=False):
331
+ # imgs: [B, 3, H, W]
332
+
333
+ imgs = 2 * imgs - 1
334
+
335
+ posterior = self.vae.encode(imgs).latent_dist
336
+ if mode:
337
+ latents = posterior.mode()
338
+ else:
339
+ latents = posterior.sample()
340
+ latents = latents * self.vae.config.scaling_factor
341
+
342
+ return latents
343
+
344
+
345
+ def process_im(im, bg_remover=None, device="cuda"):  # standalone helper: the rembg session and device are passed in explicitly
346
+ if im.shape[-1] == 3:
347
+ if bg_remover is None:
348
+ bg_remover = rembg.new_session()
349
+ im = rembg.remove(im, session=bg_remover)
350
+
351
+ im = im.astype(np.float32) / 255.0
352
+
353
+ input_mask = im[..., 3:]
354
+ input_img = im[..., :3] * input_mask + (1 - input_mask)
355
+ input_img = input_img[..., ::-1].copy()
356
+ image = torch.from_numpy(input_img).permute(2, 0, 1).unsqueeze(0).contiguous().to(device)
357
+ image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False)
358
+
359
+ return image
360
+
361
+
362
+ def get_T_6d(target_RT, cond_RT, use_objaverse):
363
+ if use_objaverse:
364
+ new_row = torch.tensor([[0., 0., 0., 1.]])
365
+
366
+ T_target = torch.from_numpy(target_RT) # world to cam matrix
367
+ T_target = torch.cat((T_target, new_row), dim=0)
368
+ T_target = torch.linalg.inv(T_target) # Cam to world matrix
369
+ T_target[:3, :] = T_target[[1, 2, 0]]
370
+
371
+ T_cond = torch.from_numpy(cond_RT)
372
+ T_cond = torch.cat((T_cond, new_row), dim=0)
373
+ T_cond = torch.linalg.inv(T_cond)
374
+ T_cond[:3, :] = T_cond[[1, 2, 0]]
375
+
376
+ focal_len = torch.tensor([1., 1.])
377
+
378
+ else:
379
+ T_target = torch.from_numpy(target_RT["c2w"])
380
+ focal_len_target = torch.from_numpy(target_RT["focal_length"])
381
+
382
+ T_cond = torch.from_numpy(cond_RT["c2w"])
383
+ focal_len_cond = torch.from_numpy(cond_RT["focal_length"])
384
+
385
+ focal_len = focal_len_target / focal_len_cond
386
+
387
+ d_T = torch.linalg.inv(T_target) @ T_cond
388
+ d_T = torch.cat([d_T.flatten(), torch.log(focal_len)])
389
+ return d_T
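+ # Hedged usage sketch for get_T_6d (names illustrative): with use_objaverse=True the
+ # inputs are the 3x4 world-to-camera matrices stored with the Objaverse renders;
+ # otherwise they are dicts like the ones built in main_stage2:
+ #   d_T = get_T_6d({"c2w": c2w_t, "focal_length": np.array([fx_t, fy_t])},
+ #                  {"c2w": c2w_c, "focal_length": np.array([fx_c, fy_c])},
+ #                  use_objaverse=False)  # shape [18]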
sparseags/main_stage1.py ADDED
@@ -0,0 +1,669 @@
1
+ import os
2
+ import cv2
3
+ import sys
4
+ import json
5
+ import time
6
+ import tqdm
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+ import rembg
14
+ from liegroups.torch import SE3
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+
19
+ from sparseags.cam_utils import orbit_camera, OrbitCamera, mat2latlon, find_mask_center_and_translate
20
+ from sparseags.render_utils.gs_renderer import Renderer, Camera, FoVCamera, CustomCamera
21
+ from sparseags.mesh_utils.grid_put import mipmap_linear_grid_put_2d
22
+ from sparseags.mesh_utils.mesh import Mesh, safe_normalize
23
+
24
+
25
+ class GUI:
26
+ def __init__(self, opt):
27
+ self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
28
+ self.gui = opt.gui # enable gui
29
+ self.W = opt.W
30
+ self.H = opt.H
31
+
32
+ self.mode = "image"
33
+ self.seed = 0
34
+
35
+ self.buffer_image = np.ones((self.W, self.H, 3), dtype=np.float32)
36
+ self.need_update = True # update buffer_image
37
+
38
+ # models
39
+ self.device = torch.device("cuda")
40
+ self.bg_remover = None
41
+
42
+ self.guidance_sd = None
43
+ self.guidance_zero123 = None
44
+ self.guidance_dino = None
45
+
46
+ self.enable_sd = False
47
+ self.enable_zero123 = False
48
+ self.enable_dino = False
49
+
50
+ # renderer
51
+ self.renderer = Renderer(sh_degree=self.opt.sh_degree)
52
+ self.renderer.enable_dino = self.opt.lambda_dino > 0
53
+ self.renderer.gaussians.enable_dino = self.opt.lambda_dino > 0
54
+ self.renderer.gaussians.dino_feat_dim = 36
55
+ self.gaussain_scale_factor = 1
56
+
57
+ # input image
58
+ self.input_img = None
59
+ self.input_mask = None
60
+ self.input_img_torch = None
61
+ self.input_mask_torch = None
62
+
63
+ # training stuff
64
+ self.training = False
65
+ self.optimizer = None
66
+ self.step = 0
67
+ self.train_steps = 1 # steps per rendering loop
68
+
69
+ # load input data
70
+ self.load_input(self.opt.camera_path, self.opt.order_path)
71
+
72
+ self.cam = OrbitCamera(opt.W, opt.H, r=3, fovy=opt.fovy)
73
+
74
+ # override if provide a checkpoint
75
+ if self.opt.load is not None:
76
+ self.renderer.initialize(self.opt.load)
77
+ else:
78
+ # initialize gaussians to a blob
79
+ self.renderer.initialize(num_pts=self.opt.num_pts, radius=0.3, mode='sphere') # 0.5 for radius 3
80
+
81
+ # initialize gaussians to a carved voxel
82
+ # self.renderer.initialize(num_pts=self.opt.num_pts, radius=0.5, cameras=self.cams, masks=self.input_mask, mode='carve') # 0.5
83
+
84
+ def seed_everything(self):
85
+ try:
86
+ seed = int(self.seed)
87
+ except:
88
+ seed = np.random.randint(0, 1000000)
89
+
90
+ os.environ["PYTHONHASHSEED"] = str(seed)
91
+ np.random.seed(seed)
92
+ torch.manual_seed(seed)
93
+ torch.cuda.manual_seed(seed)
94
+ torch.backends.cudnn.deterministic = True
95
+ torch.backends.cudnn.benchmark = True
96
+
97
+ self.last_seed = seed
98
+
99
+ def prepare_train(self):
100
+ self.step = 0
101
+
102
+ # setup training
103
+ self.renderer.gaussians.training_setup(self.opt)
104
+ # do progressive sh-level
105
+ self.renderer.gaussians.active_sh_degree = 0
106
+ self.optimizer = self.renderer.gaussians.optimizer
107
+
108
+ self.enable_sd = self.opt.lambda_sd > 0 and self.prompt != ""
109
+ self.enable_zero123 = self.opt.lambda_zero123 > 0 and self.input_img is not None
110
+ self.enable_dino = self.opt.lambda_dino > 0
111
+
112
+ # lazy load guidance model
113
+ if self.guidance_zero123 is None and self.enable_zero123:
114
+ print(f"[INFO] loading zero123...")
115
+ from sparseags.guidance_utils.zero123_6d_utils import Zero123
116
+ self.guidance_zero123 = Zero123(self.device, model_key='ashawkey/zero123-xl-diffusers')
117
+ print(f"[INFO] loaded zero123!")
118
+ self.guidance_zero123.opt = self.opt
119
+ self.guidance_zero123.num_views = self.num_views
120
+
121
+ # input image
122
+ if self.input_img is not None:
123
+ import torchvision.transforms as transforms
124
+ from PIL import Image
125
+ self.input_img_torch = torch.from_numpy(self.input_img).permute(0, 3, 1, 2).to(self.device)
126
+ self.input_mask_torch = torch.from_numpy(self.input_mask).permute(0, 3, 1, 2).to(self.device)
127
+
128
+ # prepare embeddings
129
+ with torch.no_grad():
130
+ if self.enable_zero123:
131
+ self.guidance_zero123.get_img_embeds(self.input_img_torch)
132
+
133
+ def train_step(self):
134
+ starter = torch.cuda.Event(enable_timing=True)
135
+ ender = torch.cuda.Event(enable_timing=True)
136
+ starter.record()
137
+
138
+ for _ in range(self.train_steps):
139
+
140
+ self.step += 1
141
+ step_ratio = min(1, self.step / self.opt.iters)
142
+
143
+ # update lr
144
+ self.renderer.gaussians.update_learning_rate(self.step)
145
+
146
+ loss = 0
147
+
148
+ ### known view
149
+ for choice in range(self.num_views):
150
+ # For multiview training
151
+ cur_cam = self.cams[choice]
152
+
153
+ bg_size = self.renderer.gaussians.dino_feat_dim if self.enable_dino else 3
154
+ bg_color = torch.ones(
155
+ bg_size,
156
+ dtype=torch.float32,
157
+ device="cuda",
158
+ )
159
+ out = self.renderer.render(cur_cam, bg_color=bg_color)
160
+
161
+ # rgb loss
162
+ image = out["image"]
163
+ loss = loss + 10000 * step_ratio * F.mse_loss(image, self.input_img_torch[choice])
164
+
165
+ # mask loss
166
+ mask = out["alpha"]
167
+ loss = loss + 1000 * step_ratio * F.mse_loss(mask, self.input_mask_torch[choice])
168
+
169
+ # dino loss
170
+ if self.enable_dino:
171
+ feature = out["feature"]
172
+ loss = loss + 1000 * step_ratio * F.mse_loss(feature, self.guidance_dino.embeddings[choice])
173
+
174
+ ### novel view (manual batch)
175
+ render_resolution = 128 if step_ratio < 0.3 else (256 if step_ratio < 0.6 else 512)
176
+ images = []
177
+ masks = []
178
+ vers, hors, radii = [], [], []
179
+ # avoid too large elevation (> 80 or < -80)
180
+ min_ver = max(-60 + np.array(self.opt.ref_polars).min(), -80)  # use +/- 30 instead of +/- 60 for CO3D
181
+ max_ver = min(60 + np.array(self.opt.ref_polars).max(), 80)
182
+
183
+ for _ in range(self.opt.batch_size):
184
+ # render random view
185
+ ver = np.random.randint(min_ver, max_ver) - self.opt.ref_polars[0]
186
+ hor = np.random.randint(-180, 180)
187
+ radius = 0
188
+
189
+ vers.append(ver)
190
+ hors.append(hor)
191
+ radii.append(radius)
192
+
193
+ pose = orbit_camera(
194
+ self.opt.ref_polars[0] + ver,
195
+ self.opt.ref_azimuths[0] + hor,
196
+ np.array(self.opt.ref_radii).mean() + radius,
197
+ )
198
+
199
+ # Azimuth
200
+ # [-180, -135): -4, [-135, -90): -3, [-90, -45): -2, [-45, 0): -1
201
+ # [0, 45): 0, [45, 90): 1, [90, 135): 2, [135, 180): 3.
202
+ # Elevation: [0, 90): 0 [-90, 0): 1
203
+ idx_ver, idx_hor = int((self.opt.ref_polars[0]+ver) < 0), hor // 45
204
+
205
+ flag = 0
206
+ cx, cy = self.pp_pools[idx_ver, idx_hor+4].tolist()
207
+ cnt = 0
208
+ fx, fy = self.fx, self.fy
209
+
210
+ # in each iter we modify cx, cy, fx, fy to make sure the rendered object is at the center and has a reasonable size
211
+ while not flag:
212
+
213
+ if cnt >= 10:
214
+ # print(f"[ERROR] Something might be wrong here!")
215
+ break
216
+
217
+ flag_principal_point, flag_focal_length = 0, 0
218
+
219
+ # we modified the field of view. Otherwise, the rendered object will be too small
220
+ # cur_cam = FoVCamera(pose, render_resolution, render_resolution, self.fovy, self.fovx, self.cam.near, self.cam.far)
221
+ cur_cam = Camera(pose, render_resolution, render_resolution, fx, fy, cx, cy, self.cam.near, self.cam.far)
222
+
223
+ bg_size = self.renderer.gaussians.dino_feat_dim if self.enable_dino else 3
224
+ bg_color = torch.ones(bg_size, dtype=torch.float32, device="cuda") if np.random.rand() > self.opt.invert_bg_prob else torch.zeros(bg_size, dtype=torch.float32, device="cuda")
225
+ out = self.renderer.render(cur_cam, bg_color=bg_color)
226
+
227
+ image = out["image"].unsqueeze(0)
228
+ mask = out["alpha"].unsqueeze(0)
229
+ delta_xy = find_mask_center_and_translate(image.detach(), mask.detach()) / render_resolution * 256
230
+
231
+ # (1) check if the principal points are appropriate
232
+ if delta_xy[0].abs() > 10 or delta_xy[1].abs() > 10:
233
+ cx -= delta_xy[0]
234
+ cy -= delta_xy[1]
235
+ self.pp_pools[idx_ver, idx_hor+4] = torch.tensor([cx, cy]) # Update pp_pools
236
+ else:
237
+ flag_principal_point = 1
238
+
239
+ num_pixs_mask = (mask > 0.5).float().sum().item()
240
+ target_num_pixs = render_resolution ** 2 / (1.2 ** 2)
241
+
242
+ mask_to_compute = (mask > 0.5).squeeze().detach().cpu().numpy()
243
+ y_indices, x_indices = np.where(mask_to_compute > 0)
244
+
245
+ if len(x_indices) == 0 or len(y_indices) == 0:
246
+ # empty mask: no object visible in this render; skip this centering attempt
247
+ continue
248
+
249
+ # find the bounding box coordinates
250
+ x1, y1 = np.min(x_indices), np.min(y_indices)
251
+ x2, y2 = np.max(x_indices), np.max(y_indices)
252
+
253
+ bbox = np.array([x1, y1, x2, y2])
254
+ extents = (bbox[2:] - bbox[:2]).max()
255
+ num_pixs_mask = extents ** 2
256
+
257
+ # (2) check if the focal lengths are appropriate
258
+ if abs(num_pixs_mask - target_num_pixs) > 0.05 * render_resolution ** 2:
259
+ if num_pixs_mask == 0:
260
+ pass
261
+ else:
262
+ fx = fx * np.sqrt(target_num_pixs / num_pixs_mask)
263
+ fy = fy * np.sqrt(target_num_pixs / num_pixs_mask)
264
+ else:
265
+ flag_focal_length = 1
266
+
267
+ if flag_principal_point * flag_focal_length == 1:
268
+ flag = 1
269
+
270
+ cnt += 1
271
+
272
+ images.append(image)
273
+ masks.append(mask)
274
+
275
+ images = torch.cat(images, dim=0)
276
+
277
+ if self.enable_zero123:
278
+ target_RT = {
279
+ "c2w": pose,
280
+ "focal_length": np.array(fx, fy),
281
+ }
282
+ loss = loss + self.opt.lambda_zero123 * self.guidance_zero123.batch_train_step(images, target_RT, self.cams, step_ratio=step_ratio if self.opt.anneal_timestep else None)
283
+
284
+ if self.enable_dino:
285
+ loss_dino = self.guidance_dino.train_step(
286
+ images,
287
+ out["feature"],
288
+ step_ratio=step_ratio if self.opt.anneal_timestep else None
289
+ )
290
+ loss = loss + self.opt.lambda_dino * loss_dino
291
+
292
+ # optimize step
293
+ loss.backward()
294
+ self.optimizer.step()
295
+ self.optimizer.zero_grad()
296
+ latlons = [mat2latlon(cam.c2w[:3, 3]) for cam in self.cams]
297
+ if self.opt.opt_cam:
298
+ for i, cam in enumerate(self.cams):
299
+ w2c = cam.w2c @ SE3.exp(cam.cam_params.detach()).as_matrix()
300
+ w2c[:2, :3] *= -1
301
+ w2c[:2, 3] *= -1
302
+ self.camera_tracks[i].append(w2c.tolist())
303
+ self.opt.ref_polars = [float(cam[0]) for cam in latlons]
304
+ self.opt.ref_azimuths = [float(cam[1]) for cam in latlons]
305
+ self.opt.ref_radii = [float(cam[2]) for cam in latlons]
306
+
307
+ # densify and prune
308
+ if self.step >= self.opt.density_start_iter and self.step <= self.opt.density_end_iter:
309
+ viewspace_point_tensor, visibility_filter, radii = out["viewspace_points"], out["visibility_filter"], out["radii"]
310
+ self.renderer.gaussians.max_radii2D[visibility_filter] = torch.max(self.renderer.gaussians.max_radii2D[visibility_filter], radii[visibility_filter])
311
+ self.renderer.gaussians.add_densification_stats(viewspace_point_tensor, visibility_filter)
312
+
313
+ if self.step % self.opt.densification_interval == 0:
314
+ self.renderer.gaussians.densify_and_prune(self.opt.densify_grad_threshold, min_opacity=0.01, extent=4, max_screen_size=1)
315
+
316
+ # if self.step % self.opt.opacity_reset_interval == 0:
317
+ # self.renderer.gaussians.reset_opacity()
318
+
319
+ if self.step % 100 == 0 and self.renderer.gaussians.max_sh_degree != 0:
320
+ self.renderer.gaussians.oneupSHdegree()
321
+
322
+ ender.record()
323
+ torch.cuda.synchronize()
324
+ t = starter.elapsed_time(ender)
325
+
326
+ self.need_update = True
327
+
328
+ def load_input(self, camera_path, order_path=None):
329
+ # load image
330
+ print(f'[INFO] load data from {camera_path}...')
331
+
332
+ if order_path is not None:
333
+ with open(order_path, 'r') as f:
334
+ indices = json.load(f)
335
+ else:
336
+ indices = None
337
+
338
+ with open(camera_path, 'r') as f:
339
+ data = json.load(f)
340
+
341
+ self.cam_params = {}
342
+ for k, v in data.items():
343
+ if indices is None:
344
+ self.cam_params[k] = data[k]
345
+ else:
346
+ if int(k) in indices or k in indices:
347
+ self.cam_params[k] = data[k]
348
+
349
+ if self.opt.all_views:
350
+ for k, v in self.cam_params.items():
351
+ self.cam_params[k]['opt_cam'] = 1
352
+ self.cam_params[k]['flag'] = 1
353
+ else:
354
+ for k, v in self.cam_params.items():
355
+ if int(self.cam_params[k]['flag']):
356
+ self.cam_params[k]['opt_cam'] = 1
357
+ else:
358
+ self.cam_params[k]['opt_cam'] = 0
359
+
360
+ img_paths = [v["filepath"] for k, v in self.cam_params.items() if v["flag"]]
361
+ self.num_views = len(img_paths)
362
+ print(f"[INFO] Number of views: {self.num_views}")
363
+
364
+ for filepath in img_paths:
365
+ print(filepath)
366
+
367
+ images, masks = [], []
368
+
369
+ for i in range(self.num_views):
370
+ img = cv2.imread(img_paths[i], cv2.IMREAD_UNCHANGED)
371
+ if img.shape[-1] == 3:
372
+ if self.bg_remover is None:
373
+ self.bg_remover = rembg.new_session()
374
+ img = rembg.remove(img, session=self.bg_remover)
375
+
376
+ img = img.astype(np.float32) / 255.0
377
+
378
+ # Non-integer cropping creates non-zero mask values
379
+ input_mask = (img[..., 3:] > 0.5).astype(np.float32)
380
+
381
+ # white bg
382
+ input_img = img[..., :3] * input_mask + (1 - input_mask)
383
+ # bgr to rgb
384
+ input_img = input_img[..., ::-1].copy()
385
+
386
+ images.append(input_img); masks.append(input_mask)
387
+
388
+ images = np.stack(images, axis=0)
389
+ masks = np.stack(masks, axis=0)
390
+ self.input_img = images[:self.num_views]
391
+ self.input_mask = masks[:self.num_views]
392
+ self.all_input_images = images
393
+
394
+ self.cams = [CustomCamera(v, index=int(k), opt_pose=self.opt.opt_cam and v['opt_cam']) for k, v in self.cam_params.items() if v["flag"]]
395
+ cam_centers = [mat2latlon(cam.camera_center) for cam in self.cams]
396
+ self.opt.ref_polars = [float(cam[0]) for cam in cam_centers]
397
+ self.opt.ref_azimuths = [float(cam[1]) for cam in cam_centers]
398
+ self.opt.ref_radii = [float(cam[2]) for cam in cam_centers]
399
+ self.fx = np.array([cam.fx for cam in self.cams], dtype=np.float32).mean()
400
+ self.fy = np.array([cam.fy for cam in self.cams], dtype=np.float32).mean()
401
+ self.cx = 128
402
+ self.cy = 128
403
+ if self.opt.opt_cam:
404
+ self.camera_tracks = {}
405
+ for i, cam in enumerate(self.cams):
406
+ self.camera_tracks[i] = []
407
+
408
+ # Azimuth Mapping: [-180, -135): -4, [-135, -90): -3, [-90, -45): -2, [-45, 0): -1,
409
+ # [0, 45): 0, [45, 90): 1, [90, 135): 2, [135, 180): 3.
410
+ # Elevation Mapping: [0, 90): 0, [-90, 0): 1.
411
+
412
+ # Principal Point Pool: Tensor (2, 8, 2), where:
413
+ # - 2: Elevation groups, 8: Azimuth intervals, 2: x, y coordinates (init to 128).
414
+
415
+ # we created a "pool" for principal points
416
+ # we use these principal points to render image to make sure object is at the center
417
+ self.pp_pools = torch.full((2, 8, 2), 128)
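+ # Index sketch for the pool above (this mirrors the lookup in train_step): for a sampled
+ # view at elevation ref_polars[0] + ver and azimuth offset hor (degrees in [-180, 180)):
+ #   idx_ver = int((self.opt.ref_polars[0] + ver) < 0)  # 0: elevation >= 0, 1: elevation < 0
+ #   idx_hor = hor // 45                                # -4 .. 3
+ #   cx, cy = self.pp_pools[idx_ver, idx_hor + 4].tolist()
+ # so each of the 2 x 8 buckets keeps its own running estimate of the principal point.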
418
+ if self.opt.opt_cam:
419
+ self.renderer.gaussians.cam_params = [cam.cam_params for cam in self.cams[:] if cam.opt_pose]
420
+
421
+ @torch.no_grad()
422
+ def save_video(self, post_fix=None):
423
+ xyz = self.renderer.gaussians._xyz
424
+ center = self.renderer.gaussians._xyz.mean(dim=0)
425
+ squared_distances = torch.sum((xyz - center) ** 2, dim=1)
426
+ max_distance_squared = torch.max(squared_distances)
427
+ radius = torch.sqrt(max_distance_squared) + 1.0
428
+ radius = radius.detach().cpu().numpy()
429
+
430
+ render_resolution = 256
431
+ images = []
432
+ frame_rate = 30
433
+ image_size = (render_resolution, render_resolution) # Size of each image
434
+ video_path = self.opt.save_path + f'_rendered_video_{post_fix}.mp4'
435
+
436
+ azimuth = np.arange(0, 360, 3, dtype=np.int32)
437
+
438
+ for azi in tqdm.tqdm(azimuth):
439
+ target = center.detach().cpu().numpy()
440
+ pose = orbit_camera(-30, azi, radius, target=target)
441
+ cur_cam = FoVCamera(
442
+ pose,
443
+ render_resolution,
444
+ render_resolution,
445
+ self.cam.fovy,
446
+ self.cam.fovx,
447
+ self.cam.near,
448
+ self.cam.far,
449
+ )
450
+
451
+ out = self.renderer.render(cur_cam)
452
+ img = out["image"].detach().cpu().numpy() # [3, H, W] in [0, 1]
453
+ img = np.transpose(img, (1, 2, 0))
454
+ image = (img * 255).astype(np.uint8)
455
+ images.append(image)
456
+
457
+ images = np.stack(images, axis=0)
458
+ # ~4 seconds, 120 frames at 30 fps
459
+ import imageio
460
+ imageio.mimwrite(video_path, images, fps=30, quality=8, macro_block_size=1)
461
+
462
+
463
+ @torch.no_grad()
464
+ def save_model(self, mode='geo', texture_size=1024):
465
+ os.makedirs(self.opt.outdir, exist_ok=True)
466
+ if mode == 'geo':
467
+ path = os.path.join(self.opt.outdir, self.opt.save_path + '_mesh.ply')
468
+ mesh = self.renderer.gaussians.extract_mesh(path, self.opt.density_thresh)
469
+ mesh.write_ply(path)
470
+
471
+ elif mode == 'geo+tex':
472
+ path = os.path.join(self.opt.outdir, self.opt.save_path + '_mesh.' + self.opt.mesh_format)
473
+ mesh = self.renderer.gaussians.extract_mesh(path, self.opt.density_thresh)
474
+
475
+ # perform texture extraction
476
+ print(f"[INFO] unwrap uv...")
477
+ h = w = texture_size
478
+ mesh.auto_uv()
479
+ mesh.auto_normal()
480
+
481
+ albedo = torch.zeros((h, w, 3), device=self.device, dtype=torch.float32)
482
+ cnt = torch.zeros((h, w, 1), device=self.device, dtype=torch.float32)
483
+ if self.enable_dino:
484
+ feature = torch.zeros((h, w, self.renderer.gaussians.dino_feat_dim), device=self.device, dtype=torch.float32)
485
+
486
+ # self.prepare_train() # tmp fix for not loading 0123
487
+ # vers = [0]
488
+ # hors = [0]
489
+ vers = [0] * 8 + [-45] * 8 + [45] * 8 + [-89.9, 89.9]
490
+ hors = [0, 45, -45, 90, -90, 135, -135, 180] * 3 + [0, 0]
491
+
492
+ render_resolution = 512
493
+
494
+ import nvdiffrast.torch as dr
495
+
496
+ if not self.opt.force_cuda_rast and (not self.opt.gui or os.name == 'nt'):
497
+ glctx = dr.RasterizeGLContext()
498
+ else:
499
+ glctx = dr.RasterizeCudaContext()
500
+
501
+ for ver, hor in zip(vers, hors):
502
+ # render image
503
+ pose = orbit_camera(ver, hor, self.cam.radius)
504
+
505
+ cur_cam = FoVCamera(
506
+ pose,
507
+ render_resolution,
508
+ render_resolution,
509
+ self.cam.fovy,
510
+ self.cam.fovx,
511
+ self.cam.near,
512
+ self.cam.far,
513
+ )
514
+
515
+ cur_out = self.renderer.render(cur_cam)
516
+
517
+ rgbs = cur_out["image"].unsqueeze(0) # [1, 3, H, W] in [0, 1]
518
+ if self.enable_dino:
519
+ features = cur_out["feature"].unsqueeze(0) # [1, 384, 512, 512]
520
+
521
+ # enhance texture quality with zero123 [not working well]
522
+ # if self.opt.guidance_model == 'zero123':
523
+ # rgbs = self.guidance.refine(rgbs, [ver], [hor], [0])
524
+ # import kiui
525
+ # kiui.vis.plot_image(rgbs)
526
+
527
+ # get coordinate in texture image
528
+ pose = torch.from_numpy(pose.astype(np.float32)).to(self.device)
529
+ proj = torch.from_numpy(self.cam.perspective.astype(np.float32)).to(self.device)
530
+
531
+ v_cam = torch.matmul(F.pad(mesh.v, pad=(0, 1), mode='constant', value=1.0), torch.inverse(pose).T).float().unsqueeze(0)
532
+ v_clip = v_cam @ proj.T
533
+ rast, rast_db = dr.rasterize(glctx, v_clip, mesh.f, (render_resolution, render_resolution))
534
+
535
+ depth, _ = dr.interpolate(-v_cam[..., [2]], rast, mesh.f) # [1, H, W, 1]
536
+ depth = depth.squeeze(0) # [H, W, 1]
537
+
538
+ alpha = (rast[0, ..., 3:] > 0).float()
539
+
540
+ uvs, _ = dr.interpolate(mesh.vt.unsqueeze(0), rast, mesh.ft) # [1, 512, 512, 2] in [0, 1]
541
+
542
+ # use normal to produce a back-project mask
543
+ normal, _ = dr.interpolate(mesh.vn.unsqueeze(0).contiguous(), rast, mesh.fn)
544
+ normal = safe_normalize(normal[0])
545
+
546
+ # rotated normal (where [0, 0, 1] always faces camera)
547
+ rot_normal = normal @ pose[:3, :3]
548
+ viewcos = rot_normal[..., [2]]
549
+
550
+ mask = (alpha > 0) & (viewcos > 0.5) # [H, W, 1]
551
+ mask = mask.view(-1)
552
+
553
+ uvs = uvs.view(-1, 2).clamp(0, 1)[mask]
554
+ rgbs = rgbs.view(3, -1).permute(1, 0)[mask].contiguous()
555
+
556
+ # update texture image
557
+ cur_albedo, cur_cnt = mipmap_linear_grid_put_2d(
558
+ h, w,
559
+ uvs[..., [1, 0]] * 2 - 1,
560
+ rgbs,
561
+ min_resolution=256,
562
+ return_count=True,
563
+ )
564
+
565
+ if self.enable_dino:
566
+ features = features.view(features.shape[1], -1).permute(1, 0)[mask].contiguous()
567
+ cur_feature, _ = mipmap_linear_grid_put_2d(
568
+ h, w,
569
+ uvs[..., [1, 0]] * 2 - 1,
570
+ features,
571
+ min_resolution=256,
572
+ return_count=True,
573
+ )
574
+
575
+ # albedo += cur_albedo
576
+ # cnt += cur_cnt
577
+ mask = cnt.squeeze(-1) < 0.1
578
+ albedo[mask] += cur_albedo[mask]
579
+ cnt[mask] += cur_cnt[mask]
580
+
581
+ if self.enable_dino:
582
+ feature[mask] += cur_feature[mask]
583
+
584
+ mask = cnt.squeeze(-1) > 0
585
+ albedo[mask] = albedo[mask] / cnt[mask].repeat(1, 3)
586
+
587
+ if self.enable_dino:
588
+ feature[mask] = feature[mask] / cnt[mask].repeat(1, feature.shape[-1])
589
+
590
+ mask = mask.view(h, w)
591
+
592
+ albedo = albedo.detach().cpu().numpy()
593
+ mask = mask.detach().cpu().numpy()
594
+
595
+ if self.enable_dino:
596
+ feature = feature.detach().cpu().numpy()
597
+
598
+ # dilate texture
599
+ from sklearn.neighbors import NearestNeighbors
600
+ from scipy.ndimage import binary_dilation, binary_erosion
601
+
602
+ inpaint_region = binary_dilation(mask, iterations=32)
603
+ inpaint_region[mask] = 0
604
+
605
+ search_region = mask.copy()
606
+ not_search_region = binary_erosion(search_region, iterations=3)
607
+ search_region[not_search_region] = 0
608
+
609
+ search_coords = np.stack(np.nonzero(search_region), axis=-1)
610
+ inpaint_coords = np.stack(np.nonzero(inpaint_region), axis=-1)
611
+
612
+ knn = NearestNeighbors(n_neighbors=1, algorithm="kd_tree").fit(
613
+ search_coords
614
+ )
615
+ _, indices = knn.kneighbors(inpaint_coords)
616
+
617
+ albedo[tuple(inpaint_coords.T)] = albedo[tuple(search_coords[indices[:, 0]].T)]
618
+
619
+ mesh.albedo = torch.from_numpy(albedo).to(self.device)
620
+ # mesh.write(path)
621
+
622
+ if self.enable_dino:
623
+ feature[tuple(inpaint_coords.T)] = feature[tuple(search_coords[indices[:, 0]].T)]
624
+ mesh.feature = torch.from_numpy(feature).to(self.device)
625
+
626
+ mesh.write(path, self.enable_dino)
627
+
628
+ else:
629
+ path = os.path.join(self.opt.outdir, self.opt.save_path + '_model.ply')
630
+ self.renderer.gaussians.save_ply(path)
631
+
632
+ print(f"[INFO] save model to {path}.")
633
+
634
+ # no gui mode
635
+ def train(self, iters=500):
636
+ if iters > 0:
637
+ self.prepare_train()
638
+ for i in tqdm.trange(iters):
639
+ self.train_step()
640
+ # do a last prune
641
+ self.renderer.gaussians.prune(min_opacity=0.01, extent=1, max_screen_size=1)
642
+ if self.opt.opt_cam:
643
+ for cam in self.cams:
644
+ try:
645
+ self.cam_params[str(cam.index)]["R"] = cam.rotation.tolist()
646
+ self.cam_params[str(cam.index)]["T"] = cam.translation.tolist()
647
+ except KeyError:
648
+ self.cam_params[f"{cam.index:03}"]["R"] = cam.rotation.tolist()
649
+ self.cam_params[f"{cam.index:03}"]["T"] = cam.translation.tolist()
650
+ with open(self.opt.camera_path.replace(".json", "_updated.json"), "w") as file:
651
+ json.dump(self.cam_params, file, indent=4)
652
+ self.save_model(mode='model')
653
+ self.save_model(mode='geo+tex')
654
+
655
+
656
+ if __name__ == "__main__":
657
+ import argparse
658
+ from omegaconf import OmegaConf
659
+
660
+ parser = argparse.ArgumentParser()
661
+ parser.add_argument("--config", required=True, help="path to the yaml config file")
662
+ args, extras = parser.parse_known_args()
663
+
664
+ # override default config from cli
665
+ opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
666
+
667
+ gui = GUI(opt)
668
+
669
+ gui.train(opt.iters)
sparseags/main_stage2.py ADDED
@@ -0,0 +1,410 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ import time
5
+ import copy
6
+ import tqdm
7
+ import rembg
8
+ import trimesh
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import numpy as np
12
+ import pandas as pd
13
+
14
+ from kiui.lpips import LPIPS
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+
19
+ from sparseags.cam_utils import orbit_camera, OrbitCamera, mat2latlon, find_mask_center_and_translate
20
+ from sparseags.render_utils.gs_renderer import CustomCamera
21
+ from sparseags.mesh_utils.mesh_renderer import Renderer
22
+
23
+
24
+ class GUI:
25
+ def __init__(self, opt):
26
+ self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
27
+ self.gui = opt.gui # enable gui
28
+ self.W = opt.W
29
+ self.H = opt.H
30
+
31
+ self.mode = "image"
32
+ self.seed = 0
33
+
34
+ self.buffer_image = np.ones((self.W, self.H, 3), dtype=np.float32)
35
+ self.need_update = True # update buffer_image
36
+
37
+ # models
38
+ self.device = torch.device("cuda")
39
+ self.bg_remover = None
40
+
41
+ self.guidance_sd = None
42
+ self.guidance_zero123 = None
43
+ self.guidance_dino = None
44
+
45
+ self.enable_sd = False
46
+ self.enable_zero123 = False
47
+ self.enable_dino = False
48
+
49
+ # renderer
50
+ self.renderer = Renderer(opt).to(self.device)
51
+
52
+ # input image
53
+ self.input_img = None
54
+ self.input_mask = None
55
+ self.input_img_torch = None
56
+ self.input_mask_torch = None
57
+ self.overlay_input_img = False
58
+ self.overlay_input_img_ratio = 0.5
59
+
60
+ # input text
61
+ self.prompt = ""
62
+ self.negative_prompt = ""
63
+
64
+ # training stuff
65
+ self.training = False
66
+ self.optimizer = None
67
+ self.step = 0
68
+ self.train_steps = 1 # steps per rendering loop
69
+
70
+ # load input data
71
+ self.load_input(self.opt.camera_path, self.opt.order_path)
72
+
73
+ # override prompt from cmdline
74
+ if self.opt.prompt is not None:
75
+ self.prompt = self.opt.prompt
76
+ if self.opt.negative_prompt is not None:
77
+ self.negative_prompt = self.opt.negative_prompt
78
+
79
+ def seed_everything(self):
80
+ try:
81
+ seed = int(self.seed)
82
+ except:
83
+ seed = np.random.randint(0, 1000000)
84
+
85
+ os.environ["PYTHONHASHSEED"] = str(seed)
86
+ np.random.seed(seed)
87
+ torch.manual_seed(seed)
88
+ torch.cuda.manual_seed(seed)
89
+ torch.backends.cudnn.deterministic = True
90
+ torch.backends.cudnn.benchmark = True
91
+
92
+ self.last_seed = seed
93
+
94
+ def prepare_train(self):
95
+
96
+ self.step = 0
97
+
98
+ # setup training
99
+ self.optimizer = torch.optim.Adam(self.renderer.get_params())
100
+
101
+ cameras = [CustomCamera(v, index=int(k)) for k, v in self.cam_params.items() if v["flag"]]
102
+ cam_centers = [mat2latlon(cam.camera_center) for cam in cameras]
103
+ self.opt.ref_polars = [float(cam[0]) for cam in cam_centers]
104
+ self.opt.ref_azimuths = [float(cam[1]) for cam in cam_centers]
105
+ self.opt.ref_radii = [float(cam[2]) for cam in cam_centers]
106
+ self.cams = [(cam.c2w, cam.perspective, cam.focal_length) for cam in cameras]
107
+ self.cam = copy.deepcopy(cameras[0])
108
+
109
+ # Azimuth Mapping: [-180, -135): -4, [-135, -90): -3, [-90, -45): -2, [-45, 0): -1,
110
+ # [0, 45): 0, [45, 90): 1, [90, 135): 2, [135, 180): 3.
111
+ # Elevation Mapping: [0, 90): 0, [-90, 0): 1.
112
+
113
+ # Principal Point Pool: Tensor (2, 8, 2), where:
114
+ # - 2: Elevation groups, 8: Azimuth intervals, 2: x, y coordinates (init to 128).
115
+
116
+ # we created a "pool" for principal points
117
+ # we use these principal points to render image to make sure object is at the center
118
+ self.pp_pools = torch.full((2, 8, 2), 128)
119
+
120
+ # The intrinsics is the average over all cams
121
+ self.cam.fx = np.array([cam.fx for cam in cameras], dtype=np.float32).mean()
122
+ self.cam.fy = np.array([cam.fy for cam in cameras], dtype=np.float32).mean()
123
+ self.cam.cx = np.array([cam.cx for cam in cameras], dtype=np.float32).mean()
124
+ self.cam.cy = np.array([cam.cy for cam in cameras], dtype=np.float32).mean()
125
+
126
+ self.enable_sd = self.opt.lambda_sd > 0 and self.prompt != ""
127
+ self.enable_zero123 = self.opt.lambda_zero123 > 0 and self.input_img is not None
128
+ self.enable_dino = self.opt.lambda_dino > 0
129
+
130
+ # lazy load guidance model
131
+ if self.guidance_sd is None and self.enable_sd:
132
+ if self.opt.mvdream:
133
+ print(f"[INFO] loading MVDream...")
134
+ from guidance.mvdream_utils import MVDream
135
+ self.guidance_sd = MVDream(self.device)
136
+ print(f"[INFO] loaded MVDream!")
137
+ else:
138
+ print(f"[INFO] loading SD...")
139
+ from guidance.sd_utils import StableDiffusion
140
+ self.guidance_sd = StableDiffusion(self.device)
141
+ print(f"[INFO] loaded SD!")
142
+
143
+ if self.guidance_zero123 is None and self.enable_zero123:
144
+ print(f"[INFO] loading zero123...")
145
+ from sparseags.guidance_utils.zero123_6d_utils import Zero123
146
+ self.guidance_zero123 = Zero123(self.device, model_key='ashawkey/zero123-xl-diffusers')
147
+ print(f"[INFO] loaded zero123!")
148
+
149
+ if self.guidance_dino is None and self.enable_dino:
150
+ print(f"[INFO] loading dino...")
151
+ from guidance.dino_utils import Dino
152
+ self.guidance_dino = Dino(self.device, n_components=36, model_key="dinov2_vits14")
153
+ self.guidance_dino.fit_pca(self.all_input_images)
154
+ print(f"[INFO] loaded dino!")
155
+
156
+ # input image
157
+ if self.input_img is not None:
158
+ self.input_img_torch = torch.from_numpy(self.input_img).permute(0, 3, 1, 2).to(self.device)
159
+ self.input_img_torch = F.interpolate(self.input_img_torch, (self.opt.ref_size, self.opt.ref_size), mode="bilinear", align_corners=False)
160
+
161
+ self.input_mask_torch = torch.from_numpy(self.input_mask).permute(0, 3, 1, 2).to(self.device)
162
+ self.input_mask_torch = F.interpolate(self.input_mask_torch, (self.opt.ref_size, self.opt.ref_size), mode="bilinear", align_corners=False)
163
+ self.input_img_torch_channel_last = self.input_img_torch.permute(0, 2, 3, 1).contiguous()
164
+
165
+ # prepare embeddings
166
+ with torch.no_grad():
167
+
168
+ if self.enable_sd:
169
+ self.guidance_sd.get_text_embeds([self.prompt], [self.negative_prompt])
170
+
171
+ if self.enable_zero123:
172
+ self.guidance_zero123.get_img_embeds(self.input_img_torch)
173
+
174
+ if self.enable_dino:
175
+ self.guidance_dino.embeddings = self.guidance_dino.get_dino_embeds(self.input_img_torch, upscale=True, reduced=True, learned_up=True) # [8, 18, 18, 36]
176
+
177
+ def train_step(self):
178
+ starter = torch.cuda.Event(enable_timing=True)
179
+ ender = torch.cuda.Event(enable_timing=True)
180
+ starter.record()
181
+
182
+
183
+ for _ in range(self.train_steps):
184
+
185
+ self.step += 1
186
+ step_ratio = min(1, self.step / self.opt.iters_refine)
187
+
188
+ loss = 0
189
+
190
+ ### known view
191
+ for choice in range(self.num_views):
192
+ ssaa = min(2.0, max(0.125, 2 * np.random.random()))
193
+ out = self.renderer.render(*self.cams[choice][:2], self.opt.ref_size, self.opt.ref_size, ssaa=ssaa)
194
+
195
+ # rgb loss
196
+ image = out["image"] # [H, W, 3] in [0, 1]
197
+ valid_mask = (out["alpha"] > 0).detach()
198
+ loss = loss + F.mse_loss(image * valid_mask, self.input_img_torch_channel_last[choice] * valid_mask)
199
+
200
+ if self.enable_dino:
201
+ feature = out["feature"]
202
+ loss = loss + F.mse_loss(feature * valid_mask, self.guidance_dino.embeddings[choice] * valid_mask)
203
+
204
+ ### novel view (manual batch)
205
+ render_resolution = 512
206
+ images = []
207
+ vers, hors, radii = [], [], []
208
+ # avoid too large elevation (> 80 or < -80), and make sure it always cover [-30, 30]
209
+ # min_ver = max(min(-30, -30 - self.opt.elevation), -80 - self.opt.elevation)
210
+ # max_ver = min(max(30, 30 - self.opt.elevation), 80 - self.opt.elevation)
211
+ # min_ver = max(min(-30, -30 + np.array(self.opt.ref_polars).min()), -80)
212
+ # max_ver = min(max(30, 30 + np.array(self.opt.ref_polars).max()), 80)
213
+ min_ver = max(-30 + np.array(self.opt.ref_polars).min(), -80)
214
+ max_ver = min(30 + np.array(self.opt.ref_polars).max(), 80)
215
+
216
+ for _ in range(self.opt.batch_size):
217
+
218
+ # render random view
219
+ ver = np.random.randint(min_ver, max_ver) - self.opt.ref_polars[0]
220
+ hor = np.random.randint(-180, 180)
221
+ radius = 0
222
+
223
+ vers.append(ver)
224
+ hors.append(hor)
225
+ radii.append(radius)
226
+
227
+ pose = orbit_camera(self.opt.ref_polars[0] + ver, self.opt.ref_azimuths[0] + hor, np.array(self.opt.ref_radii).mean() + radius)
228
+
229
+ # random render resolution
230
+ ssaa = min(2.0, max(0.125, 2 * np.random.random()))
231
+
232
+ # Azimuth
233
+ # [-180, -135): -4, [-135, -90): -3, [-90, -45): -2, [-45, 0): -1
234
+ # [0, 45): 0, [45, 90): 1, [90, 135): 2, [135, 180): 3.
235
+ # Elevation: [0, 90): 0 [-90, 0): 1
236
+ idx_ver, idx_hor = int((self.opt.ref_polars[0]+ver) < 0), hor // 45
237
+
238
+ flag = 0
239
+ cx, cy = self.pp_pools[idx_ver, idx_hor+4].tolist()
240
+ cnt = 0
241
+
242
+ while not flag:
243
+
244
+ self.cam.cx = cx
245
+ self.cam.cy = cy
246
+
247
+ if cnt >= 5:
248
+ print(f"[ERROR] Something must be wrong here!")
249
+ break
250
+
251
+ # We modified the field of view. Otherwise, the rendered object will be too small
252
+ out = self.renderer.render(pose, self.cam.perspective, render_resolution, render_resolution, ssaa=ssaa)
253
+
254
+ image = out["image"]
255
+ image = image.permute(2, 0, 1).contiguous().unsqueeze(0)
256
+ mask = out["alpha"] > 0
257
+ mask = mask.permute(2, 0, 1).contiguous().unsqueeze(0)
258
+ delta_xy = find_mask_center_and_translate(image.detach(), mask.detach()) / render_resolution * 256
259
+
260
+ if delta_xy[0].abs() > 10 or delta_xy[1].abs() > 10:
261
+ cx -= delta_xy[0]
262
+ cy -= delta_xy[1]
263
+ self.pp_pools[idx_ver, idx_hor+4] = torch.tensor([cx, cy]) # Update pp_pools
264
+ cnt += 1
265
+ else:
266
+ flag = 1
267
+
268
+ images.append(image)
269
+
270
+ images = torch.cat(images, dim=0)
271
+
272
+ # guidance loss
273
+ strength = step_ratio * 0.15 + 0.8
274
+ if self.enable_zero123:
275
+ v1 = torch.stack([torch.tensor([radius]) + self.opt.ref_radii[0], torch.deg2rad(torch.tensor([ver]) + self.opt.ref_polars[0]), torch.deg2rad(torch.tensor([hor]) + self.opt.ref_azimuths[0])], dim=-1) # polar,azimuth,radius are all actually delta wrt default
276
+ v2 = torch.stack([torch.tensor(self.opt.ref_radii), torch.deg2rad(torch.tensor(self.opt.ref_polars)), torch.deg2rad(torch.tensor(self.opt.ref_azimuths))], dim=-1)
277
+ angles = torch.rad2deg(self.guidance_zero123.angle_between(v1, v2)).to(self.device)
278
+ choice = torch.argmin(angles.squeeze()).item()
279
+
280
+ cond_RT = {
281
+ "c2w": self.cams[choice][0],
282
+ "focal_length": self.cams[choice][-1],
283
+ }
284
+ target_RT = {
285
+ "c2w": pose,
286
+ "focal_length": np.array(self.cam.fx, self.cam.fy),
287
+ }
288
+ cam_embed = self.guidance_zero123.get_cam_embeddings_6D(target_RT, cond_RT)
289
+
290
+ # Additionally add an idx parameter to choose the correct viewpoints
291
+ refined_images = self.guidance_zero123.refine(images, cam_embed, strength=strength, idx=choice).float()
292
+ refined_images = F.interpolate(refined_images, (render_resolution, render_resolution), mode="bilinear", align_corners=False)
293
+ loss = loss + self.opt.lambda_zero123 * F.mse_loss(images, refined_images)
294
+
295
+ if self.enable_dino:
296
+ loss_dino = self.guidance_dino.train_step(
297
+ images,
298
+ out["feature"].permute(2, 0, 1).contiguous(),
299
+ step_ratio=step_ratio if self.opt.anneal_timestep else None
300
+ )
301
+ loss = loss + self.opt.lambda_dino * loss_dino
302
+
303
+ # optimize step
304
+ loss.backward()
305
+ self.optimizer.step()
306
+ self.optimizer.zero_grad()
307
+
308
+ ender.record()
309
+ torch.cuda.synchronize()
310
+ t = starter.elapsed_time(ender)
311
+
312
+ self.need_update = True
313
+
314
+ def load_input(self, camera_path, order_path=None):
315
+ # load image
316
+ print(f'[INFO] load data from {camera_path}...')
317
+
318
+ if order_path is not None:
319
+ with open(order_path, 'r') as f:
320
+ indices = json.load(f)
321
+ else:
322
+ indices = None
323
+
324
+ with open(camera_path, 'r') as f:
325
+ data = json.load(f)
326
+
327
+ self.cam_params = {}
328
+ for k, v in data.items():
329
+ if indices is None:
330
+ self.cam_params[k] = data[k]
331
+ else:
332
+ if int(k) in indices or k in indices:
333
+ self.cam_params[k] = data[k]
334
+
335
+ if self.opt.all_views:
336
+ for k, v in self.cam_params.items(): v['flag'] = 1  # mark every view as usable, as in main_stage1
337
+
338
+ img_paths = [v["filepath"] for k, v in self.cam_params.items() if v["flag"]]
339
+ self.num_views = len(img_paths)
340
+ print(f"[INFO] Number of views: {self.num_views}")
341
+
342
+ for filepath in img_paths:
343
+ print(filepath)
344
+
345
+ images, masks = [], []
346
+
347
+ for i in range(len(img_paths)):
348
+ img = cv2.imread(img_paths[i], cv2.IMREAD_UNCHANGED)
349
+ if img.shape[-1] == 3:
350
+ if self.bg_remover is None:
351
+ self.bg_remover = rembg.new_session()
352
+ img = rembg.remove(img, session=self.bg_remover)
353
+
354
+ img = cv2.resize(img, (self.W, self.H), interpolation=cv2.INTER_AREA)
355
+ img = img.astype(np.float32) / 255.0
356
+
357
+ input_mask = img[..., 3:]
358
+ # white bg
359
+ input_img = img[..., :3] * input_mask + (1 - input_mask)
360
+ # bgr to rgb
361
+ input_img = input_img[..., ::-1].copy()
362
+
363
+ images.append(input_img), masks.append(input_mask)
364
+
365
+ images = np.stack(images, axis=0)
366
+ masks = np.stack(masks, axis=0)
367
+ self.input_img = images[:self.num_views]
368
+ self.input_mask = masks[:self.num_views]
369
+ self.all_input_images = images
370
+
371
+ def save_model(self):
372
+ os.makedirs(self.opt.outdir, exist_ok=True)
373
+
374
+ path = os.path.join(self.opt.outdir, self.opt.save_path + '.' + self.opt.mesh_format)
375
+ self.renderer.export_mesh(path)
376
+
377
+ print(f"[INFO] save model to {path}.")
378
+
379
+ # no gui mode
380
+ def train(self, iters=500):
381
+ if iters > 0:
382
+ self.prepare_train()
383
+ for i in tqdm.trange(iters):
384
+ self.train_step()
385
+ # save
386
+ self.save_model()
387
+
388
+
389
+ if __name__ == "__main__":
390
+ import argparse
391
+ from omegaconf import OmegaConf
392
+
393
+ parser = argparse.ArgumentParser()
394
+ parser.add_argument("--config", required=True, help="path to the yaml config file")
395
+ args, extras = parser.parse_known_args()
396
+
397
+ # override default config from cli
398
+ opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
399
+
400
+ # auto find mesh from stage 1
401
+ if opt.mesh is None:
402
+ default_path = os.path.join(opt.outdir, opt.save_path + '_mesh.' + opt.mesh_format)
403
+ if os.path.exists(default_path):
404
+ opt.mesh = default_path
405
+ else:
406
+ raise ValueError(f"Cannot find mesh from {default_path}, must specify --mesh explicitly!")
407
+
408
+ gui = GUI(opt)
409
+
410
+ gui.train(opt.iters_refine)
sparseags/mesh_utils/grid_put.py ADDED
@@ -0,0 +1,301 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+
5
+ def stride_from_shape(shape):
6
+ stride = [1]
7
+ for x in reversed(shape[1:]):
8
+ stride.append(stride[-1] * x)
9
+ return list(reversed(stride))
10
+
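+ # Worked example: stride_from_shape((H, W)) == [W, 1], so a 2D index (i, j)
+ # flattens to i * W + j, i.e. row-major (C-contiguous) order.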
11
+
12
+ def scatter_add_nd(input, indices, values):
13
+ # input: [..., C], D dimension + C channel
14
+ # indices: [N, D], long
15
+ # values: [N, C]
16
+
17
+ D = indices.shape[-1]
18
+ C = input.shape[-1]
19
+ size = input.shape[:-1]
20
+ stride = stride_from_shape(size)
21
+
22
+ assert len(size) == D
23
+
24
+ input = input.view(-1, C) # [HW, C]
25
+ flatten_indices = (indices * torch.tensor(stride, dtype=torch.long, device=indices.device)).sum(-1) # [N]
26
+
27
+ input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values)
28
+
29
+ return input.view(*size, C)
30
+
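+ # Hedged usage sketch (values are illustrative): accumulate per-pixel colors into an
+ # H x W x C buffer; duplicate indices are summed in place.
+ #   buf = torch.zeros(4, 8, 3)                           # [H, W, C]
+ #   idx = torch.tensor([[0, 0], [0, 0], [2, 5]])         # [N, 2], long
+ #   val = torch.ones(3, 3)                               # [N, C]
+ #   buf = scatter_add_nd(buf, idx, val)                  # buf[0, 0] == 2, buf[2, 5] == 1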
31
+
32
+ def scatter_add_nd_with_count(input, count, indices, values, weights=None):
33
+ # input: [..., C], D dimension + C channel
34
+ # count: [..., 1], D dimension
35
+ # indices: [N, D], long
36
+ # values: [N, C]
37
+
38
+ D = indices.shape[-1]
39
+ C = input.shape[-1]
40
+ size = input.shape[:-1]
41
+ stride = stride_from_shape(size)
42
+
43
+ assert len(size) == D
44
+
45
+ input = input.view(-1, C) # [HW, C]
46
+ count = count.view(-1, 1)
47
+
48
+ flatten_indices = (indices * torch.tensor(stride, dtype=torch.long, device=indices.device)).sum(-1) # [N]
49
+
50
+ if weights is None:
51
+ weights = torch.ones_like(values[..., :1])
52
+
53
+ input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values)
54
+ count.scatter_add_(0, flatten_indices.unsqueeze(1), weights)
55
+
56
+ return input.view(*size, C), count.view(*size, 1)
57
+
58
+ def nearest_grid_put_2d(H, W, coords, values, return_count=False):
59
+ # coords: [N, 2], float in [-1, 1]
60
+ # values: [N, C]
61
+
62
+ C = values.shape[-1]
63
+
64
+ indices = (coords * 0.5 + 0.5) * torch.tensor(
65
+ [H - 1, W - 1], dtype=torch.float32, device=coords.device
66
+ )
67
+ indices = indices.round().long() # [N, 2]
68
+
69
+ result = torch.zeros(H, W, C, device=values.device, dtype=values.dtype) # [H, W, C]
70
+ count = torch.zeros(H, W, 1, device=values.device, dtype=values.dtype) # [H, W, 1]
71
+ weights = torch.ones_like(values[..., :1]) # [N, 1]
72
+
73
+ result, count = scatter_add_nd_with_count(result, count, indices, values, weights)
74
+
75
+ if return_count:
76
+ return result, count
77
+
78
+ mask = (count.squeeze(-1) > 0)
79
+ result[mask] = result[mask] / count[mask].repeat(1, C)
80
+
81
+ return result
82
+
83
+
84
+ def linear_grid_put_2d(H, W, coords, values, return_count=False):
85
+ # coords: [N, 2], float in [-1, 1]
86
+ # values: [N, C]
87
+
88
+ C = values.shape[-1]
89
+
90
+ indices = (coords * 0.5 + 0.5) * torch.tensor(
91
+ [H - 1, W - 1], dtype=torch.float32, device=coords.device
92
+ )
93
+ indices_00 = indices.floor().long() # [N, 2]
94
+ indices_00[:, 0].clamp_(0, H - 2)
95
+ indices_00[:, 1].clamp_(0, W - 2)
96
+ indices_01 = indices_00 + torch.tensor(
97
+ [0, 1], dtype=torch.long, device=indices.device
98
+ )
99
+ indices_10 = indices_00 + torch.tensor(
100
+ [1, 0], dtype=torch.long, device=indices.device
101
+ )
102
+ indices_11 = indices_00 + torch.tensor(
103
+ [1, 1], dtype=torch.long, device=indices.device
104
+ )
105
+
106
+ h = indices[..., 0] - indices_00[..., 0].float()
107
+ w = indices[..., 1] - indices_00[..., 1].float()
108
+ w_00 = (1 - h) * (1 - w)
109
+ w_01 = (1 - h) * w
110
+ w_10 = h * (1 - w)
111
+ w_11 = h * w
112
+
113
+ result = torch.zeros(H, W, C, device=values.device, dtype=values.dtype) # [H, W, C]
114
+ count = torch.zeros(H, W, 1, device=values.device, dtype=values.dtype) # [H, W, 1]
115
+ weights = torch.ones_like(values[..., :1]) # [N, 1]
116
+
117
+ result, count = scatter_add_nd_with_count(result, count, indices_00, values * w_00.unsqueeze(1), weights* w_00.unsqueeze(1))
118
+ result, count = scatter_add_nd_with_count(result, count, indices_01, values * w_01.unsqueeze(1), weights* w_01.unsqueeze(1))
119
+ result, count = scatter_add_nd_with_count(result, count, indices_10, values * w_10.unsqueeze(1), weights* w_10.unsqueeze(1))
120
+ result, count = scatter_add_nd_with_count(result, count, indices_11, values * w_11.unsqueeze(1), weights* w_11.unsqueeze(1))
121
+
122
+ if return_count:
123
+ return result, count
124
+
125
+ mask = (count.squeeze(-1) > 0)
126
+ result[mask] = result[mask] / count[mask].repeat(1, C)
127
+
128
+ return result
129
+
130
+ def mipmap_linear_grid_put_2d(H, W, coords, values, min_resolution=32, return_count=False):
131
+ # coords: [N, 2], float in [-1, 1]
132
+ # values: [N, C]
133
+
134
+ C = values.shape[-1]
135
+
136
+ result = torch.zeros(H, W, C, device=values.device, dtype=values.dtype) # [H, W, C]
137
+ count = torch.zeros(H, W, 1, device=values.device, dtype=values.dtype) # [H, W, 1]
138
+
139
+ cur_H, cur_W = H, W
140
+
141
+ while min(cur_H, cur_W) > min_resolution:
142
+
143
+ # try to fill the holes
144
+ mask = (count.squeeze(-1) == 0)
145
+ if not mask.any():
146
+ break
147
+
148
+ cur_result, cur_count = linear_grid_put_2d(cur_H, cur_W, coords, values, return_count=True)
149
+ result[mask] = result[mask] + F.interpolate(cur_result.permute(2,0,1).unsqueeze(0).contiguous(), (H, W), mode='bilinear', align_corners=False).squeeze(0).permute(1,2,0).contiguous()[mask]
150
+ count[mask] = count[mask] + F.interpolate(cur_count.view(1, 1, cur_H, cur_W), (H, W), mode='bilinear', align_corners=False).view(H, W, 1)[mask]
151
+ cur_H //= 2
152
+ cur_W //= 2
153
+
154
+ if return_count:
155
+ return result, count
156
+
157
+ mask = (count.squeeze(-1) > 0)
158
+ result[mask] = result[mask] / count[mask].repeat(1, C)
159
+
160
+ return result
161
+
162
+ def nearest_grid_put_3d(H, W, D, coords, values, return_count=False):
163
+ # coords: [N, 3], float in [-1, 1]
164
+ # values: [N, C]
165
+
166
+ C = values.shape[-1]
167
+
168
+ indices = (coords * 0.5 + 0.5) * torch.tensor(
169
+ [H - 1, W - 1, D - 1], dtype=torch.float32, device=coords.device
170
+ )
171
+ indices = indices.round().long() # [N, 3]
172
+
173
+ result = torch.zeros(H, W, D, C, device=values.device, dtype=values.dtype) # [H, W, D, C]
174
+ count = torch.zeros(H, W, D, 1, device=values.device, dtype=values.dtype) # [H, W, D, 1]
175
+ weights = torch.ones_like(values[..., :1]) # [N, 1]
176
+
177
+ result, count = scatter_add_nd_with_count(result, count, indices, values, weights)
178
+
179
+ if return_count:
180
+ return result, count
181
+
182
+ mask = (count.squeeze(-1) > 0)
183
+ result[mask] = result[mask] / count[mask].repeat(1, C)
184
+
185
+ return result
186
+
187
+
188
+ def linear_grid_put_3d(H, W, D, coords, values, return_count=False):
189
+ # coords: [N, 3], float in [-1, 1]
190
+ # values: [N, C]
191
+
192
+ C = values.shape[-1]
193
+
194
+ indices = (coords * 0.5 + 0.5) * torch.tensor(
195
+ [H - 1, W - 1, D - 1], dtype=torch.float32, device=coords.device
196
+ )
197
+ indices_000 = indices.floor().long() # [N, 3]
198
+ indices_000[:, 0].clamp_(0, H - 2)
199
+ indices_000[:, 1].clamp_(0, W - 2)
200
+ indices_000[:, 2].clamp_(0, D - 2)
201
+
202
+ indices_001 = indices_000 + torch.tensor([0, 0, 1], dtype=torch.long, device=indices.device)
203
+ indices_010 = indices_000 + torch.tensor([0, 1, 0], dtype=torch.long, device=indices.device)
204
+ indices_011 = indices_000 + torch.tensor([0, 1, 1], dtype=torch.long, device=indices.device)
205
+ indices_100 = indices_000 + torch.tensor([1, 0, 0], dtype=torch.long, device=indices.device)
206
+ indices_101 = indices_000 + torch.tensor([1, 0, 1], dtype=torch.long, device=indices.device)
207
+ indices_110 = indices_000 + torch.tensor([1, 1, 0], dtype=torch.long, device=indices.device)
208
+ indices_111 = indices_000 + torch.tensor([1, 1, 1], dtype=torch.long, device=indices.device)
209
+
210
+ h = indices[..., 0] - indices_000[..., 0].float()
211
+ w = indices[..., 1] - indices_000[..., 1].float()
212
+ d = indices[..., 2] - indices_000[..., 2].float()
213
+
214
+ w_000 = (1 - h) * (1 - w) * (1 - d)
215
+ w_001 = (1 - h) * w * (1 - d)
216
+ w_010 = h * (1 - w) * (1 - d)
217
+ w_011 = h * w * (1 - d)
218
+ w_100 = (1 - h) * (1 - w) * d
219
+ w_101 = (1 - h) * w * d
220
+ w_110 = h * (1 - w) * d
221
+ w_111 = h * w * d
222
+
223
+ result = torch.zeros(H, W, D, C, device=values.device, dtype=values.dtype) # [H, W, D, C]
224
+ count = torch.zeros(H, W, D, 1, device=values.device, dtype=values.dtype) # [H, W, D, 1]
225
+ weights = torch.ones_like(values[..., :1]) # [N, 1]
226
+
227
+ result, count = scatter_add_nd_with_count(result, count, indices_000, values * w_000.unsqueeze(1), weights * w_000.unsqueeze(1))
228
+ result, count = scatter_add_nd_with_count(result, count, indices_001, values * w_001.unsqueeze(1), weights * w_001.unsqueeze(1))
229
+ result, count = scatter_add_nd_with_count(result, count, indices_010, values * w_010.unsqueeze(1), weights * w_010.unsqueeze(1))
230
+ result, count = scatter_add_nd_with_count(result, count, indices_011, values * w_011.unsqueeze(1), weights * w_011.unsqueeze(1))
231
+ result, count = scatter_add_nd_with_count(result, count, indices_100, values * w_100.unsqueeze(1), weights * w_100.unsqueeze(1))
232
+ result, count = scatter_add_nd_with_count(result, count, indices_101, values * w_101.unsqueeze(1), weights * w_101.unsqueeze(1))
233
+ result, count = scatter_add_nd_with_count(result, count, indices_110, values * w_110.unsqueeze(1), weights * w_110.unsqueeze(1))
234
+ result, count = scatter_add_nd_with_count(result, count, indices_111, values * w_111.unsqueeze(1), weights * w_111.unsqueeze(1))
235
+
236
+ if return_count:
237
+ return result, count
238
+
239
+ mask = (count.squeeze(-1) > 0)
240
+ result[mask] = result[mask] / count[mask].repeat(1, C)
241
+
242
+ return result
243
+
244
+ def mipmap_linear_grid_put_3d(H, W, D, coords, values, min_resolution=32, return_count=False):
245
+ # coords: [N, 3], float in [-1, 1]
246
+ # values: [N, C]
247
+
248
+ C = values.shape[-1]
249
+
250
+ result = torch.zeros(H, W, D, C, device=values.device, dtype=values.dtype) # [H, W, D, C]
251
+ count = torch.zeros(H, W, D, 1, device=values.device, dtype=values.dtype) # [H, W, D, 1]
252
+ cur_H, cur_W, cur_D = H, W, D
253
+
254
+ while min(min(cur_H, cur_W), cur_D) > min_resolution:
255
+
256
+ # try to fill the holes
257
+ mask = (count.squeeze(-1) == 0)
258
+ if not mask.any():
259
+ break
260
+
261
+ cur_result, cur_count = linear_grid_put_3d(cur_H, cur_W, cur_D, coords, values, return_count=True)
262
+ result[mask] = result[mask] + F.interpolate(cur_result.permute(3,0,1,2).unsqueeze(0).contiguous(), (H, W, D), mode='trilinear', align_corners=False).squeeze(0).permute(1,2,3,0).contiguous()[mask]
263
+ count[mask] = count[mask] + F.interpolate(cur_count.view(1, 1, cur_H, cur_W, cur_D), (H, W, D), mode='trilinear', align_corners=False).view(H, W, D, 1)[mask]
264
+ cur_H //= 2
265
+ cur_W //= 2
266
+ cur_D //= 2
267
+
268
+ if return_count:
269
+ return result, count
270
+
271
+ mask = (count.squeeze(-1) > 0)
272
+ result[mask] = result[mask] / count[mask].repeat(1, C)
273
+
274
+ return result
275
+
276
+
277
+ def grid_put(shape, coords, values, mode='linear-mipmap', min_resolution=32, return_raw=False):
278
+ # shape: [D], list/tuple
279
+ # coords: [N, D], float in [-1, 1]
280
+ # values: [N, C]
281
+
282
+ D = len(shape)
283
+ assert D in [2, 3], f'only support D == 2 or 3, but got D == {D}'
284
+
285
+ if mode == 'nearest':
286
+ if D == 2:
287
+ return nearest_grid_put_2d(*shape, coords, values, return_raw)
288
+ else:
289
+ return nearest_grid_put_3d(*shape, coords, values, return_raw)
290
+ elif mode == 'linear':
291
+ if D == 2:
292
+ return linear_grid_put_2d(*shape, coords, values, return_raw)
293
+ else:
294
+ return linear_grid_put_3d(*shape, coords, values, return_raw)
295
+ elif mode == 'linear-mipmap':
296
+ if D == 2:
297
+ return mipmap_linear_grid_put_2d(*shape, coords, values, min_resolution, return_raw)
298
+ else:
299
+ return mipmap_linear_grid_put_3d(*shape, coords, values, min_resolution, return_raw)
300
+ else:
301
+ raise NotImplementedError(f"got mode {mode}")
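A minimal usage sketch of the grid_put dispatcher defined above (illustrative only; the point count, resolution, and random inputs are assumptions, and grid_put is assumed to be in scope from this module):

import torch

# Splat N random colors into a 256x256 grid; the first coordinate maps to the H axis.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
N, C, H, W = 4096, 3, 256, 256
coords = torch.rand(N, 2, device=device) * 2 - 1   # [N, 2], float in [-1, 1]
values = torch.rand(N, C, device=device)           # [N, C]
tex = grid_put((H, W), coords, values, mode='linear-mipmap', min_resolution=32)
print(tex.shape)  # torch.Size([256, 256, 3])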
sparseags/mesh_utils/mesh.py ADDED
@@ -0,0 +1,638 @@
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import trimesh
5
+ import numpy as np
6
+
7
+
8
+ def dot(x, y):
9
+ return torch.sum(x * y, -1, keepdim=True)
10
+
11
+
12
+ def length(x, eps=1e-20):
13
+ return torch.sqrt(torch.clamp(dot(x, x), min=eps))
14
+
15
+
16
+ def safe_normalize(x, eps=1e-20):
17
+ return x / length(x, eps)
18
+
19
+ class Mesh:
20
+ def __init__(
21
+ self,
22
+ v=None,
23
+ f=None,
24
+ vn=None,
25
+ fn=None,
26
+ vt=None,
27
+ ft=None,
28
+ albedo=None,
29
+ vc=None, # vertex color
30
+ device=None,
31
+ ):
32
+ self.device = device
33
+ self.v = v
34
+ self.vn = vn
35
+ self.vt = vt
36
+ self.f = f
37
+ self.fn = fn
38
+ self.ft = ft
39
+ # only support a single albedo
40
+ self.albedo = albedo
41
+ # support vertex color if there is no albedo
42
+ self.vc = vc
43
+
44
+ self.ori_center = 0
45
+ self.ori_scale = 1
46
+
47
+ @classmethod
48
+ def load(cls, path=None, resize=True, renormal=True, retex=False, front_dir='+z', **kwargs):
49
+ # assume init with kwargs
50
+ if path is None:
51
+ mesh = cls(**kwargs)
52
+ # obj supports face uv
53
+ elif path.endswith(".obj"):
54
+ mesh = cls.load_obj(path, **kwargs)
55
+ # trimesh only supports vertex uv, but can load more formats
56
+ else:
57
+ mesh = cls.load_trimesh(path, **kwargs)
58
+
59
+ print(f"[Mesh loading] v: {mesh.v.shape}, f: {mesh.f.shape}")
60
+ # auto-normalize
61
+ if resize:
62
+ mesh.auto_size()
63
+ # auto-fix normal
64
+ if renormal or mesh.vn is None:
65
+ mesh.auto_normal()
66
+ print(f"[Mesh loading] vn: {mesh.vn.shape}, fn: {mesh.fn.shape}")
67
+ # auto-fix texcoords
68
+ if retex or (mesh.albedo is not None and mesh.vt is None):
69
+ mesh.auto_uv(cache_path=path)
70
+ print(f"[Mesh loading] vt: {mesh.vt.shape}, ft: {mesh.ft.shape}")
71
+
72
+ # rotate front dir to +z
73
+ if front_dir != "+z":
74
+ # axis switch
75
+ if "-z" in front_dir:
76
+ T = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, -1]], device=mesh.device, dtype=torch.float32)
77
+ elif "+x" in front_dir:
78
+ T = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]], device=mesh.device, dtype=torch.float32)
79
+ elif "-x" in front_dir:
80
+ T = torch.tensor([[0, 0, -1], [0, 1, 0], [1, 0, 0]], device=mesh.device, dtype=torch.float32)
81
+ elif "+y" in front_dir:
82
+ T = torch.tensor([[1, 0, 0], [0, 0, 1], [0, 1, 0]], device=mesh.device, dtype=torch.float32)
83
+ elif "-y" in front_dir:
84
+ T = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], device=mesh.device, dtype=torch.float32)
85
+ else:
86
+ T = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
87
+ # rotation (how many 90 degrees)
88
+ if '1' in front_dir:
89
+ T @= torch.tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
90
+ elif '2' in front_dir:
91
+ T @= torch.tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
92
+ elif '3' in front_dir:
93
+ T @= torch.tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
94
+ mesh.v @= T
95
+ mesh.vn @= T
96
+
97
+ return mesh
98
+
99
+ # load from obj file
100
+ @classmethod
101
+ def load_obj(cls, path, albedo_path=None, device=None, enable_dino=False):
102
+ assert os.path.splitext(path)[-1] == ".obj"
103
+
104
+ mesh = cls()
105
+
106
+ # device
107
+ if device is None:
108
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
109
+
110
+ mesh.device = device
111
+
112
+ # load obj
113
+ with open(path, "r") as f:
114
+ lines = f.readlines()
115
+
116
+ def parse_f_v(fv):
117
+ # pass in a vertex term of a face, return {v, vt, vn} (-1 if not provided)
118
+ # supported forms:
119
+ # f v1 v2 v3
120
+ # f v1/vt1 v2/vt2 v3/vt3
121
+ # f v1/vt1/vn1 v2/vt2/vn2 v3/vt3/vn3
122
+ # f v1//vn1 v2//vn2 v3//vn3
123
+ xs = [int(x) - 1 if x != "" else -1 for x in fv.split("/")]
124
+ xs.extend([-1] * (3 - len(xs)))
125
+ return xs[0], xs[1], xs[2]
126
+
127
+ # NOTE: we ignore usemtl, and assume the mesh ONLY uses one material (first in mtl)
128
+ vertices, texcoords, normals = [], [], []
129
+ faces, tfaces, nfaces = [], [], []
130
+ mtl_path = None
+ feature_path = None  # guard: map_Ft may be absent from the .mtl (avoids a NameError below)
131
+
132
+ for line in lines:
133
+ split_line = line.split()
134
+ # empty line
135
+ if len(split_line) == 0:
136
+ continue
137
+ prefix = split_line[0].lower()
138
+ # mtllib
139
+ if prefix == "mtllib":
140
+ mtl_path = split_line[1]
141
+ # usemtl
142
+ elif prefix == "usemtl":
143
+ pass # ignored
144
+ # v/vn/vt
145
+ elif prefix == "v":
146
+ vertices.append([float(v) for v in split_line[1:]])
147
+ elif prefix == "vn":
148
+ normals.append([float(v) for v in split_line[1:]])
149
+ elif prefix == "vt":
150
+ val = [float(v) for v in split_line[1:]]
151
+ texcoords.append([val[0], 1.0 - val[1]])
152
+ elif prefix == "f":
153
+ vs = split_line[1:]
154
+ nv = len(vs)
155
+ v0, t0, n0 = parse_f_v(vs[0])
156
+ for i in range(nv - 2): # triangulate (assume vertices are ordered)
157
+ v1, t1, n1 = parse_f_v(vs[i + 1])
158
+ v2, t2, n2 = parse_f_v(vs[i + 2])
159
+ faces.append([v0, v1, v2])
160
+ tfaces.append([t0, t1, t2])
161
+ nfaces.append([n0, n1, n2])
162
+
163
+ mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device)
164
+ mesh.vt = (
165
+ torch.tensor(texcoords, dtype=torch.float32, device=device)
166
+ if len(texcoords) > 0
167
+ else None
168
+ )
169
+ mesh.vn = (
170
+ torch.tensor(normals, dtype=torch.float32, device=device)
171
+ if len(normals) > 0
172
+ else None
173
+ )
174
+
175
+ mesh.f = torch.tensor(faces, dtype=torch.int32, device=device)
176
+ mesh.ft = (
177
+ torch.tensor(tfaces, dtype=torch.int32, device=device)
178
+ if len(texcoords) > 0
179
+ else None
180
+ )
181
+ mesh.fn = (
182
+ torch.tensor(nfaces, dtype=torch.int32, device=device)
183
+ if len(normals) > 0
184
+ else None
185
+ )
186
+
187
+ # see if there is vertex color
188
+ use_vertex_color = False
189
+ if mesh.v.shape[1] == 6:
190
+ use_vertex_color = True
191
+ mesh.vc = mesh.v[:, 3:]
192
+ mesh.v = mesh.v[:, :3]
193
+ print(f"[load_obj] use vertex color: {mesh.vc.shape}")
194
+
195
+ # try to load texture image
196
+ if not use_vertex_color:
197
+ # try to retrieve mtl file
198
+ mtl_path_candidates = []
199
+ if mtl_path is not None:
200
+ mtl_path_candidates.append(mtl_path)
201
+ mtl_path_candidates.append(os.path.join(os.path.dirname(path), mtl_path))
202
+ mtl_path_candidates.append(path.replace(".obj", ".mtl"))
203
+
204
+ mtl_path = None
205
+ for candidate in mtl_path_candidates:
206
+ if os.path.exists(candidate):
207
+ mtl_path = candidate
208
+ break
209
+
210
+ # if albedo_path is not provided, try retrieve it from mtl
211
+ if mtl_path is not None and albedo_path is None:
212
+ with open(mtl_path, "r") as f:
213
+ lines = f.readlines()
214
+ for line in lines:
215
+ split_line = line.split()
216
+ # empty line
217
+ if len(split_line) == 0:
218
+ continue
219
+ prefix = split_line[0]
220
+ # NOTE: simply use the first map_Kd as albedo!
221
+ if "map_Kd" in prefix:
222
+ albedo_path = os.path.join(os.path.dirname(path), split_line[1])
223
+ print(f"[load_obj] use texture from: {albedo_path}")
224
+ # break
225
+ if "map_Ft" in prefix:
226
+ feature_path = os.path.join(os.path.dirname(path), split_line[1])
227
+ print(f"[load_obj] use feature from: {feature_path}")
228
+ break
229
+
230
+ # still not found albedo_path, or the path doesn't exist
231
+ if albedo_path is None or not os.path.exists(albedo_path):
232
+ # init an empty texture
233
+ print(f"[load_obj] init empty albedo!")
234
+ # albedo = np.random.rand(1024, 1024, 3).astype(np.float32)
235
+ albedo = np.ones((1024, 1024, 3), dtype=np.float32) * np.array([0.5, 0.5, 0.5]) # default color
236
+ else:
237
+ albedo = cv2.imread(albedo_path, cv2.IMREAD_UNCHANGED)
238
+ albedo = cv2.cvtColor(albedo, cv2.COLOR_BGR2RGB)
239
+ albedo = albedo.astype(np.float32) / 255
240
+ print(f"[load_obj] load texture: {albedo.shape}")
241
+
242
+ # import matplotlib.pyplot as plt
243
+ # plt.imshow(albedo)
244
+ # plt.show()
245
+ if enable_dino and feature_path is not None and os.path.exists(feature_path):
246
+ feature = torch.load(feature_path).to(device)
247
+ mesh.feature = feature
248
+ print(f"[load_obj] load feature: {feature.shape}")
249
+
250
+ mesh.albedo = torch.tensor(albedo, dtype=torch.float32, device=device)
251
+
252
+ return mesh
253
+
254
+ @classmethod
255
+ def load_trimesh(cls, path, device=None, enable_dino=False):
256
+ mesh = cls()
257
+
258
+ # device
259
+ if device is None:
260
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
261
+
262
+ mesh.device = device
263
+
264
+ # use trimesh to load ply/glb, assuming the scene only contains a single mesh...
265
+ _data = trimesh.load(path)
266
+ if isinstance(_data, trimesh.Scene):
267
+ if len(_data.geometry) == 1:
268
+ _mesh = list(_data.geometry.values())[0]
269
+ else:
270
+ # manual concat, will lose texture
271
+ _concat = []
272
+ for g in _data.geometry.values():
273
+ if isinstance(g, trimesh.Trimesh):
274
+ _concat.append(g)
275
+ _mesh = trimesh.util.concatenate(_concat)
276
+ else:
277
+ _mesh = _data
278
+
279
+ if _mesh.visual.kind == 'vertex':
280
+ vertex_colors = _mesh.visual.vertex_colors
281
+ vertex_colors = np.array(vertex_colors[..., :3]).astype(np.float32) / 255
282
+ mesh.vc = torch.tensor(vertex_colors, dtype=torch.float32, device=device)
283
+ print(f"[load_trimesh] use vertex color: {mesh.vc.shape}")
284
+ elif _mesh.visual.kind == 'texture':
285
+ _material = _mesh.visual.material
286
+ if isinstance(_material, trimesh.visual.material.PBRMaterial):
287
+ texture = np.array(_material.baseColorTexture).astype(np.float32) / 255
288
+ elif isinstance(_material, trimesh.visual.material.SimpleMaterial):
289
+ texture = np.array(_material.to_pbr().baseColorTexture).astype(np.float32) / 255
290
+ else:
291
+ raise NotImplementedError(f"material type {type(_material)} not supported!")
292
+ mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device)
293
+ print(f"[load_trimesh] load texture: {texture.shape}")
294
+ else:
295
+ texture = np.ones((1024, 1024, 3), dtype=np.float32) * np.array([0.5, 0.5, 0.5])
296
+ mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device)
297
+ print(f"[load_trimesh] failed to load texture.")
298
+
299
+ vertices = _mesh.vertices
300
+
301
+ try:
302
+ texcoords = _mesh.visual.uv
303
+ texcoords[:, 1] = 1 - texcoords[:, 1]
304
+ except Exception as e:
305
+ texcoords = None
306
+
307
+ try:
308
+ normals = _mesh.vertex_normals
309
+ except Exception as e:
310
+ normals = None
311
+
312
+ # trimesh only supports vertex uv...
313
+ faces = tfaces = nfaces = _mesh.faces
314
+
315
+ mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device)
316
+ mesh.vt = (
317
+ torch.tensor(texcoords, dtype=torch.float32, device=device)
318
+ if texcoords is not None
319
+ else None
320
+ )
321
+ mesh.vn = (
322
+ torch.tensor(normals, dtype=torch.float32, device=device)
323
+ if normals is not None
324
+ else None
325
+ )
326
+
327
+ mesh.f = torch.tensor(faces, dtype=torch.int32, device=device)
328
+ mesh.ft = (
329
+ torch.tensor(tfaces, dtype=torch.int32, device=device)
330
+ if texcoords is not None
331
+ else None
332
+ )
333
+ mesh.fn = (
334
+ torch.tensor(nfaces, dtype=torch.int32, device=device)
335
+ if normals is not None
336
+ else None
337
+ )
338
+
339
+ return mesh
340
+
341
+ # aabb
342
+ def aabb(self):
343
+ return torch.min(self.v, dim=0).values, torch.max(self.v, dim=0).values
344
+
345
+ # unit size
346
+ @torch.no_grad()
347
+ def auto_size(self):
348
+ vmin, vmax = self.aabb()
349
+ self.ori_center = (vmax + vmin) / 2
350
+ self.ori_scale = 1.2 / torch.max(vmax - vmin).item()
351
+ self.v = (self.v - self.ori_center) * self.ori_scale
352
+
353
+ def auto_normal(self):
354
+ i0, i1, i2 = self.f[:, 0].long(), self.f[:, 1].long(), self.f[:, 2].long()
355
+ v0, v1, v2 = self.v[i0, :], self.v[i1, :], self.v[i2, :]
356
+
357
+ face_normals = torch.cross(v1 - v0, v2 - v0)
358
+
359
+ # Splat face normals to vertices
360
+ vn = torch.zeros_like(self.v)
361
+ vn.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
362
+ vn.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
363
+ vn.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)
364
+
365
+ # Normalize, replace zero (degenerated) normals with some default value
366
+ vn = torch.where(
367
+ dot(vn, vn) > 1e-20,
368
+ vn,
369
+ torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device),
370
+ )
371
+ vn = safe_normalize(vn)
372
+
373
+ self.vn = vn
374
+ self.fn = self.f
375
+
376
+ def auto_uv(self, cache_path=None, vmap=True):
377
+ # try to load cache
378
+ if cache_path is not None:
379
+ cache_path = os.path.splitext(cache_path)[0] + "_uv.npz"
380
+ if cache_path is not None and os.path.exists(cache_path):
381
+ data = np.load(cache_path)
382
+ vt_np, ft_np, vmapping = data["vt"], data["ft"], data["vmapping"]
383
+ else:
384
+ import xatlas
385
+
386
+ v_np = self.v.detach().cpu().numpy()
387
+ f_np = self.f.detach().int().cpu().numpy()
388
+ atlas = xatlas.Atlas()
389
+ atlas.add_mesh(v_np, f_np)
390
+ chart_options = xatlas.ChartOptions()
391
+ # chart_options.max_iterations = 4
392
+ atlas.generate(chart_options=chart_options)
393
+ vmapping, ft_np, vt_np = atlas[0] # [N], [M, 3], [N, 2]
394
+
395
+ # save to cache
396
+ if cache_path is not None:
397
+ np.savez(cache_path, vt=vt_np, ft=ft_np, vmapping=vmapping)
398
+
399
+ vt = torch.from_numpy(vt_np.astype(np.float32)).to(self.device)
400
+ ft = torch.from_numpy(ft_np.astype(np.int32)).to(self.device)
401
+ self.vt = vt
402
+ self.ft = ft
403
+
404
+ if vmap:
405
+ # remap v/f to vt/ft, so each v corresponds to a unique vt. (necessary for gltf)
406
+ vmapping = torch.from_numpy(vmapping.astype(np.int64)).long().to(self.device)
407
+ self.align_v_to_vt(vmapping)
408
+
409
+ def align_v_to_vt(self, vmapping=None):
410
+ # remap v/f and vn/fn to vt/ft.
411
+ if vmapping is None:
412
+ ft = self.ft.view(-1).long()
413
+ f = self.f.view(-1).long()
414
+ vmapping = torch.zeros(self.vt.shape[0], dtype=torch.long, device=self.device)
415
+ vmapping[ft] = f # scatter, randomly choose one if index is not unique
416
+
417
+ self.v = self.v[vmapping]
418
+ self.f = self.ft
419
+ # assume fn == f
420
+ if self.vn is not None:
421
+ self.vn = self.vn[vmapping]
422
+ self.fn = self.ft
423
+
424
+ def to(self, device):
425
+ self.device = device
426
+ for name in ["v", "f", "vn", "fn", "vt", "ft", "albedo"]:
427
+ tensor = getattr(self, name)
428
+ if tensor is not None:
429
+ setattr(self, name, tensor.to(device))
430
+ return self
431
+
432
+ def write(self, path, enable_dino=False):
433
+ if path.endswith(".ply"):
434
+ self.write_ply(path)
435
+ elif path.endswith(".obj"):
436
+ self.write_obj(path, enable_dino)
437
+ elif path.endswith(".glb") or path.endswith(".gltf"):
438
+ self.write_glb(path)
439
+ else:
440
+ raise NotImplementedError(f"format {path} not supported!")
441
+
442
+ # write to ply file (only geom)
443
+ def write_ply(self, path):
444
+
445
+ v_np = self.v.detach().cpu().numpy()
446
+ f_np = self.f.detach().cpu().numpy()
447
+
448
+ _mesh = trimesh.Trimesh(vertices=v_np, faces=f_np)
449
+ _mesh.export(path)
450
+
451
+ # write to gltf/glb file (geom + texture)
452
+ def write_glb(self, path):
453
+
454
+ assert self.vn is not None and self.vt is not None # should be improved to support export without texture...
455
+
456
+ # assert self.v.shape[0] == self.vn.shape[0] and self.v.shape[0] == self.vt.shape[0]
457
+ if self.v.shape[0] != self.vt.shape[0]:
458
+ self.align_v_to_vt()
459
+
460
+ # assume f == fn == ft
461
+
462
+ import pygltflib
463
+
464
+ f_np = self.f.detach().cpu().numpy().astype(np.uint32)
465
+ v_np = self.v.detach().cpu().numpy().astype(np.float32)
466
+ # vn_np = self.vn.detach().cpu().numpy().astype(np.float32)
467
+ vt_np = self.vt.detach().cpu().numpy().astype(np.float32)
468
+
469
+ albedo = self.albedo.detach().cpu().numpy()
470
+ albedo = (albedo * 255).astype(np.uint8)
471
+ albedo = cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR)
472
+
473
+ f_np_blob = f_np.flatten().tobytes()
474
+ v_np_blob = v_np.tobytes()
475
+ # vn_np_blob = vn_np.tobytes()
476
+ vt_np_blob = vt_np.tobytes()
477
+ albedo_blob = cv2.imencode('.png', albedo)[1].tobytes()
478
+
479
+ gltf = pygltflib.GLTF2(
480
+ scene=0,
481
+ scenes=[pygltflib.Scene(nodes=[0])],
482
+ nodes=[pygltflib.Node(mesh=0)],
483
+ meshes=[pygltflib.Mesh(primitives=[
484
+ pygltflib.Primitive(
485
+ # indices to accessors (0 is triangles)
486
+ attributes=pygltflib.Attributes(
487
+ POSITION=1, TEXCOORD_0=2,
488
+ ),
489
+ indices=0, material=0,
490
+ )
491
+ ])],
492
+ materials=[
493
+ pygltflib.Material(
494
+ pbrMetallicRoughness=pygltflib.PbrMetallicRoughness(
495
+ baseColorTexture=pygltflib.TextureInfo(index=0, texCoord=0),
496
+ metallicFactor=0.0,
497
+ roughnessFactor=1.0,
498
+ ),
499
+ alphaCutoff=0,
500
+ doubleSided=True,
501
+ )
502
+ ],
503
+ textures=[
504
+ pygltflib.Texture(sampler=0, source=0),
505
+ ],
506
+ samplers=[
507
+ pygltflib.Sampler(magFilter=pygltflib.LINEAR, minFilter=pygltflib.LINEAR_MIPMAP_LINEAR, wrapS=pygltflib.REPEAT, wrapT=pygltflib.REPEAT),
508
+ ],
509
+ images=[
510
+ # use embedded (buffer) image
511
+ pygltflib.Image(bufferView=3, mimeType="image/png"),
512
+ ],
513
+ buffers=[
514
+ pygltflib.Buffer(byteLength=len(f_np_blob) + len(v_np_blob) + len(vt_np_blob) + len(albedo_blob))
515
+ ],
516
+ # buffer view (based on dtype)
517
+ bufferViews=[
518
+ # triangles; as flatten (element) array
519
+ pygltflib.BufferView(
520
+ buffer=0,
521
+ byteLength=len(f_np_blob),
522
+ target=pygltflib.ELEMENT_ARRAY_BUFFER, # GL_ELEMENT_ARRAY_BUFFER (34963)
523
+ ),
524
+ # positions; as vec3 array
525
+ pygltflib.BufferView(
526
+ buffer=0,
527
+ byteOffset=len(f_np_blob),
528
+ byteLength=len(v_np_blob),
529
+ byteStride=12, # vec3
530
+ target=pygltflib.ARRAY_BUFFER, # GL_ARRAY_BUFFER (34962)
531
+ ),
532
+ # texcoords; as vec2 array
533
+ pygltflib.BufferView(
534
+ buffer=0,
535
+ byteOffset=len(f_np_blob) + len(v_np_blob),
536
+ byteLength=len(vt_np_blob),
537
+ byteStride=8, # vec2
538
+ target=pygltflib.ARRAY_BUFFER,
539
+ ),
540
+ # texture; as none target
541
+ pygltflib.BufferView(
542
+ buffer=0,
543
+ byteOffset=len(f_np_blob) + len(v_np_blob) + len(vt_np_blob),
544
+ byteLength=len(albedo_blob),
545
+ ),
546
+ ],
547
+ accessors=[
548
+ # 0 = triangles
549
+ pygltflib.Accessor(
550
+ bufferView=0,
551
+ componentType=pygltflib.UNSIGNED_INT, # GL_UNSIGNED_INT (5125)
552
+ count=f_np.size,
553
+ type=pygltflib.SCALAR,
554
+ max=[int(f_np.max())],
555
+ min=[int(f_np.min())],
556
+ ),
557
+ # 1 = positions
558
+ pygltflib.Accessor(
559
+ bufferView=1,
560
+ componentType=pygltflib.FLOAT, # GL_FLOAT (5126)
561
+ count=len(v_np),
562
+ type=pygltflib.VEC3,
563
+ max=v_np.max(axis=0).tolist(),
564
+ min=v_np.min(axis=0).tolist(),
565
+ ),
566
+ # 2 = texcoords
567
+ pygltflib.Accessor(
568
+ bufferView=2,
569
+ componentType=pygltflib.FLOAT,
570
+ count=len(vt_np),
571
+ type=pygltflib.VEC2,
572
+ max=vt_np.max(axis=0).tolist(),
573
+ min=vt_np.min(axis=0).tolist(),
574
+ ),
575
+ ],
576
+ )
577
+
578
+ # set actual data
579
+ gltf.set_binary_blob(f_np_blob + v_np_blob + vt_np_blob + albedo_blob)
580
+
581
+ # glb = b"".join(gltf.save_to_bytes())
582
+ gltf.save(path)
583
+
584
+ # write to obj file (geom + texture)
585
+ def write_obj(self, path, enable_dino=False):
586
+
587
+ mtl_path = path.replace(".obj", ".mtl")
588
+ albedo_path = path.replace(".obj", "_albedo.png")
589
+ feature_path = path.replace(".obj", "_feature.pt")
590
+
591
+ v_np = self.v.detach().cpu().numpy()
592
+ vt_np = self.vt.detach().cpu().numpy() if self.vt is not None else None
593
+ vn_np = self.vn.detach().cpu().numpy() if self.vn is not None else None
594
+ f_np = self.f.detach().cpu().numpy()
595
+ ft_np = self.ft.detach().cpu().numpy() if self.ft is not None else None
596
+ fn_np = self.fn.detach().cpu().numpy() if self.fn is not None else None
597
+
598
+ with open(path, "w") as fp:
599
+ fp.write(f"mtllib {os.path.basename(mtl_path)} \n")
600
+
601
+ for v in v_np:
602
+ fp.write(f"v {v[0]} {v[1]} {v[2]} \n")
603
+
604
+ if vt_np is not None:
605
+ for v in vt_np:
606
+ fp.write(f"vt {v[0]} {1 - v[1]} \n")
607
+
608
+ if vn_np is not None:
609
+ for v in vn_np:
610
+ fp.write(f"vn {v[0]} {v[1]} {v[2]} \n")
611
+
612
+ fp.write(f"usemtl defaultMat \n")
613
+ for i in range(len(f_np)):
614
+ fp.write(
615
+ f'f {f_np[i, 0] + 1}/{ft_np[i, 0] + 1 if ft_np is not None else ""}/{fn_np[i, 0] + 1 if fn_np is not None else ""} \
616
+ {f_np[i, 1] + 1}/{ft_np[i, 1] + 1 if ft_np is not None else ""}/{fn_np[i, 1] + 1 if fn_np is not None else ""} \
617
+ {f_np[i, 2] + 1}/{ft_np[i, 2] + 1 if ft_np is not None else ""}/{fn_np[i, 2] + 1 if fn_np is not None else ""} \n'
618
+ )
619
+
620
+ with open(mtl_path, "w") as fp:
621
+ fp.write(f"newmtl defaultMat \n")
622
+ fp.write(f"Ka 1 1 1 \n")
623
+ fp.write(f"Kd 1 1 1 \n")
624
+ fp.write(f"Ks 0 0 0 \n")
625
+ fp.write(f"Tr 1 \n")
626
+ fp.write(f"illum 1 \n")
627
+ fp.write(f"Ns 0 \n")
628
+ fp.write(f"map_Kd {os.path.basename(albedo_path)} \n")
629
+ if enable_dino:
630
+ fp.write(f"map_Ft {os.path.basename(feature_path)} \n")
631
+
632
+ albedo = self.albedo.detach().cpu().numpy()
633
+ albedo = (albedo * 255).astype(np.uint8)
634
+ cv2.imwrite(albedo_path, cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR))
635
+
636
+ if enable_dino:
637
+ feature = self.feature.detach().cpu()
638
+ torch.save(feature, feature_path)
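A short usage sketch for the Mesh class above (illustrative only; the file paths are hypothetical, and the OBJ is assumed to carry a texture so that UVs are available for GLB export):

from sparseags.mesh_utils.mesh import Mesh

# Load a textured OBJ, auto-normalize it, and re-export it as GLB (uses pygltflib).
mesh = Mesh.load("assets/example.obj", resize=True)   # hypothetical path
print(mesh.v.shape, mesh.f.shape)
mesh.write("assets/example.glb")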
sparseags/mesh_utils/mesh_renderer.py ADDED
@@ -0,0 +1,268 @@
1
+ import os
2
+ import math
3
+ import cv2
4
+ import trimesh
5
+ import numpy as np
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ import nvdiffrast.torch as dr
12
+ from sparseags.mesh_utils.mesh import Mesh, safe_normalize
13
+
14
+
15
+ def scale_img_nhwc(x, size, mag='bilinear', min='bilinear'):
16
+ assert (x.shape[1] >= size[0] and x.shape[2] >= size[1]) or (x.shape[1] < size[0] and x.shape[2] < size[1]), "Trying to magnify image in one dimension and minify in the other"
17
+ y = x.permute(0, 3, 1, 2) # NHWC -> NCHW
18
+ if x.shape[1] > size[0] and x.shape[2] > size[1]: # Minification, previous size was bigger
19
+ y = torch.nn.functional.interpolate(y, size, mode=min)
20
+ else: # Magnification
21
+ if mag == 'bilinear' or mag == 'bicubic':
22
+ y = torch.nn.functional.interpolate(y, size, mode=mag, align_corners=True)
23
+ else:
24
+ y = torch.nn.functional.interpolate(y, size, mode=mag)
25
+ return y.permute(0, 2, 3, 1).contiguous() # NCHW -> NHWC
26
+
27
+ def scale_img_hwc(x, size, mag='bilinear', min='bilinear'):
28
+ return scale_img_nhwc(x[None, ...], size, mag, min)[0]
29
+
30
+ def scale_img_nhw(x, size, mag='bilinear', min='bilinear'):
31
+ return scale_img_nhwc(x[..., None], size, mag, min)[..., 0]
32
+
33
+ def scale_img_hw(x, size, mag='bilinear', min='bilinear'):
34
+ return scale_img_nhwc(x[None, ..., None], size, mag, min)[0, ..., 0]
35
+
36
+ def trunc_rev_sigmoid(x, eps=1e-6):
37
+ x = x.clamp(eps, 1 - eps)
38
+ return torch.log(x / (1 - x))
39
+
40
+ def make_divisible(x, m=8):
41
+ return int(math.ceil(x / m) * m)
42
+
43
+ class Renderer(nn.Module):
44
+ def __init__(self, opt):
45
+
46
+ super().__init__()
47
+
48
+ self.opt = opt
49
+ self.enable_dino = self.opt.lambda_dino > 0
50
+
51
+ self.mesh = Mesh.load(self.opt.mesh, resize=False, enable_dino=self.enable_dino)
52
+
53
+ if not self.opt.force_cuda_rast and (not self.opt.gui or os.name == 'nt'):
54
+ self.glctx = dr.RasterizeGLContext()
55
+ else:
56
+ self.glctx = dr.RasterizeCudaContext()
57
+
58
+ self.v_offsets = torch.zeros_like(self.mesh.v)
59
+ self.raw_albedo = trunc_rev_sigmoid(self.mesh.albedo)
60
+
61
+ # extract trainable parameters
62
+ if opt.trainable_texture:
63
+ self.v_offsets = nn.Parameter(self.v_offsets)
64
+ self.raw_albedo = nn.Parameter(self.raw_albedo)
65
+
66
+ if self.enable_dino:
67
+ self.raw_feature = nn.Parameter((self.mesh.feature))
68
+
69
+
70
+ def get_params(self):
71
+
72
+ params = [
73
+ {'params': self.raw_albedo, 'lr': self.opt.texture_lr},
74
+ ]
75
+
76
+ if self.enable_dino:
77
+ params.append({'params': self.raw_feature, 'lr': self.opt.texture_lr})
78
+
79
+ if self.opt.train_geo:
80
+ params.append({'params': self.v_offsets, 'lr': self.opt.geom_lr})
81
+
82
+ return params
83
+
84
+ @torch.no_grad()
85
+ def export_mesh(self, save_path):
86
+ self.mesh.v = (self.mesh.v + self.v_offsets).detach()
87
+ self.mesh.albedo = torch.sigmoid(self.raw_albedo.detach())
88
+ if self.enable_dino:
89
+ self.mesh.feature = self.raw_feature.detach()
90
+ self.mesh.write(save_path, self.enable_dino)
91
+
92
+
93
+ def render(self, pose, proj, h0, w0, ssaa=1, bg_color=1, texture_filter='linear-mipmap-linear'):
94
+
95
+ # do super-sampling
96
+ if ssaa != 1:
97
+ h = make_divisible(h0 * ssaa, 8)
98
+ w = make_divisible(w0 * ssaa, 8)
99
+ else:
100
+ h, w = h0, w0
101
+
102
+ results = {}
103
+
104
+ # get v
105
+ if self.opt.train_geo:
106
+ v = self.mesh.v + self.v_offsets # [N, 3]
107
+ else:
108
+ v = self.mesh.v
109
+
110
+ pose = torch.from_numpy(pose.astype(np.float32)).to(v.device)
111
+ proj = torch.from_numpy(proj.astype(np.float32)).to(v.device)
112
+
113
+ # get v_clip and render rgb
114
+ v_cam = torch.matmul(F.pad(v, pad=(0, 1), mode='constant', value=1.0), torch.inverse(pose).T).float().unsqueeze(0)
115
+ v_clip = v_cam @ proj.T
116
+
117
+ rast, rast_db = dr.rasterize(self.glctx, v_clip, self.mesh.f, (h, w))
118
+
119
+ alpha = (rast[0, ..., 3:] > 0).float()
120
+ depth, _ = dr.interpolate(-v_cam[..., [2]], rast, self.mesh.f) # [1, H, W, 1]
121
+ depth = depth.squeeze(0) # [H, W, 1]
122
+
123
+ texc, texc_db = dr.interpolate(self.mesh.vt.unsqueeze(0).contiguous(), rast, self.mesh.ft, rast_db=rast_db, diff_attrs='all')
124
+ albedo = dr.texture(self.raw_albedo.unsqueeze(0), texc, uv_da=texc_db, filter_mode=texture_filter) # [1, H, W, 3]
125
+ albedo = torch.sigmoid(albedo)
126
+ if self.enable_dino:
127
+ # NOTE: backward error when use filter_mode='linear-mipmap-linear'
128
+ feature = dr.texture(self.raw_feature.unsqueeze(0), texc, uv_da=texc_db, filter_mode='linear')
129
+ # feature = torch.sigmoid(feature)
130
+ # get vn and render normal
131
+ if self.opt.train_geo:
132
+ i0, i1, i2 = self.mesh.f[:, 0].long(), self.mesh.f[:, 1].long(), self.mesh.f[:, 2].long()
133
+ v0, v1, v2 = v[i0, :], v[i1, :], v[i2, :]
134
+
135
+ face_normals = torch.cross(v1 - v0, v2 - v0)
136
+ face_normals = safe_normalize(face_normals)
137
+
138
+ vn = torch.zeros_like(v)
139
+ vn.scatter_add_(0, i0[:, None].repeat(1,3), face_normals)
140
+ vn.scatter_add_(0, i1[:, None].repeat(1,3), face_normals)
141
+ vn.scatter_add_(0, i2[:, None].repeat(1,3), face_normals)
142
+
143
+ vn = torch.where(torch.sum(vn * vn, -1, keepdim=True) > 1e-20, vn, torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device))
144
+ else:
145
+ vn = self.mesh.vn
146
+
147
+ normal, _ = dr.interpolate(vn.unsqueeze(0).contiguous(), rast, self.mesh.fn)
148
+ normal = safe_normalize(normal[0])
149
+
150
+ # rotated normal (where [0, 0, 1] always faces camera)
151
+ rot_normal = normal @ pose[:3, :3]
152
+ viewcos = rot_normal[..., [2]]
153
+
154
+ # antialias
155
+ albedo = dr.antialias(albedo, rast, v_clip, self.mesh.f).squeeze(0) # [H, W, 3]
156
+ albedo = alpha * albedo + (1 - alpha) * bg_color
157
+
158
+ if self.enable_dino:
159
+ feature = dr.antialias(feature, rast, v_clip, self.mesh.f).squeeze(0) # [H, W, 3]
160
+ feature = alpha * feature + (1 - alpha) * bg_color
161
+
162
+ # ssaa
163
+ if ssaa != 1:
164
+ albedo = scale_img_hwc(albedo, (h0, w0))
165
+ alpha = scale_img_hwc(alpha, (h0, w0))
166
+ depth = scale_img_hwc(depth, (h0, w0))
167
+ normal = scale_img_hwc(normal, (h0, w0))
168
+ viewcos = scale_img_hwc(viewcos, (h0, w0))
169
+ if self.enable_dino:
170
+ feature = scale_img_hwc(feature, (h0, w0))
171
+
172
+ results['image'] = albedo.clamp(0, 1)
173
+ results['alpha'] = alpha
174
+ results['depth'] = depth
175
+ results['normal'] = (normal + 1) / 2
176
+ results['viewcos'] = viewcos
177
+ results['feature'] = feature if self.enable_dino else None # [H, W, 384]
178
+
179
+ return results
180
+
181
+
182
+ def render_batch(self, pose, proj, h0, w0, ssaa=1, bg_color=1, texture_filter='linear-mipmap-linear'):
183
+
184
+ # do super-sampling
185
+ if ssaa != 1:
186
+ h = make_divisible(h0 * ssaa, 8)
187
+ w = make_divisible(w0 * ssaa, 8)
188
+ else:
189
+ h, w = h0, w0
190
+
191
+ results = {}
192
+
193
+ # get v
194
+ if self.opt.train_geo:
195
+ v = self.mesh.v + self.v_offsets # [N, 3]
196
+ else:
197
+ v = self.mesh.v
198
+
199
+ bs = pose.shape[0]
200
+ pose = pose.to(v.device)
201
+ proj = proj.to(v.device).transpose(1, 2)
202
+
203
+ # get v_clip and render rgb
204
+ v_cam = torch.bmm(F.pad(v, pad=(0, 1), mode='constant', value=1.0).expand(bs, -1, -1), torch.linalg.inv(pose).transpose(1, 2)).float()
205
+ v_clip = torch.bmm(v_cam, proj)
206
+
207
+ rast, rast_db = dr.rasterize(self.glctx, v_clip, self.mesh.f, (h, w))
208
+
209
+ alpha = (rast[..., 3:] > 0).float()
210
+ depth, _ = dr.interpolate(-v_cam[..., [2]], rast, self.mesh.f) # [1, H, W, 1]
211
+
212
+ texc, texc_db = dr.interpolate(self.mesh.vt.expand(bs, -1, -1).contiguous(), rast, self.mesh.ft, rast_db=rast_db, diff_attrs='all')
213
+ albedo = dr.texture(self.raw_albedo.detach().unsqueeze(0).contiguous(), texc, uv_da=texc_db, filter_mode=texture_filter) # [1, H, W, 3]
214
+ albedo = torch.sigmoid(albedo)
215
+ if self.enable_dino:
216
+ # NOTE: backward error when use filter_mode='linear-mipmap-linear'
217
+ feature = dr.texture(self.raw_feature.unsqueeze(0), texc, uv_da=texc_db, filter_mode='linear')
218
+ # feature = torch.sigmoid(feature)
219
+ # get vn and render normal
220
+ if self.opt.train_geo:
221
+ i0, i1, i2 = self.mesh.f[:, 0].long(), self.mesh.f[:, 1].long(), self.mesh.f[:, 2].long()
222
+ v0, v1, v2 = v[i0, :], v[i1, :], v[i2, :]
223
+
224
+ face_normals = torch.cross(v1 - v0, v2 - v0)
225
+ face_normals = safe_normalize(face_normals)
226
+
227
+ vn = torch.zeros_like(v)
228
+ vn.scatter_add_(0, i0[:, None].repeat(1,3), face_normals)
229
+ vn.scatter_add_(0, i1[:, None].repeat(1,3), face_normals)
230
+ vn.scatter_add_(0, i2[:, None].repeat(1,3), face_normals)
231
+
232
+ vn = torch.where(torch.sum(vn * vn, -1, keepdim=True) > 1e-20, vn, torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device))
233
+ else:
234
+ vn = self.mesh.vn
235
+
236
+ normal, _ = dr.interpolate(vn.expand(bs, -1, -1).contiguous(), rast, self.mesh.fn)
237
+ normal = safe_normalize(normal).reshape(bs, -1, 3)
238
+
239
+ # rotated normal (where [0, 0, 1] always faces camera)
240
+ rot_normal = torch.bmm(normal, pose[:, :3, :3]).reshape(bs, h, w, 3)
241
+ viewcos = rot_normal[..., [2]]
242
+
243
+ # antialias
244
+ albedo = dr.antialias(albedo, rast, v_clip, self.mesh.f) # [H, W, 3]
245
+ albedo = alpha * albedo + (1 - alpha) * bg_color
246
+
247
+ if self.enable_dino:
248
+ feature = dr.antialias(feature, rast, v_clip, self.mesh.f).squeeze(0) # [H, W, 3]
249
+ feature = alpha * feature + (1 - alpha) * bg_color
250
+
251
+ # ssaa
252
+ if ssaa != 1:
253
+ albedo = scale_img_hwc(albedo, (h0, w0))
254
+ alpha = scale_img_hwc(alpha, (h0, w0))
255
+ depth = scale_img_hwc(depth, (h0, w0))
256
+ normal = scale_img_hwc(normal, (h0, w0))
257
+ viewcos = scale_img_hwc(viewcos, (h0, w0))
258
+ if self.enable_dino:
259
+ feature = scale_img_hwc(feature, (h0, w0))
260
+
261
+ results['image'] = albedo.clamp(0, 1)
262
+ results['alpha'] = alpha
263
+ results['depth'] = depth
264
+ results['normal'] = (normal + 1) / 2
265
+ results['viewcos'] = viewcos
266
+ results['feature'] = feature if self.enable_dino else None # [H, W, 384]
267
+
268
+ return results
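A hedged usage sketch for the Renderer above (illustrative only; the opt fields, mesh path, and the pose/projection matrices are assumptions, and nvdiffrast with a CUDA device is required):

import numpy as np
from types import SimpleNamespace

# Minimal options object covering the fields the Renderer reads.
opt = SimpleNamespace(mesh="logs/example_mesh.obj", lambda_dino=0.0, gui=False,
                      force_cuda_rast=True, trainable_texture=True, train_geo=False,
                      texture_lr=0.1, geom_lr=1e-4)
renderer = Renderer(opt)

pose = np.eye(4, dtype=np.float32)
pose[2, 3] = 2.0                      # camera-to-world pose, 2 units along +z
proj = np.eye(4, dtype=np.float32)    # placeholder projection matrix
out = renderer.render(pose, proj, 512, 512, ssaa=2)
print(out['image'].shape, out['depth'].shape)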
sparseags/mesh_utils/mesh_utils.py ADDED
@@ -0,0 +1,147 @@
1
+ import numpy as np
2
+ import pymeshlab as pml
3
+
4
+
5
+ def poisson_mesh_reconstruction(points, normals=None):
6
+ # points/normals: [N, 3] np.ndarray
7
+
8
+ import open3d as o3d
9
+
10
+ pcd = o3d.geometry.PointCloud()
11
+ pcd.points = o3d.utility.Vector3dVector(points)
12
+
13
+ # outlier removal
14
+ pcd, ind = pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=10)
15
+
16
+ # normals
17
+ if normals is None:
18
+ pcd.estimate_normals()
19
+ else:
20
+ pcd.normals = o3d.utility.Vector3dVector(normals[ind])
21
+
22
+ # visualize
23
+ o3d.visualization.draw_geometries([pcd], point_show_normal=False)
24
+
25
+ mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
26
+ pcd, depth=9
27
+ )
28
+ vertices_to_remove = densities < np.quantile(densities, 0.1)
29
+ mesh.remove_vertices_by_mask(vertices_to_remove)
30
+
31
+ # visualize
32
+ o3d.visualization.draw_geometries([mesh])
33
+
34
+ vertices = np.asarray(mesh.vertices)
35
+ triangles = np.asarray(mesh.triangles)
36
+
37
+ print(
38
+ f"[INFO] poisson mesh reconstruction: {points.shape} --> {vertices.shape} / {triangles.shape}"
39
+ )
40
+
41
+ return vertices, triangles
42
+
43
+
44
+ def decimate_mesh(
45
+ verts, faces, target, backend="pymeshlab", remesh=False, optimalplacement=True
46
+ ):
47
+ # optimalplacement: default is True, but for flat meshes it must be set to False to prevent spike artifacts.
48
+
49
+ _ori_vert_shape = verts.shape
50
+ _ori_face_shape = faces.shape
51
+
52
+ if backend == "pyfqmr":
53
+ import pyfqmr
54
+
55
+ solver = pyfqmr.Simplify()
56
+ solver.setMesh(verts, faces)
57
+ solver.simplify_mesh(target_count=target, preserve_border=False, verbose=False)
58
+ verts, faces, normals = solver.getMesh()
59
+ else:
60
+ m = pml.Mesh(verts, faces)
61
+ ms = pml.MeshSet()
62
+ ms.add_mesh(m, "mesh") # will copy!
63
+
64
+ # filters
65
+ # ms.meshing_decimation_clustering(threshold=pml.Percentage(1))
66
+ ms.meshing_decimation_quadric_edge_collapse(
67
+ targetfacenum=int(target), optimalplacement=optimalplacement
68
+ )
69
+
70
+ if remesh:
71
+ # ms.apply_coord_taubin_smoothing()
72
+ ms.meshing_isotropic_explicit_remeshing(
73
+ iterations=3, targetlen=pml.Percentage(1)
74
+ )
75
+
76
+ # extract mesh
77
+ m = ms.current_mesh()
78
+ verts = m.vertex_matrix()
79
+ faces = m.face_matrix()
80
+
81
+ print(
82
+ f"[INFO] mesh decimation: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}"
83
+ )
84
+
85
+ return verts, faces
86
+
87
+
88
+ def clean_mesh(
89
+ verts,
90
+ faces,
91
+ v_pct=1,
92
+ min_f=64,
93
+ min_d=20,
94
+ repair=True,
95
+ remesh=True,
96
+ remesh_size=0.01,
97
+ ):
98
+ # verts: [N, 3]
99
+ # faces: [N, 3]
100
+
101
+ _ori_vert_shape = verts.shape
102
+ _ori_face_shape = faces.shape
103
+
104
+ m = pml.Mesh(verts, faces)
105
+ ms = pml.MeshSet()
106
+ ms.add_mesh(m, "mesh") # will copy!
107
+
108
+ # filters
109
+ ms.meshing_remove_unreferenced_vertices() # verts not referenced by any face
110
+
111
+ if v_pct > 0:
112
+ ms.meshing_merge_close_vertices(
113
+ threshold=pml.Percentage(v_pct)
114
+ ) # merge vertices closer than v_pct% of the bounding box diagonal
115
+
116
+ ms.meshing_remove_duplicate_faces() # faces defined by the same verts
117
+ ms.meshing_remove_null_faces() # faces with area == 0
118
+
119
+ if min_d > 0:
120
+ ms.meshing_remove_connected_component_by_diameter(
121
+ mincomponentdiag=pml.Percentage(min_d)
122
+ )
123
+
124
+ if min_f > 0:
125
+ ms.meshing_remove_connected_component_by_face_number(mincomponentsize=min_f)
126
+
127
+ if repair:
128
+ # ms.meshing_remove_t_vertices(method=0, threshold=40, repeat=True)
129
+ ms.meshing_repair_non_manifold_edges(method=0)
130
+ ms.meshing_repair_non_manifold_vertices(vertdispratio=0)
131
+
132
+ if remesh:
133
+ # ms.apply_coord_taubin_smoothing()
134
+ ms.meshing_isotropic_explicit_remeshing(
135
+ iterations=3, targetlen=pml.AbsoluteValue(remesh_size)
136
+ )
137
+
138
+ # extract mesh
139
+ m = ms.current_mesh()
140
+ verts = m.vertex_matrix()
141
+ faces = m.face_matrix()
142
+
143
+ print(
144
+ f"[INFO] mesh cleaning: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}"
145
+ )
146
+
147
+ return verts, faces
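A hedged usage sketch for the cleanup helpers above (illustrative only; the toy icosphere input and parameter values are assumptions):

import numpy as np
import trimesh

# Build a dense toy mesh, then clean and decimate it with the helpers above.
sphere = trimesh.creation.icosphere(subdivisions=5)
verts = np.asarray(sphere.vertices, dtype=np.float64)
faces = np.asarray(sphere.faces, dtype=np.int32)   # cast to int32 to be safe with pymeshlab
verts, faces = clean_mesh(verts, faces, remesh=True, remesh_size=0.02)
verts, faces = decimate_mesh(verts, faces, target=5000)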
sparseags/render_utils/gs_renderer.py ADDED
@@ -0,0 +1,1102 @@
1
+ import os
2
+ import math
3
+ import numpy as np
4
+ from typing import NamedTuple
5
+ from plyfile import PlyData, PlyElement
6
+
7
+ import torch
8
+ from torch import nn
9
+
10
+ from liegroups.torch import SE3
11
+ from simple_knn._C import distCUDA2
12
+
13
+ from sparseags.sh_utils import eval_sh, SH2RGB, RGB2SH
14
+ from sparseags.mesh_utils.mesh import Mesh
15
+ from sparseags.mesh_utils.mesh_utils import decimate_mesh, clean_mesh
16
+ from sparseags.cam_utils import sample_points_from_voxel
17
+
18
+ import kiui
19
+
20
+
21
+ def inverse_sigmoid(x):
22
+ return torch.log(x/(1-x))
23
+
24
+
25
+ def get_expon_lr_func(
26
+ lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000
27
+ ):
28
+
29
+ def helper(step):
30
+ if lr_init == lr_final:
31
+ # constant lr, ignore other params
32
+ return lr_init
33
+ if step < 0 or (lr_init == 0.0 and lr_final == 0.0):
34
+ # Disable this parameter
35
+ return 0.0
36
+ if lr_delay_steps > 0:
37
+ # A kind of reverse cosine decay.
38
+ delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
39
+ 0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1)
40
+ )
41
+ else:
42
+ delay_rate = 1.0
43
+ t = np.clip(step / max_steps, 0, 1)
44
+ log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
45
+ return delay_rate * log_lerp
46
+
47
+ return helper
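An illustrative evaluation of the schedule above (the hyperparameters are arbitrary assumptions):

lr_fn = get_expon_lr_func(lr_init=1e-3, lr_final=1e-5, lr_delay_steps=100,
                          lr_delay_mult=0.1, max_steps=1000)
for step in (0, 100, 500, 1000):
    print(step, lr_fn(step))  # starts near 1e-4 (delayed), then log-lerps down to 1e-5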
48
+
49
+
50
+ def strip_lowerdiag(L):
51
+ uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda")
52
+
53
+ uncertainty[:, 0] = L[:, 0, 0]
54
+ uncertainty[:, 1] = L[:, 0, 1]
55
+ uncertainty[:, 2] = L[:, 0, 2]
56
+ uncertainty[:, 3] = L[:, 1, 1]
57
+ uncertainty[:, 4] = L[:, 1, 2]
58
+ uncertainty[:, 5] = L[:, 2, 2]
59
+ return uncertainty
60
+
61
+ def strip_symmetric(sym):
62
+ return strip_lowerdiag(sym)
63
+
64
+
65
+ def gaussian_3d_coeff(xyzs, covs):
66
+ # xyzs: [N, 3]
67
+ # covs: [N, 6]
68
+ x, y, z = xyzs[:, 0], xyzs[:, 1], xyzs[:, 2]
69
+ a, b, c, d, e, f = covs[:, 0], covs[:, 1], covs[:, 2], covs[:, 3], covs[:, 4], covs[:, 5]
70
+
71
+ # eps must be small enough !!!
72
+ inv_det = 1 / (a * d * f + 2 * e * c * b - e**2 * a - c**2 * d - b**2 * f + 1e-24)
73
+ inv_a = (d * f - e**2) * inv_det
74
+ inv_b = (e * c - b * f) * inv_det
75
+ inv_c = (e * b - c * d) * inv_det
76
+ inv_d = (a * f - c**2) * inv_det
77
+ inv_e = (b * c - e * a) * inv_det
78
+ inv_f = (a * d - b**2) * inv_det
79
+
80
+ power = -0.5 * (x**2 * inv_a + y**2 * inv_d + z**2 * inv_f) - x * y * inv_b - x * z * inv_c - y * z * inv_e
81
+
82
+ power[power > 0] = -1e10 # abnormal values... make weights 0
83
+
84
+ return torch.exp(power)
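An illustrative check of gaussian_3d_coeff above (the isotropic covariance is an assumption):

import torch

sigma = 0.1
offsets = torch.tensor([[0.0, 0.0, 0.0],
                        [0.3, 0.0, 0.0]])  # offsets from the gaussian center
covs = torch.tensor([[sigma**2, 0.0, 0.0, sigma**2, 0.0, sigma**2]]).repeat(2, 1)
print(gaussian_3d_coeff(offsets, covs))    # ~[1.0, exp(-4.5)]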
85
+
86
+
87
+ def build_rotation(r):
88
+ norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])
89
+
90
+ q = r / norm[:, None]
91
+
92
+ R = torch.zeros((q.size(0), 3, 3), device='cuda')
93
+
94
+ r = q[:, 0]
95
+ x = q[:, 1]
96
+ y = q[:, 2]
97
+ z = q[:, 3]
98
+
99
+ R[:, 0, 0] = 1 - 2 * (y*y + z*z)
100
+ R[:, 0, 1] = 2 * (x*y - r*z)
101
+ R[:, 0, 2] = 2 * (x*z + r*y)
102
+ R[:, 1, 0] = 2 * (x*y + r*z)
103
+ R[:, 1, 1] = 1 - 2 * (x*x + z*z)
104
+ R[:, 1, 2] = 2 * (y*z - r*x)
105
+ R[:, 2, 0] = 2 * (x*z - r*y)
106
+ R[:, 2, 1] = 2 * (y*z + r*x)
107
+ R[:, 2, 2] = 1 - 2 * (x*x + y*y)
108
+ return R
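A quick sanity sketch for build_rotation above (the identity quaternion is an assumption; the function allocates on "cuda", so a GPU is required):

import torch

q = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device='cuda')  # [w, x, y, z]
print(build_rotation(q))                                  # ~3x3 identity matrix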
109
+
110
+
111
+ def build_scaling_rotation(s, r):
112
+ L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda")
113
+ R = build_rotation(r)
114
+
115
+ L[:,0,0] = s[:,0]
116
+ L[:,1,1] = s[:,1]
117
+ L[:,2,2] = s[:,2]
118
+
119
+ L = R @ L
120
+ return L
121
+
122
+
123
+ class BasicPointCloud(NamedTuple):
124
+ points: np.array
125
+ colors: np.array
126
+ normals: np.array
127
+
128
+
129
+ class GaussianModel:
130
+
131
+ def setup_functions(self):
132
+ def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation):
133
+ L = build_scaling_rotation(scaling_modifier * scaling, rotation)
134
+ actual_covariance = L @ L.transpose(1, 2)
135
+ symm = strip_symmetric(actual_covariance)
136
+ return symm
137
+
138
+ self.scaling_activation = torch.exp
139
+ self.scaling_inverse_activation = torch.log
140
+
141
+ self.covariance_activation = build_covariance_from_scaling_rotation
142
+
143
+ self.opacity_activation = torch.sigmoid
144
+ self.inverse_opacity_activation = inverse_sigmoid
145
+
146
+ self.rotation_activation = torch.nn.functional.normalize
147
+
148
+
149
+ def __init__(self, sh_degree : int):
150
+ self.active_sh_degree = 0
151
+ self.max_sh_degree = sh_degree
152
+ self._xyz = torch.empty(0)
153
+ self._features_dc = torch.empty(0)
154
+ self._features_rest = torch.empty(0)
155
+ self._scaling = torch.empty(0)
156
+ self._rotation = torch.empty(0)
157
+ self._opacity = torch.empty(0)
158
+ self.max_radii2D = torch.empty(0)
159
+ self.xyz_gradient_accum = torch.empty(0)
160
+ self.denom = torch.empty(0)
161
+ self.optimizer = None
162
+ self.percent_dense = 0
163
+ self.spatial_lr_scale = 0
164
+ self.setup_functions()
165
+
166
+ def capture(self):
167
+ return (
168
+ self.active_sh_degree,
169
+ self._xyz,
170
+ self._features_dc,
171
+ self._features_rest,
172
+ self._scaling,
173
+ self._rotation,
174
+ self._opacity,
175
+ self.max_radii2D,
176
+ self.xyz_gradient_accum,
177
+ self.denom,
178
+ self.optimizer.state_dict(),
179
+ self.spatial_lr_scale,
180
+ )
181
+
182
+ def restore(self, model_args, training_args):
183
+ (self.active_sh_degree,
184
+ self._xyz,
185
+ self._features_dc,
186
+ self._features_rest,
187
+ self._scaling,
188
+ self._rotation,
189
+ self._opacity,
190
+ self.max_radii2D,
191
+ xyz_gradient_accum,
192
+ denom,
193
+ opt_dict,
194
+ self.spatial_lr_scale) = model_args
195
+ self.training_setup(training_args)
196
+ self.xyz_gradient_accum = xyz_gradient_accum
197
+ self.denom = denom
198
+ self.optimizer.load_state_dict(opt_dict)
199
+
200
+ @property
201
+ def get_scaling(self):
202
+ return self.scaling_activation(self._scaling)
203
+
204
+ @property
205
+ def get_rotation(self):
206
+ return self.rotation_activation(self._rotation)
207
+
208
+ @property
209
+ def get_xyz(self):
210
+ return self._xyz
211
+
212
+ @property
213
+ def get_features(self):
214
+ features_dc = self._features_dc
215
+ features_rest = self._features_rest
216
+ if self.enable_dino:
217
+ return torch.cat((features_dc, features_rest[..., :3]), dim=1), features_rest[..., 3:].reshape(features_rest.shape[0], 1, -1)[..., :self.dino_feat_dim]
218
+ else:
219
+ return torch.cat((features_dc, features_rest), dim=1)
220
+
221
+ @property
222
+ def get_opacity(self):
223
+ return self.opacity_activation(self._opacity)
224
+
225
+ @torch.no_grad()
226
+ def extract_fields(self, resolution=128, num_blocks=16, relax_ratio=1.5):
227
+ # resolution: resolution of field
228
+
229
+ block_size = 2 / num_blocks
230
+
231
+ assert resolution % block_size == 0
232
+ split_size = resolution // num_blocks
233
+
234
+ opacities = self.get_opacity
235
+
236
+ # pre-filter low opacity gaussians to save computation
237
+ mask = (opacities > 0.005).squeeze(1)
238
+
239
+ opacities = opacities[mask]
240
+ xyzs = self.get_xyz[mask]
241
+ stds = self.get_scaling[mask]
242
+
243
+ # normalize to ~ [-1, 1]
244
+ mn, mx = xyzs.amin(0), xyzs.amax(0)
245
+ self.center = (mn + mx) / 2
246
+ self.scale = 1.8 / (mx - mn).amax().item()
247
+
248
+ xyzs = (xyzs - self.center) * self.scale
249
+ stds = stds * self.scale
250
+
251
+ covs = self.covariance_activation(stds, 1, self._rotation[mask])
252
+
253
+ # tile
254
+ device = opacities.device
255
+ occ = torch.zeros([resolution] * 3, dtype=torch.float32, device=device)
256
+
257
+ X = torch.linspace(-1, 1, resolution).split(split_size)
258
+ Y = torch.linspace(-1, 1, resolution).split(split_size)
259
+ Z = torch.linspace(-1, 1, resolution).split(split_size)
260
+
261
+ # loop over blocks (assumes the max extent of a gaussian is smaller than relax_ratio * block_size !!!)
262
+ for xi, xs in enumerate(X):
263
+ for yi, ys in enumerate(Y):
264
+ for zi, zs in enumerate(Z):
265
+ xx, yy, zz = torch.meshgrid(xs, ys, zs)
266
+ # sample points [M, 3]
267
+ pts = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1).to(device)
268
+ # in-tile gaussians mask
269
+ vmin, vmax = pts.amin(0), pts.amax(0)
270
+ vmin -= block_size * relax_ratio
271
+ vmax += block_size * relax_ratio
272
+ mask = (xyzs < vmax).all(-1) & (xyzs > vmin).all(-1)
273
+ # if hit no gaussian, continue to next block
274
+ if not mask.any():
275
+ continue
276
+ mask_xyzs = xyzs[mask] # [L, 3]
277
+ mask_covs = covs[mask] # [L, 6]
278
+ mask_opas = opacities[mask].view(1, -1) # [L, 1] --> [1, L]
279
+
280
+ # query per point-gaussian pair.
281
+ g_pts = pts.unsqueeze(1).repeat(1, mask_covs.shape[0], 1) - mask_xyzs.unsqueeze(0) # [M, L, 3]
282
+ g_covs = mask_covs.unsqueeze(0).repeat(pts.shape[0], 1, 1) # [M, L, 6]
283
+
284
+ # batch on gaussian to avoid OOM
285
+ batch_g = 1024
286
+ val = 0
287
+ for start in range(0, g_covs.shape[1], batch_g):
288
+ end = min(start + batch_g, g_covs.shape[1])
289
+ w = gaussian_3d_coeff(g_pts[:, start:end].reshape(-1, 3), g_covs[:, start:end].reshape(-1, 6)).reshape(pts.shape[0], -1) # [M, l]
290
+ val += (mask_opas[:, start:end] * w).sum(-1)
291
+
292
+ # kiui.lo(val, mask_opas, w)
293
+
294
+ occ[xi * split_size: xi * split_size + len(xs),
295
+ yi * split_size: yi * split_size + len(ys),
296
+ zi * split_size: zi * split_size + len(zs)] = val.reshape(len(xs), len(ys), len(zs))
297
+
298
+ kiui.lo(occ, verbose=1)
299
+
300
+ return occ
301
+
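+ # Note on the per-point density above (descriptive only): every surviving Gaussian contributes
+ # opacity_i * exp(-0.5 * d^T Sigma_i^{-1} d) at a query point offset d from its center, and
+ # gaussian_3d_coeff (assumed to be defined elsewhere in this file) evaluates that exponential
+ # from the 6 unique entries of the packed 3D covariance.
+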
302
+ def extract_mesh(self, path, density_thresh=1, resolution=128, decimate_target=1e5):
303
+
304
+ os.makedirs(os.path.dirname(path), exist_ok=True)
305
+
306
+ occ = self.extract_fields(resolution).detach().cpu().numpy()
307
+
308
+ import mcubes
309
+ vertices, triangles = mcubes.marching_cubes(occ, density_thresh)
310
+ vertices = vertices / (resolution - 1.0) * 2 - 1
311
+
312
+ # transform back to the original space
313
+ vertices = vertices / self.scale + self.center.detach().cpu().numpy()
314
+
315
+ vertices, triangles = clean_mesh(vertices, triangles, remesh=True, remesh_size=0.015)
316
+ if decimate_target > 0 and triangles.shape[0] > decimate_target:
317
+ vertices, triangles = decimate_mesh(vertices, triangles, decimate_target)
318
+
319
+ v = torch.from_numpy(vertices.astype(np.float32)).contiguous().cuda()
320
+ f = torch.from_numpy(triangles.astype(np.int32)).contiguous().cuda()
321
+
322
+ print(
323
+ f"[INFO] marching cubes result: {v.shape} ({v.min().item()}-{v.max().item()}), {f.shape}"
324
+ )
325
+
326
+ mesh = Mesh(v=v, f=f, device='cuda')
327
+
328
+ return mesh
329
+
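+ # Illustrative usage (a rough sketch; `gm` is an already-trained GaussianModel and the output
+ # path is a placeholder):
+ #   occ = gm.extract_fields(resolution=128)                    # [128, 128, 128] density grid
+ #   mesh = gm.extract_mesh("logs/mesh.obj", density_thresh=1)  # marching cubes + clean/decimate
+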
330
+ def get_covariance(self, scaling_modifier = 1):
331
+ return self.covariance_activation(self.get_scaling, scaling_modifier, self._rotation)
332
+
333
+ def oneupSHdegree(self):
334
+ if self.active_sh_degree < self.max_sh_degree:
335
+ self.active_sh_degree += 1
336
+
337
+ def create_from_pcd(self, pcd : BasicPointCloud, spatial_lr_scale : float = 1):
338
+ self.spatial_lr_scale = spatial_lr_scale
339
+ fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda()
340
+ fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda())
341
+ features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda()
342
+ features[:, :3, 0] = fused_color
343
+ features[:, 3:, 1:] = 0.0
344
+
345
+ print("Number of points at initialisation : ", fused_point_cloud.shape[0])
346
+
347
+ dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001)
348
+ scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 3)
349
+ rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")
350
+ rots[:, 0] = 1
351
+
352
+ opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"))
353
+
354
+ self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))
355
+ self._features_dc = nn.Parameter(features[:,:,0:1].transpose(1, 2).contiguous().requires_grad_(True))
356
+ if self.enable_dino:
357
+ # Override the original features with appended DINO semantic features
358
+ _features_rest = features[:,:,1:].transpose(1, 2).contiguous().cuda()
359
+ dim_rest = _features_rest.shape[1]
360
+ _semantic_features = torch.randn(self._xyz.shape[0], dim_rest, self.dino_feat_dim//dim_rest + 1).cuda()
361
+ self._features_rest = nn.Parameter(torch.cat([_features_rest, _semantic_features], dim=-1).requires_grad_(True))
362
+ else:
363
+ self._features_rest = nn.Parameter(features[:,:,1:].transpose(1, 2).contiguous().requires_grad_(True))
364
+ self._scaling = nn.Parameter(scales.requires_grad_(True))
365
+ self._rotation = nn.Parameter(rots.requires_grad_(True))
366
+ self._opacity = nn.Parameter(opacities.requires_grad_(True))
367
+ self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
368
+
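+ # Initialization summary for create_from_pcd (descriptive only): scales are stored in log space
+ # as log(sqrt(mean squared nearest-neighbor distance from distCUDA2)), so the exp activation
+ # recovers a per-point radius; rotations start at the identity quaternion (w=1); opacities start
+ # at inverse_sigmoid(0.1), i.e. roughly 0.1 after the sigmoid activation.
+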
369
+ def training_setup(self, training_args):
370
+ self.percent_dense = training_args.percent_dense
371
+ self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
372
+ self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
373
+
374
+ l = [
375
+ {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"},
376
+ {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"},
377
+ {'params': [self._features_rest], 'lr': training_args.feature_lr / 20, "name": "f_rest"}, # /20
378
+ {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"},
379
+ {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"},
380
+ {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"},
381
+ ]
382
+
383
+ if training_args.opt_cam:
384
+ l.append({'params': self.cam_params, 'lr': training_args.camera_lr, "name": "cam_params"})
385
+
386
+ self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
387
+ self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale,
388
+ lr_final=training_args.position_lr_final*self.spatial_lr_scale,
389
+ lr_delay_mult=training_args.position_lr_delay_mult,
390
+ max_steps=training_args.position_lr_max_steps)
391
+
392
+ def update_learning_rate(self, iteration):
393
+ ''' Learning rate scheduling per step '''
394
+ for param_group in self.optimizer.param_groups:
395
+ if param_group["name"] == "xyz":
396
+ if iteration > 500:
397
+ iteration = iteration % 500
398
+ lr = self.xyz_scheduler_args(iteration)
399
+ param_group['lr'] = lr
400
+
401
+ def construct_list_of_attributes(self):
402
+ l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
403
+ # All channels except the 3 DC
404
+ for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]):
405
+ l.append('f_dc_{}'.format(i))
406
+ for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]):
407
+ l.append('f_rest_{}'.format(i))
408
+ l.append('opacity')
409
+ for i in range(self._scaling.shape[1]):
410
+ l.append('scale_{}'.format(i))
411
+ for i in range(self._rotation.shape[1]):
412
+ l.append('rot_{}'.format(i))
413
+ return l
414
+
415
+ def save_ply(self, path):
416
+ os.makedirs(os.path.dirname(path), exist_ok=True)
417
+
418
+ xyz = self._xyz.detach().cpu().numpy()
419
+ normals = np.zeros_like(xyz)
420
+ f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
421
+ f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
422
+ opacities = self._opacity.detach().cpu().numpy()
423
+ scale = self._scaling.detach().cpu().numpy()
424
+ rotation = self._rotation.detach().cpu().numpy()
425
+
426
+ dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()]
427
+
428
+ elements = np.empty(xyz.shape[0], dtype=dtype_full)
429
+ attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
430
+ elements[:] = list(map(tuple, attributes))
431
+ el = PlyElement.describe(elements, 'vertex')
432
+ PlyData([el]).write(path)
433
+
434
+ def reset_opacity(self):
435
+ opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.01))
436
+ optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity")
437
+ self._opacity = optimizable_tensors["opacity"]
438
+
439
+ def load_ply(self, path):
440
+ plydata = PlyData.read(path)
441
+
442
+ xyz = np.stack((np.asarray(plydata.elements[0]["x"]),
443
+ np.asarray(plydata.elements[0]["y"]),
444
+ np.asarray(plydata.elements[0]["z"])), axis=1)
445
+ opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis]
446
+
447
+ print("Number of points at loading : ", xyz.shape[0])
448
+
449
+ features_dc = np.zeros((xyz.shape[0], 3, 1))
450
+ features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"])
451
+ features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"])
452
+ features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"])
453
+
454
+ extra_f_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("f_rest_")]
455
+ assert len(extra_f_names)==3*(self.max_sh_degree + 1) ** 2 - 3
456
+ features_extra = np.zeros((xyz.shape[0], len(extra_f_names)))
457
+ for idx, attr_name in enumerate(extra_f_names):
458
+ features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name])
459
+ # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC)
460
+ features_extra = features_extra.reshape((features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1))
461
+
462
+ scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")]
463
+ scales = np.zeros((xyz.shape[0], len(scale_names)))
464
+ for idx, attr_name in enumerate(scale_names):
465
+ scales[:, idx] = np.asarray(plydata.elements[0][attr_name])
466
+
467
+ rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")]
468
+ rots = np.zeros((xyz.shape[0], len(rot_names)))
469
+ for idx, attr_name in enumerate(rot_names):
470
+ rots[:, idx] = np.asarray(plydata.elements[0][attr_name])
471
+
472
+ self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True))
473
+ self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True))
474
+ self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True))
475
+ self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True))
476
+ self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True))
477
+ self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True))
478
+
479
+ self.active_sh_degree = self.max_sh_degree
480
+
481
+ def replace_tensor_to_optimizer(self, tensor, name):
482
+ optimizable_tensors = {}
483
+ for group in self.optimizer.param_groups:
484
+ if len(group["params"]) != 1:
485
+ continue
486
+ if group["name"] == name:
487
+ stored_state = self.optimizer.state.get(group['params'][0], None)
488
+ stored_state["exp_avg"] = torch.zeros_like(tensor)
489
+ stored_state["exp_avg_sq"] = torch.zeros_like(tensor)
490
+
491
+ del self.optimizer.state[group['params'][0]]
492
+ group["params"][0] = nn.Parameter(tensor.requires_grad_(True))
493
+ self.optimizer.state[group['params'][0]] = stored_state
494
+
495
+ optimizable_tensors[group["name"]] = group["params"][0]
496
+ return optimizable_tensors
497
+
498
+ def _prune_optimizer(self, mask):
499
+ optimizable_tensors = {}
500
+ for group in self.optimizer.param_groups:
501
+ if len(group["params"]) != 1:
502
+ continue
503
+
504
+ stored_state = self.optimizer.state.get(group['params'][0], None)
505
+ if stored_state is not None:
506
+ stored_state["exp_avg"] = stored_state["exp_avg"][mask]
507
+ stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask]
508
+
509
+ del self.optimizer.state[group['params'][0]]
510
+ group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True)))
511
+ self.optimizer.state[group['params'][0]] = stored_state
512
+
513
+ optimizable_tensors[group["name"]] = group["params"][0]
514
+ else:
515
+ group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True))
516
+ optimizable_tensors[group["name"]] = group["params"][0]
517
+ return optimizable_tensors
518
+
519
+ def prune_points(self, mask):
520
+ valid_points_mask = ~mask
521
+ optimizable_tensors = self._prune_optimizer(valid_points_mask)
522
+
523
+ self._xyz = optimizable_tensors["xyz"]
524
+ self._features_dc = optimizable_tensors["f_dc"]
525
+ self._features_rest = optimizable_tensors["f_rest"]
526
+ self._opacity = optimizable_tensors["opacity"]
527
+ self._scaling = optimizable_tensors["scaling"]
528
+ self._rotation = optimizable_tensors["rotation"]
529
+
530
+ self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask]
531
+
532
+ self.denom = self.denom[valid_points_mask]
533
+ self.max_radii2D = self.max_radii2D[valid_points_mask]
534
+
535
+ def cat_tensors_to_optimizer(self, tensors_dict):
536
+ optimizable_tensors = {}
537
+ for group in self.optimizer.param_groups:
538
+ if len(group["params"]) != 1:
539
+ continue
540
+ assert len(group["params"]) == 1
541
+ extension_tensor = tensors_dict[group["name"]]
542
+ stored_state = self.optimizer.state.get(group['params'][0], None)
543
+ if stored_state is not None:
544
+
545
+ stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0)
546
+ stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0)
547
+
548
+ del self.optimizer.state[group['params'][0]]
549
+ group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
550
+ self.optimizer.state[group['params'][0]] = stored_state
551
+
552
+ optimizable_tensors[group["name"]] = group["params"][0]
553
+ else:
554
+ group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
555
+ optimizable_tensors[group["name"]] = group["params"][0]
556
+
557
+ return optimizable_tensors
558
+
559
+ def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation):
560
+ d = {"xyz": new_xyz,
561
+ "f_dc": new_features_dc,
562
+ "f_rest": new_features_rest,
563
+ "opacity": new_opacities,
564
+ "scaling" : new_scaling,
565
+ "rotation" : new_rotation}
566
+
567
+ optimizable_tensors = self.cat_tensors_to_optimizer(d)
568
+ self._xyz = optimizable_tensors["xyz"]
569
+ self._features_dc = optimizable_tensors["f_dc"]
570
+ self._features_rest = optimizable_tensors["f_rest"]
571
+ self._opacity = optimizable_tensors["opacity"]
572
+ self._scaling = optimizable_tensors["scaling"]
573
+ self._rotation = optimizable_tensors["rotation"]
574
+
575
+ self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
576
+ self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
577
+ self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
578
+
579
+ def densify_and_split(self, grads, grad_threshold, scene_extent, N=2):
580
+ n_init_points = self.get_xyz.shape[0]
581
+ # Extract points that satisfy the gradient condition
582
+ padded_grad = torch.zeros((n_init_points), device="cuda")
583
+ padded_grad[:grads.shape[0]] = grads.squeeze()
584
+ selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False)
585
+ selected_pts_mask = torch.logical_and(selected_pts_mask,
586
+ torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent
587
+ )
588
+
589
+ stds = self.get_scaling[selected_pts_mask].repeat(N,1)
590
+ means = torch.zeros((stds.size(0), 3), device="cuda")
591
+ samples = torch.normal(mean=means, std=stds)
592
+ rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1)
593
+ new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1)
594
+ new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N))
595
+ new_rotation = self._rotation[selected_pts_mask].repeat(N,1)
596
+ new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1)
597
+ new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1)
598
+ new_opacity = self._opacity[selected_pts_mask].repeat(N,1)
599
+
600
+ self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation)
601
+
602
+ prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool)))
603
+ self.prune_points(prune_filter)
604
+
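+ # densify_and_split, in brief: each selected (large, high-gradient) Gaussian spawns N=2 children
+ # sampled from its own anisotropic distribution (std = its scaling, rotated into world space),
+ # the children's scales are shrunk by 1/(0.8*N), all other attributes are copied, and the
+ # parents are removed via the appended boolean prune filter.
+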
605
+ def densify_and_clone(self, grads, grad_threshold, scene_extent):
606
+ # Extract points that satisfy the gradient condition
607
+ selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False)
608
+ selected_pts_mask = torch.logical_and(selected_pts_mask,
609
+ torch.max(self.get_scaling, dim=1).values <= self.percent_dense*scene_extent
610
+ )
611
+
612
+ new_xyz = self._xyz[selected_pts_mask]
613
+ new_features_dc = self._features_dc[selected_pts_mask]
614
+ new_features_rest = self._features_rest[selected_pts_mask]
615
+ new_opacities = self._opacity[selected_pts_mask]
616
+ new_scaling = self._scaling[selected_pts_mask]
617
+ new_rotation = self._rotation[selected_pts_mask]
618
+
619
+ self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation)
620
+
621
+ def densify_and_prune(self, max_grad, min_opacity, extent, max_screen_size):
622
+ grads = self.xyz_gradient_accum / self.denom
623
+ grads[grads.isnan()] = 0.0
624
+
625
+ self.densify_and_clone(grads, max_grad, extent)
626
+ self.densify_and_split(grads, max_grad, extent)
627
+
628
+ prune_mask = (self.get_opacity < min_opacity).squeeze()
629
+ if max_screen_size:
630
+ big_points_vs = self.max_radii2D > max_screen_size
631
+ big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
632
+ prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws)
633
+ self.prune_points(prune_mask)
634
+
635
+ torch.cuda.empty_cache()
636
+
637
+ def prune(self, min_opacity, extent, max_screen_size):
638
+
639
+ prune_mask = (self.get_opacity < min_opacity).squeeze()
640
+ if max_screen_size:
641
+ big_points_vs = self.max_radii2D > max_screen_size
642
+ big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
643
+ prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws)
644
+ self.prune_points(prune_mask)
645
+
646
+ torch.cuda.empty_cache()
647
+
648
+
649
+ def add_densification_stats(self, viewspace_point_tensor, update_filter):
650
+ self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor.grad[update_filter,:2], dim=-1, keepdim=True)
651
+ self.denom[update_filter] += 1
652
+
653
+
654
+ def getProjectionMatrix(znear, zfar, fx, fy, cx, cy):
655
+ # TODO: remove hard-coded image size
656
+ P = torch.zeros(4, 4)
657
+
658
+ z_sign = 1.0
659
+
660
+ P[0, 0] = 2 * fx / 256
661
+ P[1, 1] = 2 * fy / 256
662
+ P[0, 2] = 2 * (cx / 256) - 1
663
+ P[1, 2] = 2 * (cy / 256) - 1
664
+ P[2, 2] = z_sign * zfar / (zfar - znear)
665
+ P[3, 2] = z_sign
666
+ P[2, 3] = -(zfar * znear) / (zfar - znear)
667
+ return P
668
+
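+ # Sketch of the intrinsics-to-clip-space mapping above (assumes the hard-coded 256x256 image
+ # noted in the TODO): a camera-space point (x, y, z) projects to pixel u = fx * x / z + cx, which
+ # the matrix expresses in NDC via P[0, 0] = 2*fx/W and P[0, 2] = 2*cx/W - 1 (and likewise for v
+ # with fy, cy, H); the last two rows map depth in [znear, zfar] to [0, 1] after the perspective
+ # divide.
+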
669
+
670
+ def getProjectionMatrixFoV(znear, zfar, fovX, fovY):
671
+ tanHalfFovY = math.tan((fovY / 2))
672
+ tanHalfFovX = math.tan((fovX / 2))
673
+
674
+ P = torch.zeros(4, 4)
675
+
676
+ z_sign = 1.0
677
+
678
+ P[0, 0] = 1 / tanHalfFovX
679
+ P[1, 1] = 1 / tanHalfFovY
680
+ P[3, 2] = z_sign
681
+ P[2, 2] = z_sign * zfar / (zfar - znear)
682
+ P[2, 3] = -(zfar * znear) / (zfar - znear)
683
+ return P
684
+
685
+
686
+ class Camera:
687
+ def __init__(self, c2w, width, height, fx, fy, cx, cy, znear=0.01, zfar=100, opt_pose=False):
688
+ # c2w (pose) should be in NeRF convention.
689
+
690
+ self.image_width = width
691
+ self.image_height = height
692
+ self.fx, self.fy = fx, fy
693
+ self.cx, self.cy = cx, cy
694
+ self.FoVy = 2 * np.arctan(256 / 2 / self.fy)
695
+ self.FoVx = 2 * np.arctan(256 / 2 / self.fx)
696
+ self.znear = znear
697
+ self.zfar = zfar
698
+ self.opt_pose = opt_pose
699
+
700
+ self.projection_matrix = (
701
+ getProjectionMatrix(
702
+ znear=self.znear,
703
+ zfar=self.zfar,
704
+ fx=self.fx,
705
+ fy=self.fy,
706
+ cx=self.cx,
707
+ cy=self.cy,
708
+ )
709
+ .transpose(0, 1)
710
+ .cuda()
711
+ )
712
+
713
+ w2c = np.linalg.inv(c2w)
714
+
715
+ # OpenGL to OpenCV
716
+ w2c[1:3] *= -1
717
+
718
+ self.world_view_transform = torch.tensor(w2c).transpose(0, 1).cuda()
719
+ self.full_proj_transform = self.world_view_transform @ self.projection_matrix
720
+ self.camera_center = torch.tensor(c2w[:3, 3]).cuda()
721
+
722
+
723
+ class FoVCamera:
724
+ def __init__(self, c2w, width, height, fovy, fovx, znear, zfar, cam_params=None, opt_pose=False):
725
+ # c2w (pose) should be in NeRF convention.
726
+
727
+ self.image_width = width
728
+ self.image_height = height
729
+ self.FoVy = fovy
730
+ self.FoVx = fovx
731
+ self.znear = znear
732
+ self.zfar = zfar
733
+ self.opt_pose = opt_pose
734
+
735
+ self.projection_matrix = (
736
+ getProjectionMatrixFoV(
737
+ znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy
738
+ )
739
+ .transpose(0, 1)
740
+ .cuda()
741
+ )
742
+
743
+ w2c = np.linalg.inv(c2w)
744
+
745
+ # OpenGL to OpenCV
746
+ w2c[1:3] *= -1
747
+
748
+ self.world_view_transform = torch.tensor(w2c).transpose(0, 1).cuda()
749
+ self.full_proj_transform = self.world_view_transform @ self.projection_matrix
750
+ self.camera_center = torch.tensor(c2w[:3, 3]).cuda()
751
+
752
+
753
+ class CustomCamera:
754
+ def __init__(self, cam_params=None, index=None, c2w=None, opt_pose=False):
755
+ # TODO: remove hard-coded image size
756
+ # c2w (pose) should be in NeRF convention.
757
+ # This is the camera class that supports pose optimization.
758
+
759
+ self.image_width, self.image_height = 256, 256
760
+ self.fx, self.fy = cam_params["focal_length"]
761
+ self.cx, self.cy = cam_params["principal_point"]
762
+ self.FoVy = 2 * np.arctan(self.image_height / 2 / self.fy)
763
+ self.FoVx = 2 * np.arctan(self.image_width / 2 / self.fx)
764
+ self.R = torch.tensor(cam_params["R"])
765
+ self.T = torch.tensor(cam_params["T"])
766
+ self.znear = 0.01
767
+ self.zfar = 100
768
+ self.opt_pose = opt_pose
769
+ self.index = index
770
+
771
+ self.projection_matrix = (
772
+ getProjectionMatrix(
773
+ znear=self.znear,
774
+ zfar=self.zfar,
775
+ fx=self.fx,
776
+ fy=self.fy,
777
+ cx=self.cx,
778
+ cy=self.cy,
779
+ )
780
+ .transpose(0, 1)
781
+ .cuda()
782
+ )
783
+
784
+ if not opt_pose:
785
+ if c2w is not None:
786
+ w2c = torch.from_numpy(c2w)
787
+ w2c[1:3] *= -1 # OpenGL to OpenCV
788
+ else:
789
+ R = self.R.T # note the transpose here
790
+ T = self.T
791
+ upper = torch.cat([R, T[:, None]], dim=1) # Upper 3x4 part of the matrix
792
+ lower = torch.tensor([[0, 0, 0, 1]], device=R.device, dtype=R.dtype) # Last row
793
+ w2c = torch.cat([upper, lower], dim=0)
794
+
795
+ w2c[:2] *= -1 # PyTorch3D to OpenCV
796
+
797
+ self.w2c = w2c
798
+ self.cam_params = torch.zeros(6)
799
+ self.world_view_transform = w2c.transpose(0, 1).cuda()
800
+ self.full_proj_transform = self.world_view_transform @ self.projection_matrix
801
+ self.camera_center = self.world_view_transform.inverse()[3, :3]
802
+ else:
803
+ R = self.R.T # note the transpose here
804
+ T = self.T
805
+ upper = torch.cat([R, T[:, None]], dim=1) # Upper 3x4 part of the matrix
806
+ lower = torch.tensor([[0, 0, 0, 1]], device=R.device, dtype=R.dtype) # Last row
807
+ w2c = torch.cat([upper, lower], dim=0)
808
+
809
+ w2c[:2] *= -1 # PyTorch3D to OpenCV
810
+
811
+ self.w2c = w2c
812
+ self.cam_params = torch.randn(6) * 1e-6
813
+ self.cam_params.requires_grad_()
814
+
815
+ self.world_view_transform = w2c.transpose(0, 1).cuda()
816
+ self.full_proj_transform = self.world_view_transform @ self.projection_matrix
817
+ self.camera_center = self.world_view_transform.inverse()[3, :3]
818
+
819
+ @property
820
+ def perspective(self):
821
+ P = torch.zeros(4, 4)
822
+
823
+ z_sign = -1.0
824
+
825
+ P[0, 0] = 2 * self.fx / 256
826
+ P[1, 1] = -2 * self.fy / 256
827
+ P[0, 2] = -(2 * (self.cx / 256) - 1)
828
+ P[1, 2] = -(2 * (self.cy / 256) - 1)
829
+ P[2, 2] = z_sign * self.zfar / (self.zfar - self.znear)
830
+ P[3, 2] = z_sign
831
+ P[2, 3] = -(self.zfar * self.znear) / (self.zfar - self.znear)
832
+ return P.numpy()
833
+
834
+ @property
835
+ def c2w(self):
836
+ if self.opt_pose:
837
+ w2c = self.w2c @ SE3.exp(self.cam_params.detach()).as_matrix()
838
+ w2c[1:3] *= -1 # OpenCV to OpenGL
839
+ else:
840
+ R = self.R.T # note the transpose here
841
+ T = self.T
842
+ upper = torch.cat([R, T[:, None]], dim=1) # Upper 3x4 part of the matrix
843
+ lower = torch.tensor([[0, 0, 0, 1]], device=R.device, dtype=R.dtype) # Last row
844
+ w2c = torch.cat([upper, lower], dim=0)
845
+ w2c[:2, :] *= -1 # PyTorch3D to OpenCV
846
+ w2c[1:3, :] *= -1 # OpenCV to OpenGL
847
+
848
+ return torch.inverse(w2c).numpy()
849
+
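+ # Convention chain used by CustomCamera (descriptive only): the stored R, T define a
+ # PyTorch3D-style world-to-camera transform; flipping the first two rows converts it to OpenCV,
+ # and flipping rows 1:3 converts OpenCV to OpenGL/NeRF. With opt_pose enabled, the learnable
+ # 6-vector cam_params goes through liegroups' SE3.exp and is right-multiplied onto w2c as a
+ # residual, so near-zero parameters leave the initial pose essentially unchanged.
+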
850
+ @property
851
+ def focal_length(self):
852
+ return np.array([self.fx, self.fy])
853
+
854
+ @property
855
+ def rotation(self):
856
+ w2c = self.w2c @ SE3.exp(self.cam_params.detach()).as_matrix()
857
+ w2c[:2] *= -1
858
+ return w2c[:3, :3].T
859
+
860
+ @property
861
+ def translation(self):
862
+ w2c = self.w2c @ SE3.exp(self.cam_params.detach()).as_matrix()
863
+ w2c[:2] *= -1
864
+ return w2c[:3, 3]
865
+
866
+
867
+ class Renderer:
868
+ def __init__(self, sh_degree=3, white_background=True, radius=1):
869
+
870
+ self.sh_degree = sh_degree
871
+ self.white_background = white_background
872
+ self.radius = radius
873
+ self.enable_dino = None
874
+
875
+ self.gaussians = GaussianModel(sh_degree)
876
+
877
+ self.bg_color = torch.tensor(
878
+ [1, 1, 1] if white_background else [0, 0, 0],
879
+ dtype=torch.float32,
880
+ device="cuda",
881
+ )
882
+
883
+ def initialize(self, input=None, num_pts=5000, radius=0.5, cameras=None, imgs=None, masks=None, point_maps=None, mode='sphere'):
884
+ # load checkpoint
885
+ if input is None and mode in ['sphere', "carve", "inverse_carve"]:
886
+ # init from random point cloud
887
+
888
+ if mode == 'sphere':
889
+ phis = np.random.random((num_pts,)) * 2 * np.pi
890
+ costheta = np.random.random((num_pts,)) * 2 - 1
891
+ thetas = np.arccos(costheta)
892
+ mu = np.random.random((num_pts,))
893
+ radius = radius * np.cbrt(mu)
894
+ x = radius * np.sin(thetas) * np.cos(phis)
895
+ y = radius * np.sin(thetas) * np.sin(phis)
896
+ z = radius * np.cos(thetas)
897
+ xyz = np.stack((x, y, z), axis=1)
898
+
899
+ elif mode == "carve":
900
+ try:
901
+ xyz = sample_points_from_voxel(cameras, masks, radius, N=num_pts).cpu().numpy()
902
+ except RuntimeError:
903
+ radius = 0.3
904
+ phis = np.random.random((num_pts,)) * 2 * np.pi
905
+ costheta = np.random.random((num_pts,)) * 2 - 1
906
+ thetas = np.arccos(costheta)
907
+ mu = np.random.random((num_pts,))
908
+ radius = radius * np.cbrt(mu)
909
+ x = radius * np.sin(thetas) * np.cos(phis)
910
+ y = radius * np.sin(thetas) * np.sin(phis)
911
+ z = radius * np.cos(thetas)
912
+ xyz = np.stack((x, y, z), axis=1)
913
+
914
+ elif mode == "inverse_carve":
915
+ xyz = sample_points_from_voxel(cameras, masks, radius, N=num_pts, inverse=True).cpu().numpy()
916
+
917
+ shs = np.random.random((num_pts, 3)) / 255.0
918
+ pcd = BasicPointCloud(
919
+ points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3))
920
+ )
921
+ self.gaussians.create_from_pcd(pcd, 10)
922
+
923
+ elif input is None and mode == "dust3r":
924
+ num_points = sum([np.count_nonzero(masks[i]) for i in range(len(masks))])
925
+ xyz = np.zeros((num_points, 3))
926
+ colors = np.zeros((num_points, 3))
927
+
928
+ # Iterate through data and add points to xyz and colors arrays
929
+ index = 0
930
+ for i in range(len(point_maps)):
931
+ rgb = imgs[i].reshape(-1, 3)
932
+ point_map = point_maps[i].reshape(-1, 3).detach().cpu().numpy()
933
+ for j, include_point in enumerate((masks[i] > 0.5).flatten()):
934
+ if include_point == 1:
935
+ xyz[index] = point_map[j]
936
+ colors[index] = rgb[j]
937
+ index += 1
938
+
939
+ # Check if index matches expected number of points
940
+ assert index == num_points, "Number of points does not match expected count"
941
+
942
+ pcd = BasicPointCloud(
943
+ points=xyz, colors=colors, normals=np.zeros((num_points, 3))
944
+ )
945
+ self.gaussians.create_from_pcd(pcd, 10)
946
+
947
+ elif isinstance(input, BasicPointCloud):
948
+ # load from a provided pcd
949
+ self.gaussians.create_from_pcd(input, 1)
950
+ else:
951
+ # load from saved ply
952
+ self.gaussians.load_ply(input)
953
+
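+ # Illustrative usage (a rough sketch; the cam_params dict layout must match what CustomCamera
+ # expects, and paths/arguments here are placeholders):
+ #   renderer = Renderer(sh_degree=3, white_background=True)
+ #   renderer.initialize(num_pts=5000, mode='sphere')   # random initialization inside a sphere
+ #   cam = CustomCamera(cam_params, index=0, opt_pose=False)
+ #   out = renderer.render(cam)
+ #   rgb, depth, alpha = out["image"], out["depth"], out["alpha"]
+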
954
+ def render(
955
+ self,
956
+ viewpoint_camera,
957
+ scaling_modifier=1.0,
958
+ bg_color=None,
959
+ override_color=None,
960
+ compute_cov3D_python=False,
961
+ convert_SHs_python=False,
962
+ ):
963
+ if self.enable_dino:
964
+ from diff_gaussian_rasterization_feature import (
965
+ GaussianRasterizationSettings,
966
+ GaussianRasterizer,
967
+ )
968
+ else:
969
+ from diff_gaussian_rasterization import (
970
+ GaussianRasterizationSettings,
971
+ GaussianRasterizer,
972
+ )
973
+
974
+ if viewpoint_camera.opt_pose:
975
+ w2c = viewpoint_camera.w2c @ SE3.exp(viewpoint_camera.cam_params).as_matrix()
976
+ w2c = w2c.to("cuda")
977
+
978
+ viewpoint_camera.world_view_transform = w2c.transpose(0, 1)
979
+ viewpoint_camera.full_proj_transform = viewpoint_camera.world_view_transform @ viewpoint_camera.projection_matrix
980
+ viewpoint_camera.camera_center = viewpoint_camera.world_view_transform.inverse()[3, :3]
981
+
982
+ # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
983
+ screenspace_points = (
984
+ torch.zeros_like(
985
+ self.gaussians.get_xyz,
986
+ dtype=self.gaussians.get_xyz.dtype,
987
+ requires_grad=True,
988
+ device="cuda",
989
+ )
990
+ + 0
991
+ )
992
+ try:
993
+ screenspace_points.retain_grad()
994
+ except:
995
+ pass
996
+
997
+ # Set up rasterization configuration
998
+ tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
999
+ tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
1000
+
1001
+ raster_settings = GaussianRasterizationSettings(
1002
+ image_height=int(viewpoint_camera.image_height),
1003
+ image_width=int(viewpoint_camera.image_width),
1004
+ tanfovx=tanfovx,
1005
+ tanfovy=tanfovy,
1006
+ bg=self.bg_color if bg_color is None else bg_color,
1007
+ scale_modifier=scaling_modifier,
1008
+ viewmatrix=viewpoint_camera.world_view_transform,
1009
+ perspectivematrix=viewpoint_camera.projection_matrix,
1010
+ projmatrix=viewpoint_camera.full_proj_transform,
1011
+ sh_degree=self.gaussians.active_sh_degree,
1012
+ campos=viewpoint_camera.camera_center,
1013
+ prefiltered=False,
1014
+ debug=False,
1015
+ )
1016
+
1017
+ rasterizer = GaussianRasterizer(raster_settings=raster_settings)
1018
+
1019
+ means3D = self.gaussians.get_xyz
1020
+ means2D = screenspace_points
1021
+ opacity = self.gaussians.get_opacity
1022
+
1023
+ # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
1024
+ # scaling / rotation by the rasterizer.
1025
+ scales = None
1026
+ rotations = None
1027
+ cov3D_precomp = None
1028
+ if compute_cov3D_python:
1029
+ cov3D_precomp = self.gaussians.get_covariance(scaling_modifier)
1030
+ else:
1031
+ scales = self.gaussians.get_scaling
1032
+ rotations = self.gaussians.get_rotation
1033
+
1034
+ # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
1035
+ # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
1036
+ shs = None
1037
+ colors_precomp = None
1038
+ if override_color is None:
1039
+ if convert_SHs_python:
1040
+ shs_view = self.gaussians.get_features.transpose(1, 2).view(
1041
+ -1, 3, (self.gaussians.max_sh_degree + 1) ** 2
1042
+ )
1043
+ dir_pp = self.gaussians.get_xyz - viewpoint_camera.camera_center.repeat(
1044
+ self.gaussians.get_features.shape[0], 1
1045
+ )
1046
+ dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True)
1047
+ sh2rgb = eval_sh(
1048
+ self.gaussians.active_sh_degree, shs_view, dir_pp_normalized
1049
+ )
1050
+ colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
1051
+ else:
1052
+ shs = self.gaussians.get_features
1053
+ else:
1054
+ colors_precomp = override_color
1055
+
1056
+ if self.enable_dino:
1057
+ shs, semantic_feature = shs
1058
+
1059
+ rendered_image, rendered_feature, radii, rendered_depth, rendered_alpha = rasterizer(
1060
+ means3D=means3D,
1061
+ means2D=means2D,
1062
+ shs=shs,
1063
+ semantic_feature=semantic_feature,
1064
+ colors_precomp=colors_precomp,
1065
+ opacities=opacity,
1066
+ scales=scales,
1067
+ rotations=rotations,
1068
+ cov3D_precomp=cov3D_precomp,
1069
+ viewmat=viewpoint_camera.world_view_transform,
1070
+ )
1071
+
1072
+ else:
1073
+ # Rasterize visible Gaussians to image, obtain their radii (on screen).
1074
+ rendered_image, radii, rendered_depth, rendered_alpha = rasterizer(
1075
+ means3D=means3D,
1076
+ means2D=means2D,
1077
+ shs=shs,
1078
+ colors_precomp=colors_precomp,
1079
+ opacities=opacity,
1080
+ scales=scales,
1081
+ rotations=rotations,
1082
+ cov3D_precomp=cov3D_precomp,
1083
+ viewmat=viewpoint_camera.world_view_transform,
1084
+ )
1085
+
1086
+ rendered_image = rendered_image.clamp(0, 1)
1087
+
1088
+ # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
1089
+ # They will be excluded from value updates used in the splitting criteria.
1090
+ ret = {
1091
+ "image": rendered_image,
1092
+ "depth": rendered_depth,
1093
+ "alpha": rendered_alpha,
1094
+ "viewspace_points": screenspace_points,
1095
+ "visibility_filter": radii > 0,
1096
+ "radii": radii,
1097
+ }
1098
+
1099
+ if self.enable_dino:
1100
+ ret["feature"] = rendered_feature
1101
+
1102
+ return ret
sparseags/render_utils/util.py ADDED
@@ -0,0 +1,510 @@
1
+ import os
2
+ import cv2
3
+ import gc
4
+ import copy
5
+ import tqdm
6
+ import torchvision
7
+ import shutil
8
+ import argparse
9
+ import numpy as np
10
+ from PIL import Image
11
+ from torchvision.utils import save_image
12
+ from omegaconf import OmegaConf
13
+ import matplotlib.pyplot as plt
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+
18
+ from kiui.lpips import LPIPS
19
+ from liegroups.torch import SE3
20
+
21
+ import sys
22
+ sys.path.append('./')
23
+
24
+ from sparseags.render_utils.gs_renderer import CustomCamera
25
+ from sparseags.mesh_utils.mesh_renderer import Renderer
26
+ from sparseags.cam_utils import OrbitCamera, mat2latlon
27
+
28
+
29
+ def safe_normalize(x):
30
+ return x / x.norm(p=2, dim=-1, keepdim=True).clamp(min=1e-8)
31
+
32
+
33
+ def look_at(campos, target, opengl=True):
34
+ if not opengl:
35
+ forward_vector = safe_normalize(target - campos)
36
+ up_vector = torch.tensor([0, 1, 0], dtype=campos.dtype, device=campos.device).expand_as(forward_vector)
37
+ right_vector = safe_normalize(torch.cross(forward_vector, up_vector, dim=-1))
38
+ up_vector = safe_normalize(torch.cross(right_vector, forward_vector, dim=-1))
39
+ else:
40
+ forward_vector = safe_normalize(campos - target)
41
+ up_vector = torch.tensor([0, 1, 0], dtype=campos.dtype, device=campos.device).expand_as(forward_vector)
42
+ right_vector = safe_normalize(torch.cross(up_vector, forward_vector, dim=-1))
43
+ up_vector = safe_normalize(torch.cross(forward_vector, right_vector, dim=-1))
44
+ R = torch.stack([right_vector, up_vector, forward_vector], dim=-1)
45
+ return R
46
+
47
+
48
+ def orbit_camera(elevation, azimuth, radius=1.0, is_degree=True, target=None, opengl=True):
49
+ """Converts elevation & azimuth to a batch of camera pose matrices."""
50
+ if is_degree:
51
+ elevation = torch.deg2rad(elevation)
52
+ azimuth = torch.deg2rad(azimuth)
53
+ x = radius * torch.cos(elevation) * torch.sin(azimuth)
54
+ y = -radius * torch.sin(elevation)
55
+ z = radius * torch.cos(elevation) * torch.cos(azimuth)
56
+ if target is None:
57
+ target = torch.zeros(3, dtype=torch.float32, device=elevation.device)
58
+ campos = torch.stack([x, y, z], dim=-1) + target
59
+ R = look_at(campos, target.unsqueeze(0).expand_as(campos), opengl)
60
+ T = torch.eye(4, dtype=torch.float32, device=elevation.device).unsqueeze(0).expand(campos.shape[0], -1, -1).clone()
61
+ T[:, :3, :3] = R
62
+ T[:, :3, 3] = campos
63
+ return T
64
+
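+ # Quick sanity example for orbit_camera (follows directly from the math above): with
+ # elevation=0, azimuth=0, radius=2 and the target at the origin, campos = (0, 0, 2) and the
+ # OpenGL branch of look_at gives right=(1, 0, 0), up=(0, 1, 0), forward=(0, 0, 1), i.e. a camera
+ # on the +z axis looking back at the origin with +y up.
+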
65
+
66
+ def render_and_compare(camera_data, mesh_path, save_path, num_views=8):
67
+ parser = argparse.ArgumentParser()
68
+ parser.add_argument('--object', type=str, help="path to mesh (obj, ply, glb, ...)")
69
+ parser.add_argument('--path', type=str, help="path to mesh (obj, ply, glb, ...)")
70
+ parser.add_argument('--front_dir', type=str, default='+z', help="mesh front-facing dir")
71
+ parser.add_argument('--mode', default='albedo', type=str, choices=['lambertian', 'albedo', 'normal', 'depth'], help="rendering mode")
72
+ parser.add_argument('--W', type=int, default=256, help="GUI width")
73
+ parser.add_argument('--H', type=int, default=256, help="GUI height")
74
+ parser.add_argument("--wogui", type=bool, default=True, help="disable all dpg GUI")
75
+ parser.add_argument("--force_cuda_rast", action='store_true', help="force to use RasterizeCudaContext.")
76
+ parser.add_argument("--config", default='configs/navi.yaml', help="path to the yaml config file")
77
+ parser.add_argument('--radius', type=float, default=3, help="default GUI camera radius from center")
78
+ parser.add_argument('--fovy', type=float, default=49.1, help="default GUI camera fovy")
79
+ args, extras = parser.parse_known_args()
80
+
81
+ # override default config from cli
82
+ opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
83
+ data = camera_data
84
+
85
+ opt.mesh = mesh_path
86
+ opt.trainable_texture = False
87
+ renderer = Renderer(opt).to(torch.device("cuda"))
88
+ target = renderer.mesh.v.mean(dim=0)
89
+
90
+ cameras = [CustomCamera(cam_params) for cam_params in data.values()]
91
+ # cams = [(cam.c2w, cam.perspective, cam.focal_length) for cam in cameras]
92
+ img_paths = [v["filepath"] for k, v in data.items()]
93
+ flags = [int(v["flag"]) for k, v in data.items()]
94
+
95
+ cam_centers = [mat2latlon(cam.camera_center - target) for idx, cam in enumerate(cameras) if flags[idx]]
96
+ ref_polars = [float(cam[0]) for cam in cam_centers]
97
+ ref_azimuths = [float(cam[1]) for cam in cam_centers]
98
+ ref_radii = [float(cam[2]) for cam in cam_centers]
99
+
100
+ base_cam = copy.copy(cameras[0])
101
+ base_cam.fx = np.array([cam.fx for idx, cam in enumerate(cameras) if flags[idx]], dtype=np.float32).mean()
102
+ base_cam.fy = np.array([cam.fy for idx, cam in enumerate(cameras) if flags[idx]], dtype=np.float32).mean()
103
+ base_cam.cx = 128
104
+ base_cam.cy = 128
105
+
106
+ lpips_loss = LPIPS(net='vgg').cuda()
107
+ elevation_range = (max([min(ref_polars) - 20, -89.9]), min([max(ref_polars) + 20, 89.9]))
108
+ azimuth_range = (-180, 180)
109
+ radius_range = (min(ref_radii) - 0.2, max(ref_radii) + 0.2)
110
+
111
+ elevation_steps = torch.arange(elevation_range[0], elevation_range[1], 15, dtype=torch.float32)
112
+ azimuth_steps = torch.arange(azimuth_range[0], azimuth_range[1], 15, dtype=torch.float32)
113
+ radius_steps = torch.arange(radius_range[0], radius_range[1], 0.2, dtype=torch.float32)
114
+ elevation_grid, azimuth_grid, radius_grid = torch.meshgrid(elevation_steps, azimuth_steps, radius_steps, indexing='ij')
115
+ pose_grid = torch.stack((elevation_grid.flatten(), azimuth_grid.flatten(), radius_grid.flatten()), dim=1)
116
+
117
+ poses = orbit_camera(pose_grid[:, 0], pose_grid[:, 1], pose_grid[:, 2], target=target.cpu())
118
+ print("Number of hypotheses:", poses.shape[0])
119
+ s1_steps = 128
120
+ s2_steps = 256
121
+ beta = 0.25
122
+ chunk_size = 512
123
+
124
+ for i in tqdm.tqdm(range(num_views)):
125
+ if flags[i]:
126
+ continue
127
+
128
+ pose_grid = torch.stack((elevation_grid.flatten(), azimuth_grid.flatten(), radius_grid.flatten()), dim=1)
129
+
130
+ poses = orbit_camera(pose_grid[:, 0], pose_grid[:, 1], pose_grid[:, 2], target=target.cpu())
131
+
132
+ img_path = img_paths[i]
133
+ base_cam.fx = cameras[i].fx
134
+ base_cam.fy = cameras[i].fy
135
+ perspectives = torch.from_numpy(base_cam.perspective).expand(pose_grid.shape[0], -1, -1)
136
+
137
+ learnable_cam_params = torch.randn(pose_grid.shape[0], 6) * 1e-6
138
+ learnable_cam_params.requires_grad_()
139
+
140
+ loss_MSE_grid = np.zeros(pose_grid.shape[0])
141
+ loss_LPIPS_grid = np.zeros(pose_grid.shape[0])
142
+ loss = 0
143
+
144
+ gt_img = Image.open(img_path)
145
+ if gt_img.mode == 'RGBA':
146
+ gt_img = np.asarray(gt_img, dtype=np.uint8).copy()
147
+ gt_mask = (gt_img[..., 3:] > 128).astype(np.float32)
148
+ gt_img[gt_img[:, :, -1] <= 255*0.9] = [255., 255., 255., 255.] # thresholding background
149
+ gt_img = gt_img[:, :, :3]
150
+
151
+ gt_tensor = torch.from_numpy(gt_img).float().unsqueeze(0).cuda() / 255.
152
+ gt_mask_tensor = torch.from_numpy(gt_mask).float().unsqueeze(0).cuda()
153
+
154
+ num_batches = pose_grid.shape[0] // chunk_size + int(pose_grid.shape[0]%chunk_size > 0)
155
+
156
+ # Render images for visualization
157
+ vis_img = torch.zeros(pose_grid.shape[0], 256, 256, 3)
158
+ for j in tqdm.tqdm(range(num_batches)):
159
+ batch_poses = poses[j*chunk_size:(j+1)*chunk_size]
160
+ batch_perspectives = perspectives[j*chunk_size:(j+1)*chunk_size]
161
+ with torch.no_grad():
162
+ out = renderer.render_batch(batch_poses, batch_perspectives, 256, 256, ssaa=1) # (500, 256, 256, 3)
163
+ # batch_image = (out["image"].detach().cpu().numpy() * 255).astype(np.uint8)
164
+ batch_image = out["image"].detach().cpu()
165
+ vis_img[j*chunk_size:(j+1)*chunk_size] = batch_image
166
+
167
+ l = [{'params': learnable_cam_params, 'lr': 5e-3, "name": "cam_params"}]
168
+ optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
169
+ scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
170
+
171
+ init_lr = optimizer.param_groups[0]['lr']
172
+ for j in tqdm.tqdm(range(num_batches)):
173
+ batch_poses = poses[j*chunk_size:(j+1)*chunk_size]
174
+ batch_perspectives = perspectives[j*chunk_size:(j+1)*chunk_size]
175
+ optimizer.param_groups[0]['lr'] = init_lr
176
+ for k in tqdm.tqdm(range(s1_steps)):
177
+ batch_residuals = SE3.exp(learnable_cam_params[j*chunk_size:(j+1)*chunk_size]).as_matrix() # [5760, 4, 4]
178
+ batch_poses_opt = torch.bmm(batch_poses, batch_residuals)
179
+ out = renderer.render_batch(batch_poses_opt, batch_perspectives, 256, 256, ssaa=1) # (500, 256, 256, 3)
180
+ pred_tensor = out["image"]
181
+ valid_mask = (out["alpha"] > 0) & (out["viewcos"] > 0.5) # (500, 256, 256, 1)
182
+
183
+ if k == s1_steps - 1:
184
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='none').mean(dim=(1, 2, 3))
185
+ loss_MSE_grid[j*chunk_size:(j+1)*chunk_size] = loss.detach().cpu().numpy()
186
+ loss = loss.mean()
187
+
188
+ else:
189
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='mean')
190
+
191
+ loss.backward()
192
+ optimizer.step()
193
+ optimizer.zero_grad()
194
+ scheduler.step()
195
+
196
+ # Render optimized images for visualization
197
+ # vis_img_optimized = torch.zeros(pose_grid.shape[0], 256, 256, 3)
198
+ # for j in tqdm.tqdm(range(num_batches)):
199
+ # batch_poses = poses[j*chunk_size:(j+1)*chunk_size]
200
+ # batch_perspectives = perspectives[j*chunk_size:(j+1)*chunk_size]
201
+ # batch_residuals = SE3.exp(learnable_cam_params[j*chunk_size:(j+1)*chunk_size]).as_matrix() # [5760, 4, 4]
202
+ # batch_poses_opt = torch.bmm(batch_poses, batch_residuals)
203
+ # with torch.no_grad():
204
+ # out = renderer.render_batch(batch_poses_opt, batch_perspectives, 256, 256, ssaa=1) # (500, 256, 256, 3)
205
+ # # batch_image = (out["image"].detach().cpu().numpy() * 255).astype(np.uint8)
206
+ # batch_image = out["image"].detach().cpu()
207
+ # vis_img_optimized[j*chunk_size:(j+1)*chunk_size] = batch_image
208
+
209
+ # indices = np.argsort(loss_MSE_grid)
210
+ # padding = (pose_grid.shape[0] // 10 + int(pose_grid.shape[0]%10 > 0)) * 10 - pose_grid.shape[0]
211
+ # grid = vis_img[indices].permute(0, 3, 1, 2).contiguous()
212
+ # padded_gird = torch.cat([grid, torch.ones(padding, 3, 256, 256)], dim=0)
213
+ # padded_gird = padded_gird.view((padding + pose_grid.shape[0]) // 10, 10, 3, 256, 256).permute(2, 0, 3, 1, 4)
214
+ # padded_gird = padded_gird.reshape(3, -1, 2560)
215
+ # output_path = os.path.join(save_path, f'vis1_candidates_{i}.png')
216
+ # save_image(padded_gird, output_path)
217
+
218
+ # grid = vis_img_optimized[indices].permute(0, 3, 1, 2).contiguous()
219
+ # padded_gird = torch.cat([grid, torch.ones(padding, 3, 256, 256)], dim=0)
220
+ # padded_gird = padded_gird.view((padding + pose_grid.shape[0]) // 10, 10, 3, 256, 256).permute(2, 0, 3, 1, 4)
221
+ # padded_gird = padded_gird.reshape(3, -1, 2560)
222
+ # output_path = os.path.join(save_path, f'vis1_optimized_candidates_{i}.png')
223
+ # save_image(padded_gird, output_path)
224
+
225
+ beta = 0.1
226
+ indices = np.argsort(loss_MSE_grid)[:max(int(loss_MSE_grid.shape[0] * beta), 64)]
227
+ batch_poses = poses[indices]
228
+ batch_residuals = SE3.exp(learnable_cam_params[indices].detach()).as_matrix() # [5760, 4, 4]
229
+ poses = torch.bmm(batch_poses, batch_residuals) # [216, 4, 4]
230
+ poses = poses.repeat(4, 1, 1)
231
+
232
+ learnable_cam_params = torch.randn(poses.shape[0], 6) * 1e-1
233
+ learnable_cam_params.requires_grad_()
234
+
235
+ optimizer.param_groups = []
236
+ optimizer.add_param_group({'params': learnable_cam_params})
237
+
238
+ perspectives = torch.from_numpy(cameras[i].perspective).expand(poses.shape[0], -1, -1)
239
+ loss_MSE_grid = np.zeros(poses.shape[0])
240
+
241
+ num_batches = poses.shape[0] // chunk_size + int(poses.shape[0]%chunk_size > 0)
242
+ for j in tqdm.tqdm(range(num_batches)):
243
+ batch_poses = poses[j*chunk_size:(j+1)*chunk_size]
244
+ batch_perspectives = perspectives[j*chunk_size:(j+1)*chunk_size]
245
+ optimizer.param_groups[0]['lr'] = 1e-3
246
+ for k in tqdm.tqdm(range(s2_steps)):
247
+ batch_residuals = SE3.exp(learnable_cam_params[j*chunk_size:(j+1)*chunk_size]).as_matrix() # [5760, 4, 4]
248
+ batch_poses_opt = torch.bmm(batch_poses, batch_residuals)
249
+ out = renderer.render_batch(batch_poses_opt, batch_perspectives, 256, 256, ssaa=1) # (500, 256, 256, 3)
250
+ pred_tensor = out["image"]
251
+ valid_mask = (out["alpha"] > 0) & (out["viewcos"] > 0.5) # (500, 256, 256, 1)
252
+ # batch_image = (out["image"].detach().cpu().numpy() * 255).astype(np.uint8)
253
+ # del batch_pose, batch_perspective
254
+
255
+ if k == s2_steps - 1:
256
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='none').mean(dim=(1, 2, 3))
257
+ # loss += F.mse_loss(valid_mask, gt_mask_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='none').mean(dim=(1, 2, 3))
258
+ loss_MSE_grid[j*chunk_size:(j+1)*chunk_size] = loss.detach().cpu().numpy()
259
+ loss = loss.mean()
260
+
261
+ else:
262
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='mean')
263
+
264
+ loss.backward()
265
+ optimizer.step()
266
+ optimizer.zero_grad()
267
+ scheduler.step()
268
+
269
+ beta = 0.1
270
+ indices = np.argsort(loss_MSE_grid)[:max(int(loss_MSE_grid.shape[0] * beta), 64)]
271
+ batch_poses = poses[indices]
272
+ batch_residuals = SE3.exp(learnable_cam_params[indices].detach()).as_matrix() # [5760, 4, 4]
273
+ poses = torch.bmm(batch_poses, batch_residuals) # [216, 4, 4]
274
+ poses = poses.repeat(4, 1, 1)
275
+
276
+ learnable_cam_params = torch.randn(poses.shape[0], 6) * 1e-2
277
+ learnable_cam_params.requires_grad_()
278
+
279
+ optimizer.param_groups = []
280
+ optimizer.add_param_group({'params': learnable_cam_params})
281
+
282
+ perspectives = torch.from_numpy(cameras[i].perspective).expand(poses.shape[0], -1, -1)
283
+ loss_MSE_grid = np.zeros(poses.shape[0])
284
+
285
+ num_batches = poses.shape[0] // chunk_size + int(poses.shape[0]%chunk_size > 0)
286
+ for j in tqdm.tqdm(range(num_batches)):
287
+ batch_poses = poses[j*chunk_size:(j+1)*chunk_size]
288
+ batch_perspectives = perspectives[j*chunk_size:(j+1)*chunk_size]
289
+ optimizer.param_groups[0]['lr'] = 1e-3
290
+ for k in tqdm.tqdm(range(s2_steps)):
291
+ batch_residuals = SE3.exp(learnable_cam_params[j*chunk_size:(j+1)*chunk_size]).as_matrix() # [5760, 4, 4]
292
+ batch_poses_opt = torch.bmm(batch_poses, batch_residuals)
293
+ out = renderer.render_batch(batch_poses_opt, batch_perspectives, 256, 256, ssaa=1) # (500, 256, 256, 3)
294
+ pred_tensor = out["image"]
295
+ valid_mask = (out["alpha"] > 0) & (out["viewcos"] > 0.5) # (500, 256, 256, 1)
296
+
297
+ if k == s2_steps - 1:
298
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='none').mean(dim=(1, 2, 3))
299
+ # loss += F.mse_loss(valid_mask, gt_mask_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='none').mean(dim=(1, 2, 3))
300
+ loss_MSE_grid[j*chunk_size:(j+1)*chunk_size] = loss.detach().cpu().numpy()
301
+ loss = loss.mean()
302
+
303
+ else:
304
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='mean')
305
+
306
+ loss.backward()
307
+ optimizer.step()
308
+ optimizer.zero_grad()
309
+ scheduler.step()
310
+
311
+ pose_grid = poses
312
+ loss_LPIPS_grid = np.zeros(poses.shape[0])
313
+
314
+ chunk_size = 64
315
+ gt_tensor = gt_tensor.permute(0, 3, 1, 2).contiguous()
316
+ vis_img_opt = np.zeros((pose_grid.shape[0], 256, 256, 3), dtype=np.uint8)
317
+ num_batches = pose_grid.shape[0] // chunk_size + int(pose_grid.shape[0]%chunk_size > 0)
318
+ for j in tqdm.tqdm(range(num_batches)):
319
+ batch_poses = poses[j*chunk_size:(j+1)*chunk_size]
320
+ batch_residuals = SE3.exp(learnable_cam_params[j*chunk_size:(j+1)*chunk_size]).as_matrix() # [5760, 4, 4]
321
+ batch_poses_opt = torch.bmm(batch_poses, batch_residuals)
322
+ batch_perspectives = perspectives[j*chunk_size:(j+1)*chunk_size]
323
+ with torch.no_grad():
324
+ out = renderer.render_batch(batch_poses_opt, batch_perspectives, 256, 256, ssaa=1) # (500, 256, 256, 3)
325
+ batch_image = (out["image"].detach().cpu().numpy() * 255).astype(np.uint8)
326
+ vis_img_opt[j*chunk_size:(j+1)*chunk_size] = batch_image
327
+
328
+ pred_tensor = out["image"].permute(0, 3, 1, 2).contiguous()
329
+ with torch.no_grad():
330
+ loss_LPIPS_grid[j*chunk_size:(j+1)*chunk_size] = lpips_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1)).squeeze().cpu().numpy()
331
+
332
+ # indices_of_smallest = np.argsort(loss_MSE_grid)[:15]
333
+ indices1 = np.argsort(loss_MSE_grid)
334
+ indices2 = np.argsort(loss_LPIPS_grid)
335
+
336
+ ranks1 = np.zeros_like(loss_MSE_grid)
337
+ ranks2 = np.zeros_like(loss_LPIPS_grid)
338
+
339
+ ranks1[indices1] = np.arange(1, loss_MSE_grid.size + 1)
340
+ ranks2[indices2] = np.arange(1, loss_LPIPS_grid.size + 1)
341
+
342
+ total_ranks = ranks1 + ranks2
343
+ indices_of_smallest = np.argsort(total_ranks)[:15]
344
+
345
+ index = indices_of_smallest[0]
346
+ residual = SE3.exp(learnable_cam_params[index].detach()).as_matrix() # [5760, 4, 4]
347
+ c2w = poses[index] @ residual
348
+ w2c = torch.inverse(c2w)
349
+
350
+ w2c[1:3, :] *= -1 # OpenCV to OpenGL
351
+ w2c[:2, :] *= -1 # PyTorch3D to OpenCV
352
+
353
+ data[list(data.keys())[i]]["R"] = w2c[:3, :3].T.tolist()
354
+ data[list(data.keys())[i]]["T"] = w2c[:3, 3].tolist()
355
+
356
+ num_frames = 16
357
+ cmap = plt.get_cmap("hot")
358
+ num_rows = 2
359
+ num_cols = 8
360
+ # plt.subplots_adjust(top=0.2)
361
+ figsize = (num_cols * 2, num_rows * 2.4)
362
+ fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize)
363
+ fig.suptitle("Input Image vs. Top 15 Similar Renderings", fontsize=16, y=0.93)
364
+ plt.subplots_adjust(top=0.9)
365
+ axs = axs.flatten()
366
+ for idx in range(num_rows * num_cols):
367
+ if idx < num_frames:
368
+ if idx == 0:
369
+ axs[idx].imshow(gt_img.reshape(256, 256, 3))
370
+ axs[idx].set_xlabel(f'Input Image', fontsize=10)
371
+ else:
372
+ axs[idx].imshow(vis_img_opt[indices_of_smallest[idx-1]].reshape(256, 256, 3))
373
+ loss_text = f"MSE: {loss_MSE_grid[indices_of_smallest[idx-1]]:.4f}_{int(ranks1[indices_of_smallest[idx-1]]):d}\nLPIPS: {loss_LPIPS_grid[indices_of_smallest[idx-1]]:.4f}_{int(ranks2[indices_of_smallest[idx-1]]):d}"
374
+ axs[idx].text(0.05, 0.95, loss_text, color='black', fontsize=8,
375
+ ha='left', va='top', transform=axs[idx].transAxes)
376
+ for s in ["bottom", "top", "left", "right"]:
377
+ if idx == 0:
378
+ axs[idx].spines[s].set_color("green")
379
+ else:
380
+ axs[idx].spines[s].set_color(cmap(0.8 * idx / (num_frames)))
381
+ axs[idx].spines[s].set_linewidth(5)
382
+ axs[idx].set_xticks([])
383
+ axs[idx].set_yticks([])
384
+
385
+ # if i >= args.all_views:
386
+ # axs[i].set_xlabel(f'MSE: {mse_losses[i%args.all_views]:.4f}\nLPIPS: {lpips_losses[i%args.all_views]:.4f}', fontsize=10)
387
+ else:
388
+ axs[idx].axis("off")
389
+ plt.tight_layout()
390
+
391
+ output_path = os.path.join(save_path, f'vis_{i}_render_and_compare.png')
392
+ plt.savefig(output_path) # Save the figure to a file
393
+ plt.close(fig)
394
+ print(f"Visualization file written to {output_path}")
395
+
396
+ del lpips_loss, renderer, learnable_cam_params
397
+ gc.collect()
398
+ torch.cuda.empty_cache()
399
+
400
+ return data
401
+
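+ # render_and_compare, in brief (descriptive summary): for every view whose flag is 0 it
+ # (1) builds a grid of camera hypotheses over elevation/azimuth/radius around the reliable
+ # views, (2) optimizes a per-hypothesis SE(3) residual against the input image with an MSE
+ # loss, (3) keeps the best ~10% (at least 64) of hypotheses, re-perturbs and refines them in two
+ # further rounds, and (4) ranks the survivors by the sum of their MSE and LPIPS ranks, writing
+ # the winner's R and T back into the camera dictionary.
+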
402
+
+ def align_to_mesh(camera_data, mesh_path, save_path, num_views=8):
+ """For each view with flag == 0 (an outlier camera), sample random SE(3) pose perturbations, refine each by render-and-compare MSE against the ground-truth image, and write the best pose back into the camera data."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--object', type=str, help="path to mesh (obj, ply, glb, ...)")
+ parser.add_argument('--path', type=str, help="path to mesh (obj, ply, glb, ...)")
+ parser.add_argument('--front_dir', type=str, default='+z', help="mesh front-facing dir")
+ parser.add_argument('--mode', default='albedo', type=str, choices=['lambertian', 'albedo', 'normal', 'depth'], help="rendering mode")
+ parser.add_argument('--W', type=int, default=256, help="GUI width")
+ parser.add_argument('--H', type=int, default=256, help="GUI height")
+ parser.add_argument("--wogui", type=bool, default=True, help="disable all dpg GUI")
+ parser.add_argument("--force_cuda_rast", action='store_true', help="force to use RasterizeCudaContext.")
+ parser.add_argument("--config", default='configs/navi.yaml', help="path to the yaml config file")
+ parser.add_argument('--radius', type=float, default=3, help="default GUI camera radius from center")
+ parser.add_argument('--fovy', type=float, default=49.1, help="default GUI camera fovy")
+ args, extras = parser.parse_known_args()
+
+ # override default config from cli
+ opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
+ data = camera_data
+
+ opt.mesh = mesh_path
+ opt.trainable_texture = False
+ renderer = Renderer(opt).to(torch.device("cuda"))
+
+ cameras = [CustomCamera(cam_params) for cam_params in data.values()]
+ # cams = [(cam.c2w, cam.perspective, cam.focal_length) for cam in cameras]
+ img_paths = [v["filepath"] for k, v in data.items()]
+ flags = [int(v["flag"]) for k, v in data.items()]
+
+ s1_steps = 128
+ num_hypotheses = 64
+ chunk_size = 512
+ print("Number of hypotheses:", num_hypotheses)
+
+ for i in tqdm.tqdm(range(num_views)):
+ if flags[i]:
+ continue
+
+ loss_MSE_grid = np.zeros(num_hypotheses)
+ vis_img_opt = torch.zeros(num_hypotheses, 256, 256, 3)
+ poses = torch.from_numpy(cameras[i].c2w).expand(num_hypotheses, -1, -1)
+ perspectives = torch.from_numpy(cameras[i].perspective).expand(num_hypotheses, -1, -1)
+
+ learnable_cam_params = torch.randn(num_hypotheses, 6) * 1e-3
+ learnable_cam_params.requires_grad_()
+
+ img_path = img_paths[i]
+ gt_img = Image.open(img_path)
+ if gt_img.mode == 'RGBA':
+ gt_img = np.asarray(gt_img, dtype=np.uint8).copy()
+ gt_mask = (gt_img[..., 3:] > 128).astype(np.float32)
+ gt_img[gt_img[:, :, -1] <= 255*0.9] = [255., 255., 255., 255.] # thresholding background
+ gt_img = gt_img[:, :, :3]
+
+ gt_tensor = torch.from_numpy(gt_img).float().unsqueeze(0).cuda() / 255.
+ gt_mask_tensor = torch.from_numpy(gt_mask).float().unsqueeze(0).cuda()
+
+ num_batches = num_hypotheses // chunk_size + int(num_hypotheses%chunk_size > 0)
+
+ l = [{'params': learnable_cam_params, 'lr': 5e-3, "name": "cam_params"}]
+ optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
+ scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
+
+ init_lr = optimizer.param_groups[0]['lr']
+ for j in tqdm.tqdm(range(num_batches)):
+ batch_poses = poses[j*chunk_size:(j+1)*chunk_size]
+ batch_perspectives = perspectives[j*chunk_size:(j+1)*chunk_size]
+ optimizer.param_groups[0]['lr'] = init_lr
+ for k in tqdm.tqdm(range(s1_steps)):
+ batch_residuals = SE3.exp(learnable_cam_params[j*chunk_size:(j+1)*chunk_size]).as_matrix() # [B, 4, 4]
+ batch_poses_opt = torch.bmm(batch_poses, batch_residuals)
+ out = renderer.render_batch(batch_poses_opt, batch_perspectives, 256, 256, ssaa=1) # [B, 256, 256, 3]
+ pred_tensor = out["image"]
+
+ if k == s1_steps - 1:
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='none').mean(dim=(1, 2, 3))
+ # loss += F.mse_loss(valid_mask, gt_mask_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='none').mean(dim=(1, 2, 3))
+ loss_MSE_grid[j*chunk_size:(j+1)*chunk_size] = loss.detach().cpu().numpy()
+ batch_image = pred_tensor.detach().cpu()
+ vis_img_opt[j*chunk_size:(j+1)*chunk_size] = batch_image
+ loss = loss.mean()
+
+ else:
+ loss = F.mse_loss(pred_tensor, gt_tensor.expand(pred_tensor.shape[0], -1, -1, -1), reduction='mean')
+
+ loss.backward()
+ optimizer.step()
+ optimizer.zero_grad()
+ scheduler.step()
+
+ indices = np.argsort(loss_MSE_grid)
+ residual = SE3.exp(learnable_cam_params[indices[0]].detach()).as_matrix() # [4, 4]
+ c2w = torch.from_numpy(cameras[i].c2w) @ residual
+ w2c = torch.inverse(c2w)
+
+ w2c[1:3, :] *= -1 # OpenCV to OpenGL
+ w2c[:2, :] *= -1 # PyTorch3D to OpenCV
+
+ data[list(data.keys())[i]]["R"] = w2c[:3, :3].T.tolist()
+ data[list(data.keys())[i]]["T"] = w2c[:3, 3].tolist()
+
+ grid = vis_img_opt[indices].permute(0, 3, 1, 2).contiguous()
+ grid = grid.view(8, 8, 3, 256, 256).permute(2, 0, 3, 1, 4)
+ grid = grid.reshape(3, -1, int(256*8))
+ output_path = os.path.join(save_path, f'vis_aligned_candidates_{i}.png')
+ save_image(grid, output_path)
+
+ return data
+
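For reference, the 8x8 candidate mosaic written out above could equally be assembled with torchvision's make_grid. A minimal sketch, assuming the default 64 hypotheses rendered at 256x256 and an illustrative output name:

import torch
from torchvision.utils import make_grid, save_image

candidates = torch.rand(64, 256, 256, 3)            # stand-in for vis_img_opt[indices]
mosaic = make_grid(candidates.permute(0, 3, 1, 2),  # -> [N, C, H, W]
                   nrow=8, padding=0)               # -> [3, 8 * 256, 8 * 256]
save_image(mosaic, "vis_aligned_candidates_example.png")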
sparseags/sh_utils.py ADDED
@@ -0,0 +1,118 @@
+ # Copyright 2021 The PlenOctree Authors.
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ # 1. Redistributions of source code must retain the above copyright notice,
+ # this list of conditions and the following disclaimer.
+ #
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
+ # this list of conditions and the following disclaimer in the documentation
+ # and/or other materials provided with the distribution.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ # POSSIBILITY OF SUCH DAMAGE.
+
+ import torch
+
+ C0 = 0.28209479177387814
+ C1 = 0.4886025119029199
+ C2 = [
+ 1.0925484305920792,
+ -1.0925484305920792,
+ 0.31539156525252005,
+ -1.0925484305920792,
+ 0.5462742152960396
+ ]
+ C3 = [
+ -0.5900435899266435,
+ 2.890611442640554,
+ -0.4570457994644658,
+ 0.3731763325901154,
+ -0.4570457994644658,
+ 1.445305721320277,
+ -0.5900435899266435
+ ]
+ C4 = [
+ 2.5033429417967046,
+ -1.7701307697799304,
+ 0.9461746957575601,
+ -0.6690465435572892,
+ 0.10578554691520431,
+ -0.6690465435572892,
+ 0.47308734787878004,
+ -1.7701307697799304,
+ 0.6258357354491761,
+ ]
+
+
+ def eval_sh(deg, sh, dirs):
+ """
+ Evaluate spherical harmonics at unit directions
+ using hardcoded SH polynomials.
+ Works with torch/np/jnp.
+ ... Can be 0 or more batch dimensions.
+ Args:
+ deg: int SH deg. Currently, 0-4 supported
+ sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
+ dirs: jnp.ndarray unit directions [..., 3]
+ Returns:
+ [..., C]
+ """
+ assert deg <= 4 and deg >= 0
+ coeff = (deg + 1) ** 2
+ assert sh.shape[-1] >= coeff
+
+ result = C0 * sh[..., 0]
+ if deg > 0:
+ x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
+ result = (result -
+ C1 * y * sh[..., 1] +
+ C1 * z * sh[..., 2] -
+ C1 * x * sh[..., 3])
+
+ if deg > 1:
+ xx, yy, zz = x * x, y * y, z * z
+ xy, yz, xz = x * y, y * z, x * z
+ result = (result +
+ C2[0] * xy * sh[..., 4] +
+ C2[1] * yz * sh[..., 5] +
+ C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
+ C2[3] * xz * sh[..., 7] +
+ C2[4] * (xx - yy) * sh[..., 8])
+
+ if deg > 2:
+ result = (result +
+ C3[0] * y * (3 * xx - yy) * sh[..., 9] +
+ C3[1] * xy * z * sh[..., 10] +
+ C3[2] * y * (4 * zz - xx - yy) * sh[..., 11] +
+ C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
+ C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
+ C3[5] * z * (xx - yy) * sh[..., 14] +
+ C3[6] * x * (xx - 3 * yy) * sh[..., 15])
+
+ if deg > 3:
+ result = (result + C4[0] * xy * (xx - yy) * sh[..., 16] +
+ C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
+ C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
+ C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
+ C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
+ C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
+ C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
+ C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
+ C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
+ return result
+
+ def RGB2SH(rgb):
+ return (rgb - 0.5) / C0
+
+ def SH2RGB(sh):
+ return sh * C0 + 0.5
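A minimal usage sketch for the helpers above (shapes follow the docstring; the tensor values here are placeholders):

import torch
from sparseags.sh_utils import eval_sh, RGB2SH

N, deg = 4, 2
sh = torch.zeros(N, 3, (deg + 1) ** 2)                # [..., C, (deg + 1) ** 2]
sh[..., 0] = RGB2SH(torch.rand(N, 3))                 # store a base color in the DC band
dirs = torch.nn.functional.normalize(torch.randn(N, 3), dim=-1)  # unit view directions
rgb = (eval_sh(deg, sh, dirs) + 0.5).clamp(0, 1)      # [N, 3]; the +0.5 undoes the RGB2SH offset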
sparseags/visual_utils.py ADDED
@@ -0,0 +1,243 @@
+ import os
+ import re
+ import cv2
+ import csv
+ import json
+ import math
+ import tqdm
+ import shutil
+ import argparse
+ import numpy as np
+ from PIL import Image
+ from omegaconf import OmegaConf
+ import matplotlib.pyplot as plt
+
+ import torch
+ import torch.nn.functional as F
+ import nvdiffrast.torch as dr
+
+ from kiui.mesh import Mesh
+ from kiui.cam import OrbitCamera
+ from kiui.op import safe_normalize
+ from kiui.lpips import LPIPS
+
+ import sys
+ from sparseags.mesh_utils.mesh_renderer import Renderer
+ from sparseags.cam_utils import orbit_camera, OrbitCamera
+ from sparseags.render_utils.gs_renderer import CustomCamera
+
+
+ class GUI:
+ """Minimal nvdiffrast-based mesh renderer/viewer (albedo, depth, normal, or lambertian shading); runs headless when wogui is set."""
+ def __init__(self, opt):
+ self.opt = opt
+ self.W = opt.W
+ self.H = opt.H
+ self.wogui = opt.wogui # disable gui and run in cmd
+ self.cam = OrbitCamera(opt.W, opt.H, r=opt.radius, fovy=opt.fovy)
+ self.bg_color = torch.ones(3, dtype=torch.float32).cuda() # default white bg
+ # self.bg_color = torch.zeros(3, dtype=torch.float32).cuda() # black bg
+
+ self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32)
+ self.need_update = True # camera moved, should reset accumulation
+ self.light_dir = np.array([0, 0])
+ self.ambient_ratio = 0.5
+
+ # auto-rotate
+ self.auto_rotate_cam = False
+ self.auto_rotate_light = False
+
+ self.mode = opt.mode
+ self.render_modes = ['albedo', 'depth', 'normal', 'lambertian']
+
+ # load mesh
+ self.mesh = Mesh.load(opt.mesh, front_dir=opt.front_dir)
+
+ if not opt.force_cuda_rast and (self.wogui or os.name == 'nt'):
+ self.glctx = dr.RasterizeGLContext()
+ else:
+ self.glctx = dr.RasterizeCudaContext()
+
+ def step(self):
+
+ if not self.need_update:
+ return
+
+ starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+ starter.record()
+
+ # do MVP for vertices
+ pose = torch.from_numpy(self.cam.pose.astype(np.float32)).cuda()
+ proj = torch.from_numpy(self.cam.perspective.astype(np.float32)).cuda()
+
+ v_cam = torch.matmul(F.pad(self.mesh.v, pad=(0, 1), mode='constant', value=1.0), torch.inverse(pose).T).float().unsqueeze(0)
+ v_clip = v_cam @ proj.T
+
+ rast, rast_db = dr.rasterize(self.glctx, v_clip, self.mesh.f, (self.H, self.W))
+
+ alpha = (rast[..., 3:] > 0).float()
+ alpha = dr.antialias(alpha, rast, v_clip, self.mesh.f).squeeze(0).clamp(0, 1) # [H, W, 1]
+
+ if self.mode == 'depth':
+ depth, _ = dr.interpolate(-v_cam[..., [2]], rast, self.mesh.f) # [1, H, W, 1]
+ depth = (depth - depth.min()) / (depth.max() - depth.min() + 1e-20)
+ buffer = depth.squeeze(0).detach().cpu().numpy().repeat(3, -1) # [H, W, 3]
+ else:
+ # use vertex color if exists
+ if self.mesh.vc is not None:
+ albedo, _ = dr.interpolate(self.mesh.vc.unsqueeze(0).contiguous(), rast, self.mesh.f)
+ # use texture image
+ else:
+ texc, _ = dr.interpolate(self.mesh.vt.unsqueeze(0).contiguous(), rast, self.mesh.ft)
+ albedo = dr.texture(self.mesh.albedo.unsqueeze(0), texc, filter_mode='linear') # [1, H, W, 3]
+
+ albedo = torch.where(rast[..., 3:] > 0, albedo, torch.tensor(0).to(albedo.device)) # remove background
+ albedo = dr.antialias(albedo, rast, v_clip, self.mesh.f).clamp(0, 1) # [1, H, W, 3]
+ if self.mode == 'albedo':
+ albedo = albedo * alpha + self.bg_color * (1 - alpha)
+ buffer = albedo[0].detach().cpu().numpy()
+ else:
+ normal, _ = dr.interpolate(self.mesh.vn.unsqueeze(0).contiguous(), rast, self.mesh.fn)
+ normal = safe_normalize(normal)
+ if self.mode == 'normal':
+ normal_image = (normal[0] + 1) / 2
+ normal_image = torch.where(rast[..., 3:] > 0, normal_image, torch.tensor(1).to(normal_image.device)) # remove background
+ buffer = normal_image.detach().cpu().numpy()
+ elif self.mode == 'lambertian':
+ light_d = np.deg2rad(self.light_dir)
+ light_d = np.array([
+ np.cos(light_d[0]) * np.sin(light_d[1]),
+ -np.sin(light_d[0]),
+ np.cos(light_d[0]) * np.cos(light_d[1]),
+ ], dtype=np.float32)
+ light_d = torch.from_numpy(light_d).to(albedo.device)
+ lambertian = self.ambient_ratio + (1 - self.ambient_ratio) * (normal @ light_d).float().clamp(min=0)
+ albedo = (albedo * lambertian.unsqueeze(-1)) * alpha + self.bg_color * (1 - alpha)
+ buffer = albedo[0].detach().cpu().numpy()
+
+ ender.record()
+ torch.cuda.synchronize()
+ t = starter.elapsed_time(ender)
+
+ self.render_buffer = buffer
+ self.need_update = False
+
+ if self.auto_rotate_cam:
+ self.cam.orbit(5, 0)
+ self.need_update = True
+
+ if self.auto_rotate_light:
+ self.light_dir[1] += 3
+ self.need_update = True
+
+
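A minimal headless-use sketch of the GUI class above (the mesh path and option values are placeholders mirroring the argparse defaults used in this repo; a CUDA machine with nvdiffrast and kiui installed is assumed):

from omegaconf import OmegaConf
from sparseags.visual_utils import GUI

opt = OmegaConf.create(dict(
    mesh="path/to/mesh.obj",   # placeholder path to an existing mesh
    front_dir="+z", mode="albedo",
    W=256, H=256, wogui=True,
    force_cuda_rast=True,      # use the CUDA rasterizer; no OpenGL/EGL context needed
    radius=3, fovy=49.1,
))
gui = GUI(opt)
gui.step()                     # renders one frame into gui.render_buffer (H x W x 3 floats)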
+ def vis_output(camera_data, mesh_path=None, save_path=None, num_views=8):
+ """Render the mesh from every input camera, compare against the ground-truth images (MSE and LPIPS, normalized by object coverage), save a side-by-side visualization to save_path, and return the per-view LPIPS and MSE arrays."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--front_dir', type=str, default='+z', help="mesh front-facing dir")
+ parser.add_argument('--mode', default='albedo', type=str, choices=['lambertian', 'albedo', 'normal', 'depth'], help="rendering mode")
+ parser.add_argument('--W', type=int, default=256, help="GUI width")
+ parser.add_argument('--H', type=int, default=256, help="GUI height")
+ parser.add_argument('--radius', type=float, default=3, help="default GUI camera radius from center")
+ parser.add_argument('--fovy', type=float, default=49.1, help="default GUI camera fovy")
+ parser.add_argument("--wogui", type=bool, default=True, help="disable all dpg GUI")
+ parser.add_argument("--force_cuda_rast", action='store_true', help="force to use RasterizeCudaContext.")
+ parser.add_argument('--elevation', type=int, default=0, help="rendering elevation")
+ parser.add_argument('--save_video', type=str, default=None, help="path to save rendered video")
+ parser.add_argument('--idx', type=int, default=0, help="index")
+ parser.add_argument('--config', default='configs/navi.yaml', type=str, help='path to the yaml config file')
+ args, extras = parser.parse_known_args()
+
+ # override default config from cli
+ opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
+ data = camera_data
+
+ cameras = [CustomCamera(cam_params) for cam_params in data.values()]
+ cams = [(cam.c2w, cam.perspective, cam.focal_length) for cam in cameras]
+ img_paths = [v["filepath"] for k, v in data.items()]
+
+ opt.mesh = mesh_path
+ opt.trainable_texture = False
+ renderer = Renderer(opt).to(torch.device("cuda"))
+
+ lpips_loss = LPIPS(net='vgg').cuda()
+ mse_losses = []
+ lpips_losses = []
+ flags = [int(v["flag"]) for k, v in data.items()]
+ images = np.zeros((2, num_views, 256, 256, 3), dtype=np.uint8)
+
+ for i in tqdm.tqdm(range(len(cams))):
+
+ img_path = img_paths[i]
+
+ img = Image.open(img_path)
+ if img.mode == 'RGBA':
+ img = np.asarray(img, dtype=np.uint8).copy()
+ img[img[:, :, -1] <= 255*0.9] = [255., 255., 255., 255.] # thresholding background
+ img = img[:, :, :3]
+
+ gt_tensor = torch.from_numpy(img).permute(2, 0, 1).float().unsqueeze(0).cuda() / 255.0
+
+ images[0, i] = img
+
+ with torch.no_grad():
+ out = renderer.render(*cams[i][:2], 256, 256, ssaa=1)
+
+ # rgb loss
+ image = (out["image"].detach().cpu().numpy() * 255).astype(np.uint8)
+ pred_tensor = out["image"].permute(2, 0, 1).float().unsqueeze(0).cuda()
+ # obj_scale = ((out["alpha"] > 0) & (out["viewcos"] > 0.5)).detach().sum().float()
+ obj_scale = (out["alpha"] > 0).detach().sum().float()
+ obj_scale /= 256 ** 2
+
+ images[1, i] = image
+ with torch.no_grad():
+ mse_losses.append(F.mse_loss(pred_tensor, gt_tensor).squeeze().cpu().numpy() / obj_scale.item())
+ lpips_losses.append(lpips_loss(pred_tensor, gt_tensor).squeeze().cpu().numpy() / obj_scale.item())
+
+ mean_mse = np.mean(np.array(mse_losses)[:num_views])
+ mean_lpips = np.mean(np.array(lpips_losses)[:num_views])
+
+ num_frames = 2 * num_views
+ cmap = plt.get_cmap("hsv")
+ num_rows = 2
+ num_cols = num_views
+ plt.subplots_adjust(top=0.2)
+ figsize = (num_cols * 2, num_rows * 2.2)
+ fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize)
+ fig.suptitle(f"Avg MSE: {mean_mse:.4f}, Avg LPIPS: {mean_lpips:.4f}", fontsize=16, y=0.97)
+ axs = axs.flatten()
+ for i in range(num_rows * num_cols):
+ if i < num_frames:
+ axs[i].imshow(images.reshape(-1, 256, 256, 3)[i])
+ for s in ["bottom", "top", "left", "right"]:
+ if i % num_views <= num_views - 1:
+ if not flags[i%num_views]:
+ axs[i].spines[s].set_color("red")
+ else:
+ axs[i].spines[s].set_color("green")
+ else:
+ axs[i].spines[s].set_color(cmap(i / (num_frames)))
+ axs[i].spines[s].set_linewidth(5)
+ axs[i].set_xticks([])
+ axs[i].set_yticks([])
+
+ if i >= num_views:
+ axs[i].set_xlabel(f'MSE: {mse_losses[i%num_views]:.4f}\nLPIPS: {lpips_losses[i%num_views]:.4f}', fontsize=10)
+ else:
+ axs[i].axis("off")
+ plt.tight_layout()
+ plt.savefig(save_path)
+ plt.close(fig)
+ print(f"Visualization file written to {save_path}")
+
+ out_dir = save_path.replace('vis.png', 'reprojections')
+ os.makedirs(out_dir, exist_ok=True)
+
+ for i in range(num_views):
+ gt = Image.fromarray(images[0, i])
+ pred = Image.fromarray(images[1, i])
+ gt.save(os.path.join(out_dir, f"gt_{i}.png"))
+ pred.save(os.path.join(out_dir, f"pred_{i}.png"))
+
+ return np.array(lpips_losses), np.array(mse_losses)
+
+
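The per-view numbers returned by vis_output are MSE/LPIPS divided by the fraction of pixels the rendered object covers, presumably so that views where the object is small are not scored as artificially good. A self-contained sketch of that normalization (the helper name is illustrative, not part of the repo):

import torch
import torch.nn.functional as F

def coverage_normalized_mse(pred, gt, alpha, res=256):
    # Fraction of the res x res frame covered by the rendered object
    obj_scale = (alpha > 0).float().sum() / res ** 2
    return (F.mse_loss(pred, gt) / obj_scale.clamp(min=1e-8)).item()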