Spaces:

cyun9286
/

Align3R

Running on Zero

File size: 7,038 Bytes

f53b39e

# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Dataloader for MPI-Sintel
# --------------------------------------------------------
import os.path as osp
from glob import glob
import itertools
import numpy as np
import re
import cv2
import os
import sys
sys.path.append('/home/lipeng/ljh_code/Video_Depth_CVPR2025-main/dust3r_train')
from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
from dust3r.utils.image import imread_cv2
# Magic number expected as the first float32 of the binary files read below.
# A different value means the file is not in the expected format (the error
# messages hint at a byte-order mismatch as one possible cause).
TAG_FLOAT = 202021.25


def depth_read(filename):
    """Read a depth map from a binary file and return it as a 2-D numpy array.

    Layout consumed here: one float32 tag (must equal ``TAG_FLOAT``), an int32
    width, an int32 height, then all remaining bytes interpreted as float32
    values and reshaped to ``(height, width)``.

    Args:
        filename: Path of the depth file to read.

    Returns:
        float32 ``numpy.ndarray`` of shape ``(height, width)``.

    Raises:
        AssertionError: If the tag does not match or the stored size is
            implausible.
        ValueError: If the payload does not contain exactly
            ``width * height`` values (raised by ``reshape``).
    """
    # The context manager closes the handle on every exit path, including the
    # validation failures below.
    with open(filename, "rb") as f:
        check = np.fromfile(f, dtype=np.float32, count=1)[0]
        # Raised explicitly rather than with `assert` so the check is not
        # stripped under `python -O`; the exception type stays AssertionError.
        if check != TAG_FLOAT:
            raise AssertionError(
                " depth_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? ".format(
                    TAG_FLOAT, check
                )
            )
        # Convert to Python ints so width * height cannot wrap around in int32.
        width = int(np.fromfile(f, dtype=np.int32, count=1)[0])
        height = int(np.fromfile(f, dtype=np.int32, count=1)[0])
        size = width * height
        if not (width > 0 and height > 0 and 1 < size < 100000000):
            raise AssertionError(
                " depth_read:: Wrong input size (width = {0}, height = {1}).".format(
                    width, height
                )
            )
        depth = np.fromfile(f, dtype=np.float32, count=-1).reshape((height, width))
    return depth

def cam_read(filename):
    """Read camera data from a binary file and return the tuple ``(M, N)``.

    M is the 3x3 intrinsic matrix and N is the 3x4 extrinsic matrix, so that

        x = M * N * X,

    with x being a point in homogeneous image pixel coordinates and X being a
    point in homogeneous world coordinates.

    Layout consumed here: one float32 tag (must equal ``TAG_FLOAT``), then
    9 float64 values for M followed by 12 float64 values for N.

    Args:
        filename: Path of the camera file to read.

    Returns:
        ``(M, N)`` as float64 arrays of shape ``(3, 3)`` and ``(3, 4)``.

    Raises:
        AssertionError: If the leading tag does not match ``TAG_FLOAT``.
        ValueError: If the file holds fewer values than expected (raised by
            ``reshape``).
    """
    # The context manager guarantees the handle is released even when the tag
    # check or a reshape fails.
    with open(filename, 'rb') as f:
        check = np.fromfile(f, dtype=np.float32, count=1)[0]
        # Raised explicitly rather than with `assert` so the check is not
        # stripped under `python -O`; the exception type stays AssertionError.
        if check != TAG_FLOAT:
            raise AssertionError(
                ' cam_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? '.format(TAG_FLOAT, check)
            )
        M = np.fromfile(f, dtype='float64', count=9).reshape((3, 3))
        N = np.fromfile(f, dtype='float64', count=12).reshape((3, 4))
    return M, N

class SintelDatasets(BaseStereoViewDataset):
    """Frame-pair dataset over MPI-Sintel sequences.

    ``ROOT`` is expected to contain one entry per scene, each a directory of
    ``*.png`` frames. Any two frames of the same scene whose positions (in
    sorted filename order) differ by at most ``_MAX_FRAME_GAP`` form one pair.
    The ground-truth depth, mask, camera and depth-prior files of a frame are
    located by string substitution on the frame's image path (see
    ``_get_views``).
    """

    # Largest allowed index distance between the two frames of a pair.
    _MAX_FRAME_GAP = 3

    def __init__(self, *args, split, ROOT, **kwargs):
        """Index every frame pair found under ``ROOT``.

        Args:
            *args: Forwarded unchanged to ``BaseStereoViewDataset``.
            split: Required keyword kept for interface compatibility; it is
                neither used here nor forwarded to the base class.
            ROOT: Directory whose entries are per-scene folders of PNG frames
                (e.g. ``.../MPI-Sintel-training_images/training/final``).
            **kwargs: Forwarded unchanged to ``BaseStereoViewDataset``.
        """
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)

        self.dataset_label = 'Sintel'

        # os.listdir returns entries in arbitrary order; sort them so that the
        # idx -> pair mapping is the same on every machine and run.
        scene_dirs = [osp.join(ROOT, scene) for scene in sorted(os.listdir(ROOT))]

        # Maps a running integer index to [first_image_path, second_image_path].
        self.pair_dict = {}
        pair_num = 0
        for scene_dir in scene_dirs:
            frames = sorted(glob(osp.join(scene_dir, '*.png')))
            n_frames = len(frames)
            # Enumerate only the neighbours inside the window; this yields the
            # same (i, j) pairs, in the same order, as filtering all
            # 2-combinations by |i - j| <= _MAX_FRAME_GAP.
            for i in range(n_frames):
                last_j = min(i + self._MAX_FRAME_GAP + 1, n_frames)
                for j in range(i + 1, last_j):
                    self.pair_dict[pair_num] = [frames[i], frames[j]]
                    pair_num += 1

    def __len__(self):
        """Return the number of indexed frame pairs."""
        return len(self.pair_dict)

    def _get_views(self, idx, resolution, rng):
        """Load both views of pair ``idx``.

        Args:
            idx: Key into ``self.pair_dict``.
            resolution: Forwarded to ``_crop_resize_if_necessary``.
            rng: Random generator forwarded to ``_crop_resize_if_necessary``.

        Returns:
            A list with one dict per image of the pair, holding the keys
            ``img``, ``depthmap``, ``camera_pose``, ``camera_intrinsics``,
            ``dataset``, ``label``, ``instance`` and ``pred_depth``.
        """
        views = []
        for img_path in self.pair_dict[idx]:
            rgb_image = imread_cv2(img_path)

            # Companion files live in a parallel 'MPI-Sintel-depth-training'
            # tree; their paths are derived from the image path by substitution.
            depth_root = img_path.replace('MPI-Sintel-training_images', 'MPI-Sintel-depth-training')
            depthmap_path = depth_root.replace('final/', 'depth/').replace('.png', '.dpt')
            mask_path = depth_root.replace('final/', 'dynamic_label_perfect/')
            metadata_path = depth_root.replace('final/', 'camdata_left/').replace('.png', '.cam')
            # NOTE(review): this replaces every occurrence of 'final' in the
            # path, not only the directory component -- confirm ROOT never
            # contains that substring elsewhere.
            # `self.depth_prior_name` is not assigned in this class; presumably
            # the base class sets it -- TODO confirm.
            prior_path = img_path.replace('final', 'depth_prediction_' + self.depth_prior_name).replace('.png', '.npz')

            # Close the .npz archive as soon as both arrays have been read.
            with np.load(prior_path) as prior:
                focal_length_px = prior['focallength_px']
                pred_depth = prior['depth']
            pred_depth = self.pixel_to_pointcloud(pred_depth, focal_length_px)

            depthmap = depth_read(depthmap_path)
            if os.path.exists(mask_path):
                maskmap = imread_cv2(mask_path, cv2.IMREAD_UNCHANGED).astype(np.float32)
                maskmap = (maskmap / 255.0) > 0.1
                # Zero the depth wherever the mask is off, so those pixels
                # drop out of any `depthmap > 0` validity test.
                depthmap *= maskmap

            intrinsics, extrinsics = cam_read(metadata_path)
            intrinsics = np.array(intrinsics, dtype=np.float32)
            extrinsics = np.array(extrinsics, dtype=np.float32)

            # `extrinsics` is the 3x4 world-to-camera matrix [R | t] (cam_read
            # documents x = M * N * X); invert it to get the 4x4
            # camera-to-world pose: [R^T | -R^T t].
            R = extrinsics[:3, :3]
            t = extrinsics[:3, 3]
            camera_pose = np.eye(4, dtype=np.float32)
            camera_pose[:3, :3] = R.T
            camera_pose[:3, 3] = -R.T @ t

            rgb_image, depthmap, pred_depth, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, pred_depth, intrinsics, resolution, rng=rng, info=img_path)

            views.append(dict(
                img=rgb_image,
                depthmap=depthmap,
                camera_pose=camera_pose,
                camera_intrinsics=intrinsics,
                dataset=self.dataset_label,
                label=img_path,
                instance=img_path,
                pred_depth=pred_depth
            ))
        return views


if __name__ == "__main__":
    # Manual smoke test: build the dataset, then walk through every pair in
    # random order, printing the view names and touching the fields a
    # visualisation would need.
    from dust3r.datasets.base.base_stereo_view_dataset import view_name
    from dust3r.viz import SceneViz, auto_cam_size
    from dust3r.utils.image import rgb

    sintel_root = "/data/lipeng/ljh_data/MPI-Sintel/MPI-Sintel/MPI-Sintel-training_images/training/final"
    dataset = SintelDatasets(split='train', ROOT=sintel_root, resolution=512, aug_crop=16)

    num_pairs = len(dataset)
    for idx in np.random.permutation(num_pairs):
        views = dataset[idx]
        assert len(views) == 2
        print(view_name(views[0]), view_name(views[1]))

        viz = SceneViz()
        poses = [view['camera_pose'] for view in views]
        # Lower bound keeps the camera glyph size from collapsing to zero.
        cam_size = max(auto_cam_size(poses), 0.001)
        for view in views:
            pts3d = view['pts3d']
            valid_mask = view['valid_mask']
            colors = rgb(view['img'])
            # Visualisation is currently disabled; the calls it would make are
            # kept below for reference.
            # NOTE(review): `idx` in the colour tuple is the dataset index,
            # which looks like it was meant to be the view index -- confirm
            # before re-enabling.
        #     viz.add_pointcloud(pts3d, colors, valid_mask)
        #     viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
        #                    focal=views[view_idx]['camera_intrinsics'][0, 0],
        #                    color=(idx * 255, (1 - idx) * 255, 0),
        #                    image=colors,
        #                    cam_size=cam_size)
        # viz.show()