|
|
|
|
|
|
|
|
|
|
|
|
|
import os.path as osp |
|
from glob import glob |
|
import itertools |
|
import numpy as np |
|
import re |
|
import cv2 |
|
import os |
|
import sys |
|
sys.path.append('/home/lipeng/ljh_code/Video_Depth_CVPR2025-main/dust3r_train') |
|
from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset |
|
from dust3r.utils.image import imread_cv2 |
|
# Magic float32 value expected at the very start of each binary file read in
# this module; a mismatch means a wrong file type or byte order.
TAG_FLOAT = 202021.25


def depth_read(filename):
    """Read depth data from file, return as numpy array.

    The file layout read here is: one float32 tag (must equal ``TAG_FLOAT``),
    two int32 values (width, then height), and then the float32 depth values,
    which are reshaped to ``(height, width)``.

    Args:
        filename: Path of the depth file to read.

    Returns:
        A float32 numpy array of shape ``(height, width)``.

    Raises:
        AssertionError: If the tag is wrong or the stored size is implausible.
        ValueError: If the remaining payload cannot be reshaped to
            ``(height, width)``.
    """
    # The context manager closes the handle even when validation fails.
    with open(filename, "rb") as f:
        check = np.fromfile(f, dtype=np.float32, count=1)[0]
        # Raised explicitly instead of via ``assert`` so that the check is not
        # stripped under ``python -O``; the exception type stays the same.
        if not check == TAG_FLOAT:
            raise AssertionError(
                " depth_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? ".format(
                    TAG_FLOAT, check
                )
            )

        # Python ints, so that width * height cannot wrap around in int32.
        width = int(np.fromfile(f, dtype=np.int32, count=1)[0])
        height = int(np.fromfile(f, dtype=np.int32, count=1)[0])
        size = width * height
        if not (width > 0 and height > 0 and size > 1 and size < 100000000):
            raise AssertionError(
                " depth_read:: Wrong input size (width = {0}, height = {1}).".format(
                    width, height
                )
            )

        # count=-1 reads everything that is left in the file.
        depth = np.fromfile(f, dtype=np.float32, count=-1).reshape((height, width))
    return depth
|
|
|
def cam_read(filename):
    """Read camera data, return (M, N) tuple.

    M is the intrinsic matrix, N is the extrinsic matrix, so that

    x = M*N*X,
    with x being a point in homogeneous image pixel coordinates, X being a
    point in homogeneous world coordinates.

    The file layout read here is: one float32 tag (must equal ``TAG_FLOAT``),
    nine float64 values for M and twelve float64 values for N.

    Args:
        filename: Path of the camera file to read.

    Returns:
        Tuple ``(M, N)`` of float64 arrays with shapes ``(3, 3)`` and
        ``(3, 4)``.

    Raises:
        AssertionError: If the leading tag does not equal ``TAG_FLOAT``.
        ValueError: If the file holds too few values to fill M or N.
    """
    # The context manager closes the handle even when validation fails.
    with open(filename, 'rb') as f:
        check = np.fromfile(f, dtype=np.float32, count=1)[0]
        # Raised explicitly instead of via ``assert`` so that the check is not
        # stripped under ``python -O``; the exception type stays the same.
        if not check == TAG_FLOAT:
            raise AssertionError(
                ' cam_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? '.format(TAG_FLOAT, check)
            )
        M = np.fromfile(f, dtype='float64', count=9).reshape((3, 3))
        N = np.fromfile(f, dtype='float64', count=12).reshape((3, 4))
    return M, N
|
|
|
class SintelDatasets(BaseStereoViewDataset):
    """Pairs of nearby MPI-Sintel frames with depth, camera data and a depth prior.

    Every entry of ``ROOT`` is treated as one scene directory. Within a scene,
    the sorted ``*.png`` frames are paired as ``(i, j)`` with ``i < j`` and
    ``j - i <= 3``; each pair is one dataset item.
    """

    # Largest frame-index distance allowed between the two views of a pair.
    _MAX_FRAME_GAP = 3

    def __init__(self, *args, split, ROOT, **kwargs):
        """Index all frame pairs found under ``ROOT``.

        Args:
            *args: Forwarded to ``BaseStereoViewDataset.__init__``.
            split: Required keyword; it is not used by this class and is not
                forwarded to the base class.
            ROOT: Directory whose entries are scene folders holding the
                ``*.png`` frames.
            **kwargs: Forwarded to ``BaseStereoViewDataset.__init__``.
        """
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)

        self.dataset_label = 'Sintel'

        # os.listdir returns entries in arbitrary order; sorting makes the
        # index -> pair mapping reproducible across runs and machines.
        scene_list = [osp.join(ROOT, scene) for scene in sorted(os.listdir(ROOT))]

        # Maps a running pair index to [first_image_path, second_image_path].
        self.pair_dict = {}
        pair_num = 0
        for scene in scene_list:
            imgs = sorted(glob(osp.join(scene, '*.png')))
            len_imgs = len(imgs)

            # Same pairs, in the same order, as filtering
            # itertools.combinations(range(len_imgs), 2) by |i - j| <= 3,
            # but generated directly instead of enumerating all O(n^2) pairs.
            for i in range(len_imgs):
                last_j = min(i + self._MAX_FRAME_GAP, len_imgs - 1)
                for j in range(i + 1, last_j + 1):
                    self.pair_dict[pair_num] = [imgs[i], imgs[j]]
                    pair_num += 1

    def __len__(self):
        """Return the number of indexed frame pairs."""
        return len(self.pair_dict)

    def _get_views(self, idx, resolution, rng):
        """Load both views of pair ``idx``.

        Args:
            idx: Key into ``self.pair_dict``.
            resolution: Passed through to ``_crop_resize_if_necessary``.
            rng: Passed through to ``_crop_resize_if_necessary``.

        Returns:
            A list with one dict per image of the pair, holding the keys
            ``img``, ``depthmap``, ``camera_pose``, ``camera_intrinsics``,
            ``dataset``, ``label``, ``instance`` and ``pred_depth``.
        """
        views = []
        for img_path in self.pair_dict[idx]:
            rgb_image = imread_cv2(img_path)

            # Ground-truth files live in a parallel tree; derive their paths
            # from the image path by string substitution.
            gt_path = img_path.replace('MPI-Sintel-training_images', 'MPI-Sintel-depth-training')
            depthmap_path = gt_path.replace('final/', 'depth/').replace('.png', '.dpt')
            mask_path = gt_path.replace('final/', 'dynamic_label_perfect/')
            metadata_path = gt_path.replace('final/', 'camdata_left/').replace('.png', '.cam')

            # NOTE(review): depth_prior_name and pixel_to_pointcloud are not
            # defined in this class; presumably the base class provides them.
            pred_path = img_path.replace('final', 'depth_prediction_' + self.depth_prior_name).replace('.png', '.npz')
            # NpzFile keeps the archive open until closed; use it as a context
            # manager so the handle is released after the arrays are read.
            with np.load(pred_path) as pred_file:
                focal_length_px = pred_file['focallength_px']
                pred_depth = pred_file['depth']
            pred_depth = self.pixel_to_pointcloud(pred_depth, focal_length_px)

            depthmap = depth_read(depthmap_path)
            if os.path.exists(mask_path):
                maskmap = imread_cv2(mask_path, cv2.IMREAD_UNCHANGED).astype(np.float32)
                maskmap = (maskmap / 255.0) > 0.1
                # Zero the depth wherever the mask is not set. Applied only
                # when a mask file exists, so a missing mask can neither raise
                # NameError nor reuse the mask of the previous view.
                depthmap *= maskmap

            intrinsics, extrinsics = cam_read(metadata_path)
            intrinsics = np.array(intrinsics, dtype=np.float32)
            extrinsics = np.array(extrinsics, dtype=np.float32)

            # cam_read's extrinsics map world to camera (x = M*N*X); the
            # camera pose stored here is the inverse: [R^T | -R^T t].
            R = extrinsics[:3, :3]
            t = extrinsics[:3, 3]
            camera_pose = np.eye(4, dtype=np.float32)
            camera_pose[:3, :3] = R.T
            camera_pose[:3, 3] = -R.T @ t

            rgb_image, depthmap, pred_depth, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, pred_depth, intrinsics, resolution, rng=rng, info=img_path)

            views.append(dict(
                img=rgb_image,
                depthmap=depthmap,
                camera_pose=camera_pose,
                camera_intrinsics=intrinsics,
                dataset=self.dataset_label,
                label=img_path,
                instance=img_path,
                pred_depth=pred_depth
            ))
        return views
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: build the dataset, then walk all pairs in random
    # order, printing the two view names and fetching the per-view data.
    from dust3r.datasets.base.base_stereo_view_dataset import view_name
    from dust3r.viz import SceneViz, auto_cam_size
    from dust3r.utils.image import rgb

    dataset = SintelDatasets(
        split='train',
        ROOT="/data/lipeng/ljh_data/MPI-Sintel/MPI-Sintel/MPI-Sintel-training_images/training/final",
        resolution=512,
        aug_crop=16,
    )

    num_pairs = len(dataset)
    for pair_idx in np.random.permutation(num_pairs):
        pair_views = dataset[pair_idx]
        assert len(pair_views) == 2
        print(view_name(pair_views[0]), view_name(pair_views[1]))

        viz = SceneViz()
        poses = [view['camera_pose'] for view in pair_views]
        # Lower bound keeps the camera size from collapsing to zero.
        cam_size = max(auto_cam_size(poses), 0.001)

        for view in pair_views:
            # Looked up but not used further in this block.
            pts3d = view['pts3d']
            valid_mask = view['valid_mask']
            colors = rgb(view['img'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|